Histogram

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset.
file_path = "E:/JOB/Task2_Clean/cleaned_dataset.csv"  # path to the CSV file
data = pd.read_csv(file_path)

# Nothing can be plotted without the 'Delay' column, so handle that case first.
if 'Delay' not in data.columns:
    print("Error: Column 'Delay' not found in the dataset.")
else:
    # Coerce 'Delay' to numeric (unparseable values become NaN), then drop
    # the rows where it is missing.
    data = data.assign(Delay=pd.to_numeric(data['Delay'], errors='coerce'))
    data = data.dropna(subset=['Delay'])

    # Draw the histogram with a KDE curve overlaid.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data['Delay'], bins=30, kde=True, color="blue", ax=ax)
    ax.set_title("Distribution of Delay")
    ax.set_xlabel("Delay (Minutes)")
    ax.set_ylabel("Frequency")
    plt.show()


Linear Regression

In [None]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load the data.
data = pd.read_csv("E:/JOB/Task2_Clean/cleaned_dataset.csv")

# Fail early with a clear message if an expected column is missing.
feature_columns = ['Crossing Types', 'Lane Types']
required_columns = feature_columns + ['Delay']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise ValueError(f"The dataset is missing required columns: {missing_columns}")

# Coerce the target to numeric (unparseable values become NaN) and drop
# incomplete rows: LinearRegression rejects NaN in features and target alike.
data['Delay'] = pd.to_numeric(data['Delay'], errors='coerce')
data = data.dropna(subset=required_columns)

# One-hot encode the categorical features instead of label-encoding them.
# Integer labels would make the linear model treat the categories as points
# on an arbitrary numeric scale. drop_first=True removes one reference level
# per feature so the dummy columns are not collinear with the intercept.
# The encoding is written to X only; the columns of `data` keep their
# original category values.
X = pd.get_dummies(
    data[feature_columns],
    columns=feature_columns,
    drop_first=True,
    dtype=float,
)  # Features
y = data['Delay']  # Target

# Split the data into training and test sets (80% train, 20% test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Linear Regression model.
model = LinearRegression()

# Train the model.
model.fit(X_train, y_train)

# Predict the target for the test set.
y_pred = model.predict(X_test)

# Evaluate the model.
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R-squared

# Report the metrics.
print("Mean Squared Error (MSE):", mse)
print("R-squared (R²):", r2)

# Report the fitted coefficients, labelled with the dummy column each one
# belongs to. Each value is the effect relative to the dropped reference level.
coefficients = pd.Series(model.coef_, index=X.columns)
print("Coefficients:")
print(coefficients.to_string())
print("Intercept:", model.intercept_)


Box plot

In [None]:
# Draw one box plot of 'Delay' per categorical column. Each entry pairs the
# x-axis column with the title used for its plot.
boxplot_specs = (
    ('Crossing Types', 'Impact of Crossing Types on Delay'),
    ('Lane Types', 'Impact of Lane Types on Delay'),
)
for category_column, plot_title in boxplot_specs:
    sns.boxplot(x=category_column, y='Delay', data=data)
    plt.title(plot_title)
    plt.show()  # show each plot on its own before drawing the next


In [None]:
# --- STEP 1: Import Libraries ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

# --- STEP 2: Load Dataset ---
data = pd.read_csv('E:/JOB/Task2_Clean/cleaned_dataset.csv')

# --- STEP 3: Data Preprocessing ---
# Check necessary columns
required_columns = ['Crossing Types', 'Lane Types', 'Delay']
if not all(col in data.columns for col in required_columns):
    raise ValueError("The dataset does not contain all required columns.")

# Coerce 'Delay' to numeric first (unparseable values become NaN) so that
# pd.cut below always receives numbers; those rows are dropped next.
data['Delay'] = pd.to_numeric(data['Delay'], errors='coerce')

# Drop rows with missing values
data = data.dropna(subset=required_columns)

# Convert categorical data to numeric using LabelEncoder.
# NOTE: the same encoder object is refit for the second column, so after
# these two lines it only remembers the 'Lane Types' classes.
label_encoder = LabelEncoder()
data['Crossing Types'] = label_encoder.fit_transform(data['Crossing Types'])
data['Lane Types'] = label_encoder.fit_transform(data['Lane Types'])

# Categorize Delay into classes (Low, Medium, High):
# [0, 10] -> 0, (10, 20] -> 1, (20, inf) -> 2
bins = [0, 10, 20, float('inf')]
labels = [0, 1, 2]  # 0 = Low, 1 = Medium, 2 = High
data['Delay_Category'] = pd.cut(data['Delay'], bins=bins, labels=labels, include_lowest=True)

# Drop rows with NaN in Delay_Category (values below 0 fall outside every bin)
data = data.dropna(subset=['Delay_Category'])

# Features (X) and Target (y)
X = data[['Crossing Types', 'Lane Types']]
y = data['Delay_Category']

# Split data into train and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- STEP 4: Train Random Forest Model ---
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# --- STEP 5: Evaluate the Model ---
y_pred = rf_model.predict(X_test)

# Classification Report
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))

# Confusion Matrix
# Pass labels explicitly so the matrix is always 3x3 in Low/Medium/High
# order, even when a class is absent from the test split. Otherwise the
# matrix can be smaller than the tick labels used in the heatmap below.
print("\n--- Confusion Matrix ---")
conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
print(conf_matrix)

# --- STEP 6: Visualize Confusion Matrix ---
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title("Confusion Matrix Heatmap")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# --- STEP 7: Feature Importance ---
# Pair each feature name with its importance and sort, largest first.
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Plot Feature Importance
plt.figure(figsize=(8, 6))
sns.barplot(x=feature_importances['Importance'], y=feature_importances['Feature'], palette='viridis')
plt.title("Feature Importance")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()
