Machine Learning Model for predicting traffic flow/density

In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

Preprocessing and Feature Selection

In [3]:
df = pd.read_csv('Traffic.csv')

# Convert 'Time' to minutes
df['Time'] = pd.to_datetime(df['Time'], format='%I:%M:%S %p').dt.hour * 60 + pd.to_datetime(df['Time'], format='%I:%M:%S %p').dt.minute

# Preprocessing columns
numeric_features = ['Time', 'Date', 'CarCount', 'BikeCount', 'BusCount', 'TruckCount']
categorical_features = ['Day of the week']

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),  # Include 'Time' in numeric features
        ('cat', OneHotEncoder(), categorical_features)
    ])


X = df.drop("Total", axis='columns')  # Features
y = df["Total"]  # Target variable

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

Fit the Model to the dataset

In [5]:
# Create a pipeline with preprocessing and Linear Regression model
model = LinearRegression()

pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

Make predictions on the testing dataset, output MAE performance metric

In [9]:
# Use the pipeline to make predictions on the testing set
predictions = pipeline.predict(X_test)

# Evaluate the model performance
mae = mean_absolute_error(y_test, predictions)

print(df.head())
# Display the evaluation metrics
print(f"Mean Absolute Error: {mae}") ##extermely small and means it is very close to actual values in testing set.


      Actual  Predicted
2404      30       30.0
2866     139      139.0
2775     137      137.0
507      162      162.0
1825      43       43.0
Mean Absolute Error: 3.9481669147162565e-14


In [16]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Step 1: Data Preprocessing
data = pd.read_csv('Traffic.csv')

# Handle missing data if any
# Convert time to numerical format
data['Time'] = pd.to_datetime(data['Time']).dt.hour * 60 + pd.to_datetime(data['Time']).dt.minute

# Convert categorical features to numerical using one-hot encoding
data = pd.get_dummies(data, columns=['Day of the week'])

# Step 2: Feature Engineering
# Extract relevant features, normalize numerical features if needed

# Step 3: Splitting Data
X = data.drop(['Traffic Situation'], axis=1)
y = data['Traffic Situation']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Model Training
# Convert categorical labels to numerical values
label_mapping = {'low': 0, 'medium': 1, 'high': 2}
data['Traffic Situation'] = data['Traffic Situation'].map(label_mapping)

# Rest of the code remains the same...
# (Steps 1-6 as provided in the previous responses)

# Continue with the rest of the code
# Step 4: Model Training
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Step 5: Prediction
y_pred = model.predict(X_test)

# Step 6: Generate Heat Map
# Assuming you want to create a heat map for predicted vs actual values
heatmap_data = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
sns.heatmap(heatmap_data, annot=True, cmap='viridis', fmt=".2f")
plt.show()

ValueError: could not convert string to float: 'low'