# Traffic Prediction
Dataset source: https://www.kaggle.com/datasets/hasibullahaman/traffic-prediction-dataset

### Import Basic Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the traffic dataset; the path is relative, so "Traffic.csv" must sit in
# the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="Traffic.csv")

In [3]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,12:00:00 AM,10,Tuesday,31,0,4,4,39,low
1,12:15:00 AM,10,Tuesday,49,0,3,3,55,low
2,12:30:00 AM,10,Tuesday,46,0,3,6,55,low
3,12:45:00 AM,10,Tuesday,51,0,2,5,58,low
4,1:00:00 AM,10,Tuesday,57,6,15,16,94,normal


In [4]:
# Convert clock times such as "1:15:00 AM" into decimal hours since midnight
# (e.g. 1.25), so the column is numeric and spans the full 24-hour day.
#
# The dtype guard keeps this cell idempotent: re-running it on the already
# converted floats would make pd.to_datetime treat them as numeric epoch
# offsets and silently collapse every value to 0.0.
if not pd.api.types.is_numeric_dtype(df['Time']):
    # Parse once, with an explicit 12-hour clock format, rather than letting
    # pandas infer the format on two separate calls.
    parsed_time = pd.to_datetime(df['Time'], format='%I:%M:%S %p')
    df['Time'] = (parsed_time.dt.hour * 60 + parsed_time.dt.minute) / 60.0

In [5]:
# Preview the first five rows to check that 'Time' now holds decimal hours.
df.head(n=5)

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,0.0,10,Tuesday,31,0,4,4,39,low
1,0.25,10,Tuesday,49,0,3,3,55,low
2,0.5,10,Tuesday,46,0,3,6,55,low
3,0.75,10,Tuesday,51,0,2,5,58,low
4,1.0,10,Tuesday,57,6,15,16,94,normal


In [6]:
# Share of missing values per column, expressed as a percentage of all rows.
missing_percentage = df.isnull().sum().div(len(df)).mul(100)
print(missing_percentage)

Time                 0.0
Date                 0.0
Day of the week      0.0
CarCount             0.0
BikeCount            0.0
BusCount             0.0
TruckCount           0.0
Total                0.0
Traffic Situation    0.0
dtype: float64


* Great! There is no missing data in the dataset.

In [7]:
# Ordinal encodings for the two categorical columns.
day_of_week_mapping = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}
traffic_situation_mapping = {'low': 0, 'normal': 1, 'high': 2, 'heavy':3}

# Assign the encoded column back to the frame instead of calling
# `df[col].replace(..., inplace=True)`: that form is chained assignment through
# an inplace method, which warns on recent pandas and leaves `df` unchanged
# once Copy-on-Write is enabled.
for column, mapping in (
    ('Day of the week', day_of_week_mapping),
    ('Traffic Situation', traffic_situation_mapping),
):
    if pd.api.types.is_numeric_dtype(df[column]):
        # Already encoded (the cell was re-run); mapping again would yield NaN.
        continue
    encoded = df[column].map(mapping)
    # Fail loudly on labels the mapping does not cover rather than passing
    # un-encoded values on to the models.
    unknown = df.loc[encoded.isna(), column].unique()
    if len(unknown):
        raise ValueError(f"Unmapped values in {column!r}: {list(unknown)}")
    df[column] = encoded.astype(int)

In [8]:
# Preview the first five rows to check that both categorical columns are now
# integer-encoded.
df.head(n=5)

Unnamed: 0,Time,Date,Day of the week,CarCount,BikeCount,BusCount,TruckCount,Total,Traffic Situation
0,0.0,10,2,31,0,4,4,39,0
1,0.25,10,2,49,0,3,3,55,0
2,0.5,10,2,46,0,3,6,55,0
3,0.75,10,2,51,0,2,5,58,0
4,1.0,10,2,57,6,15,16,94,1


### Feature Selection

In [9]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

# Define X (all candidate features) and y (target)
X = df.drop(columns=['Traffic Situation'])
y = df['Traffic Situation']

# Hold out 20% of the rows; the selector below is fitted on the training part
# only, so the held-out rows do not influence which features are chosen.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Score every feature against the target with the chi-squared statistic and
# keep the k highest-scoring ones.
k = 5  # number of features to keep
selector = SelectKBest(score_func=chi2, k=k)
X_new = selector.fit_transform(X_train, y_train)  # training rows, selected columns only

# Positional indices of the kept columns within X_train ...
selected_feature_indices = selector.get_support(indices=True)
# ... and the matching column names, so the indices need not be decoded by hand.
selected_feature_names = X_train.columns[selected_feature_indices].tolist()

print("Selected Feature Indices:", selected_feature_indices)
print("Selected Feature Names:", selected_feature_names)


Selected Feature Indices: [3 4 5 6 7]


In [10]:
# Show the column labels; presumably used to translate the selected feature
# indices printed by the previous cell into column names.
df.columns

Index(['Time', 'Date', 'Day of the week', 'CarCount', 'BikeCount', 'BusCount',
       'TruckCount', 'Total', 'Traffic Situation'],
      dtype='object')

In [11]:
# Restrict the model inputs to the five vehicle-count columns and take the
# encoded traffic situation as the target.
X = df.loc[:, ['CarCount', 'BusCount', 'BikeCount', 'TruckCount', 'Total']]
y = df.loc[:, 'Traffic Situation']

### Modelling

In [12]:
from sklearn.model_selection import train_test_split

# Re-split using the reduced feature set: 80% train / 20% test, with a fixed
# seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest baseline. The estimator is seeded (same seed as the train/test
# split) so that the reported accuracy is reproducible on Restart & Run All;
# without it every run grows a different forest and prints a different score.
RForest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
RForest_clf.fit(X_train, y_train)

# Accuracy on the held-out 20%, as a percentage.
y_pred = RForest_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)*100
print(f'Accuracy: {accuracy}')

Accuracy: 98.99328859060402


In [14]:
import xgboost as xgb

# Gradient-boosted trees with library defaults, trained on the same split as
# the random forest. `accuracy_score` comes from the previous cell's import.
xgb_classifier = xgb.XGBClassifier().fit(X_train, y_train)

# Accuracy on the held-out 20%, as a percentage.
y_pred = xgb_classifier.predict(X_test)
accuracy = 100 * accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 100.0
