In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [113]:
# Load the raw dataset; the relative path is resolved against the working directory.
CSV_PATH = "Metro_Interstate_Traffic_Volume.csv"
df = pd.read_csv(CSV_PATH)

In [114]:
# Print each column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   holiday              61 non-null     object 
 1   temp                 48204 non-null  float64
 2   rain_1h              48204 non-null  float64
 3   snow_1h              48204 non-null  float64
 4   clouds_all           48204 non-null  int64  
 5   weather_main         48204 non-null  object 
 6   weather_description  48204 non-null  object 
 7   date_time            48204 non-null  object 
 8   traffic_volume       48204 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 3.3+ MB


In [115]:
# Drop two columns that are not used as features:
#   - holiday: populated in only 61 of the 48,204 rows (see the df.info() output above);
#   - weather_description: a more detailed wording of the weather_main column.
# `columns=` already selects the axis, so no `axis=1` is needed, and the result is
# rebound rather than mutated in place.
df = df.drop(columns=["holiday", "weather_description"])

In [116]:
# Keep only rows whose weather_main is one of these nine categories;
# rows with any other value are discarded.
KEPT_WEATHER = [
    "Clouds", "Clear", "Mist", "Rain", "Snow",
    "Drizzle", "Haze", "Thunderstorm", "Fog",
]

# .copy() makes the filtered result an independent frame, so columns can be
# added to it later without triggering pandas' SettingWithCopyWarning.
df = df[df["weather_main"].isin(KEPT_WEATHER)].copy()


In [117]:
# Number of rows per weather category, most frequent first.
weather_counts = df["weather_main"].value_counts()
weather_counts

weather_main
Clouds          15164
Clear           13391
Mist             5950
Rain             5672
Snow             2876
Drizzle          1821
Haze             1360
Thunderstorm     1034
Fog               912
Name: count, dtype: int64

In [118]:
# Parse the timestamp strings once; every feature below is derived from this Series.
timestamps = pd.to_datetime(df["date_time"], format="%Y-%m-%d %H:%M:%S")

# new_df keeps the parsed date_time column alongside the derived features.
# assign() returns a fresh frame, so nothing is written onto a frame that may be
# a filtered slice (which would raise SettingWithCopyWarning).
new_df = df.assign(
    date_time=timestamps,
    # Date features
    year=timestamps.dt.year,
    month=timestamps.dt.month,
    day=timestamps.dt.day,
    day_of_week=timestamps.dt.dayofweek,  # Monday=0 ... Sunday=6
    day_of_year=timestamps.dt.dayofyear,
    # Time features
    hour=timestamps.dt.hour,
    minute=timestamps.dt.minute,
)

# df continues without the raw timestamp column; new_df still has it.
df = new_df.drop(columns=["date_time"])

In [119]:
# Display the frame after feature extraction (date_time was dropped in the previous cell).
df

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,weather_main,traffic_volume,year,month,day,day_of_week,day_of_year,hour,minute
0,288.28,0.0,0.0,40,Clouds,5545,2012,10,2,1,276,9,0
1,289.36,0.0,0.0,75,Clouds,4516,2012,10,2,1,276,10,0
2,289.58,0.0,0.0,90,Clouds,4767,2012,10,2,1,276,11,0
3,290.13,0.0,0.0,90,Clouds,5026,2012,10,2,1,276,12,0
4,291.14,0.0,0.0,75,Clouds,4918,2012,10,2,1,276,13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48199,283.45,0.0,0.0,75,Clouds,3543,2018,9,30,6,273,19,0
48200,282.76,0.0,0.0,90,Clouds,2781,2018,9,30,6,273,20,0
48201,282.73,0.0,0.0,90,Thunderstorm,2159,2018,9,30,6,273,21,0
48202,282.09,0.0,0.0,90,Clouds,1450,2018,9,30,6,273,22,0


In [120]:
# One-hot encode weather_main. With an empty prefix and separator, each new
# column is named after the category value itself (e.g. "Clouds").
df = pd.get_dummies(
    df,
    columns=["weather_main"],
    prefix="",
    prefix_sep="",
)

In [121]:
# traffic_volume is the regression target; every other column is a feature.
target_col = "traffic_volume"
y = df[target_col]
x = df.drop(columns=[target_col])

In [122]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=43,
)

In [142]:
from sklearn.ensemble import RandomForestRegressor 

# Random forest with default hyperparameters. The seed is fixed (same value as the
# train/test split) so that re-running the notebook rebuilds the same forest and
# reproduces the same scores.
model = RandomForestRegressor(random_state=43)

In [143]:
# Fit the regressor on the training split.
model.fit(X=x_train, y=y_train)

In [144]:
# Score the fitted regressor on the held-out split.
model.score(X=x_test, y=y_test)

0.9697497057677108

In [132]:
# Predictions for the held-out rows; the metric cells below compare these with y_test.
y_pred = model.predict(X=x_test)

In [127]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error between the true and predicted traffic volumes.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mae

189.29616334578665

In [128]:
from sklearn.metrics import mean_squared_error
# Mean squared error between the true and predicted traffic volumes.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

121100.96918149646

In [129]:
from sklearn.metrics import r2_score
# Coefficient of determination of the predictions on the held-out split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.9693994351185767

In [130]:
from sklearn.metrics import mean_squared_error
# Root mean squared error: square root of the MSE, so it is in the same units as the target.
squared_error = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(squared_error)
rmse

347.99564534846763

In [147]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor



# Candidate regressor classes (not instances); each one is instantiated with its
# default arguments by find_best_model.
models = [
    RandomForestRegressor,
    LinearRegression,
    Ridge,
    Lasso,
    ElasticNet,
    DecisionTreeRegressor,
    SVR,
    KNeighborsRegressor,
]
    
    
def find_best_model(candidates=None, train_data=None, test_data=None):
    """Fit every candidate regressor and report which one scores highest.

    Each candidate class is instantiated with its default arguments, fitted on
    the training data and scored exactly once on the test data through its
    ``score`` method. One line per candidate is printed.

    Parameters
    ----------
    candidates : iterable of classes, optional
        Classes whose instances expose ``fit(X, y)`` and ``score(X, y)``.
        Defaults to the notebook-level ``models`` list.
    train_data : tuple ``(X, y)``, optional
        Training features and target. Defaults to the notebook-level
        ``(x_train, y_train)``.
    test_data : tuple ``(X, y)``, optional
        Evaluation features and target. Defaults to the notebook-level
        ``(x_test, y_test)``.

    Returns
    -------
    tuple
        ``(best_class, best_score, model_score)`` where ``model_score`` maps
        every candidate class to its score. ``best_class`` and ``best_score``
        are ``None`` when there are no candidates.
    """
    if candidates is None:
        candidates = models
    fit_X, fit_y = (x_train, y_train) if train_data is None else train_data
    eval_X, eval_y = (x_test, y_test) if test_data is None else test_data

    model_score = {}
    best_class = None
    # Start from None rather than 0: a score can be negative, and the best of
    # several negative scores must still be reported.
    Highest_score = None

    for i in candidates:
        model = i()
        model.fit(fit_X, fit_y)

        # Score once and reuse the value; scoring can be expensive for some models.
        score = model.score(eval_X, eval_y)
        print(f"The score of algorithm {i} is {score}")

        model_score[i] = score

        if Highest_score is None or score > Highest_score:
            best_class = i
            Highest_score = score

    return best_class, Highest_score, model_score

In [148]:
# Fit and score every candidate in `models` on the shared train/test split;
# one line per model is printed.
find_best_model()

The score of algorithm <class 'sklearn.ensemble._forest.RandomForestRegressor'> is 0.9696880789288935
The score of algorithm <class 'sklearn.linear_model._base.LinearRegression'> is 0.16753529320597382
The score of algorithm <class 'sklearn.linear_model._ridge.Ridge'> is 0.167565019372502
The score of algorithm <class 'sklearn.linear_model._coordinate_descent.Lasso'> is 0.1677762035907594
The score of algorithm <class 'sklearn.linear_model._coordinate_descent.ElasticNet'> is 0.16413358903194286
The score of algorithm <class 'sklearn.tree._classes.DecisionTreeRegressor'> is 0.9415278447479396
The score of algorithm <class 'sklearn.svm._classes.SVR'> is -0.003537775078720662
The score of algorithm <class 'sklearn.neighbors._regression.KNeighborsRegressor'> is 0.7825510013332226
