In [5]:
#Import the required libraries

import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

In [2]:
#Load the dataset into a Pandas DataFrame

df = pd.read_csv('Dataset_Uber Traffic.csv')

In [3]:
#Display the first few rows of the dataset

print(df.head())

        DateTime  Junction  Vehicles           ID
0  01/11/15 0:00         1        15  20151101001
1  01/11/15 1:00         1        13  20151101011
2  01/11/15 2:00         1        10  20151101021
3  01/11/15 3:00         1         7  20151101031
4  01/11/15 4:00         1         9  20151101041


In [4]:
#Summary statistics

print(df.describe())

           Junction      Vehicles            ID
count  48120.000000  48120.000000  4.812000e+04
mean       2.180549     22.791334  2.016330e+10
std        0.966955     20.750063  5.944854e+06
min        1.000000      1.000000  2.015110e+10
25%        1.000000      9.000000  2.016042e+10
50%        2.000000     15.000000  2.016093e+10
75%        3.000000     29.000000  2.017023e+10
max        4.000000    180.000000  2.017063e+10


In [5]:
#Information about the dataset

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   DateTime  48120 non-null  object
 1   Junction  48120 non-null  int64 
 2   Vehicles  48120 non-null  int64 
 3   ID        48120 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB
None


In [6]:
#Check for missing values

print(df.isnull().sum())

DateTime    0
Junction    0
Vehicles    0
ID          0
dtype: int64


In [7]:
#Since there are no missing values, we proceed with cleaning the dataset.
#Identify duplicates

print(df.duplicated().sum())

0


In [8]:
#Since there are no duplicate values, we proceed with correcting the datatypes.
#Identify datatypes

df.dtypes

DateTime    object
Junction     int64
Vehicles     int64
ID           int64
dtype: object

In [11]:
#To aggregate traffic data-
#Step 1: Parse DateTime column

df['DateTime'] = pd.to_datetime(df['DateTime'])

In [12]:
#Step 2: Set DateTime as index

df.set_index('DateTime', inplace=True)

In [14]:
#Step 3: Group by Junction and resample the data to hourly intervals, aggregating the relevant data 
#(vehicle counts, speeds, and congestion levels)

hourly_traffic = df.groupby('Junction').resample('H').sum()

In [16]:
#Step 4: Reset the index to have a clean DataFrame

hourly_traffic = hourly_traffic.reset_index(drop=True)

print(hourly_traffic)

       Junction  Vehicles            ID
0           1.0      15.0  2.015110e+10
1           1.0      13.0  2.015110e+10
2           1.0      10.0  2.015110e+10
3           1.0       7.0  2.015110e+10
4           1.0       9.0  2.015110e+10
...         ...       ...           ...
84547       4.0      10.0  2.017061e+10
84548       4.0       7.0  2.017061e+10
84549       4.0       8.0  2.017061e+10
84550       4.0      11.0  2.017061e+10
84551       4.0      11.0  2.017061e+10

[84552 rows x 3 columns]


In [19]:
#Extract the numerical columns to be normalized or standardized

numerical_columns = ['Junction', 'Vehicles', 'ID']

#Normalize the data

scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(df[numerical_columns])
df_normalized = pd.DataFrame(normalized_data, columns=numerical_columns)
print(df_normalized.head())

   Junction  Vehicles            ID
0       0.0  0.078212  0.000000e+00
1       0.0  0.067039  5.120530e-07
2       0.0  0.050279  1.024106e-06
3       0.0  0.033520  1.536159e-06
4       0.0  0.044693  2.048212e-06


In [25]:
#Creating new features from raw data
#Step 1: Time-Based Features

df['DateTime'] = df.index

#Now, let's proceed with extracting time-based features
df['Hour'] = df['DateTime'].dt.hour
df['DayOfWeek'] = df['DateTime'].dt.dayofweek  # Monday=0, Sunday=6
df['Month'] = df['DateTime'].dt.month

In [26]:
#Step 2: Lag Features (Example: Previous Hour Traffic Data)

df['PreviousHourVehicles'] = df['Vehicles'].shift(1)

In [27]:
#Step 3: Binary Indicators

df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)  # 1 for weekends, 0 for weekdays
print(df.head())

                     Junction  Vehicles            ID            DateTime  \
DateTime                                                                    
2015-01-11 00:00:00       1.0      15.0  2.015110e+10 2015-01-11 00:00:00   
2015-01-11 01:00:00       1.0      13.0  2.015110e+10 2015-01-11 01:00:00   
2015-01-11 02:00:00       1.0      10.0  2.015110e+10 2015-01-11 02:00:00   
2015-01-11 03:00:00       1.0       7.0  2.015110e+10 2015-01-11 03:00:00   
2015-01-11 04:00:00       1.0       9.0  2.015110e+10 2015-01-11 04:00:00   

                     Hour  DayOfWeek  Month  PreviousHourVehicles  IsWeekend  
DateTime                                                                      
2015-01-11 00:00:00     0          6      1                   NaN          1  
2015-01-11 01:00:00     1          6      1                  15.0          1  
2015-01-11 02:00:00     2          6      1                  13.0          1  
2015-01-11 03:00:00     3          6      1                  10.0

In [29]:
#Evaluating feature importance
#Step 1: Correlation Analysis

correlation_matrix = df.corr()
target_correlation = correlation_matrix['Vehicles'].abs().sort_values(ascending=False)
print("Correlation with Target (Vehicles):\n", target_correlation)

Correlation with Target (Vehicles):
 Vehicles                1.000000
PreviousHourVehicles    0.969982
Junction                0.613787
ID                      0.227974
Hour                    0.219938
IsWeekend               0.096628
DayOfWeek               0.084059
Month                   0.016758
Name: Vehicles, dtype: float64


  correlation_matrix = df.corr()


In [33]:
#Step 2: Feature Importance from Random Forest

imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

#Train RandomForestRegressor on the imputed data

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_imputed, y)

#Extract feature importance scores

feature_importance = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nFeature Importance from Random Forest:\n", feature_importance)


Feature Importance from Random Forest:
 PreviousHourVehicles    0.945431
Hour                    0.022886
ID                      0.015612
Junction                0.005546
Month                   0.004960
DayOfWeek               0.004868
IsWeekend               0.000698
dtype: float64


In [9]:
#Step 3: Extract feature importance scores

feature_importance = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

# Select the top N most influential features (e.g., top 5 features)

top_features = feature_importance[:5].index.tolist()

# Get the indices of the top features in the original feature matrix X

top_feature_indices = [X.columns.get_loc(feature) for feature in top_features]

# Subset the feature matrix to include only the selected top features

X_selected = X_imputed[:, top_feature_indices]


NameError: name 'rf' is not defined