In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [None]:
# Read the route/traffic CSV into a DataFrame named `data`.
# NOTE(review): absolute path; `/content` looks like a Colab working
# directory — adjust when running elsewhere.
data = pd.read_csv(
    filepath_or_buffer='/content/chandigarh_route_data_with_traffic.csv',
)

In [None]:
# Peek at the top five rows to see which columns the CSV provides
print(data.head(n=5))

   segment_id   length    road_type  \
0  1047478826  220.635  residential   
1    62061377   41.547  residential   
2  1101397654   41.120  residential   
3  1101397654   15.434  residential   
4    62061382  122.148  residential   

                                            geometry  start_lat  start_lon  \
0  LINESTRING (76.8448867 30.7503347, 76.8445799 ...  30.750335  76.844887   
1  LINESTRING (76.7914222 30.77316, 76.7914546 30...  30.773160  76.791422   
2  LINESTRING (76.7914222 30.77316, 76.7918511 30...  30.773160  76.791422   
3  LINESTRING (76.7914222 30.77316, 76.7912613 30...  30.773160  76.791422   
4  LINESTRING (76.7935265 30.7729959, 76.7931338 ...  30.772996  76.793526   

     end_lat    end_lon  temperature weather_type  visibility  wind_speed  \
0  30.749922  76.842673        23.85        Clear       10000        3.17   
1  30.773533  76.791455        24.79        Clear       10000        3.31   
2  30.773129  76.791851        24.79        Clear       10000    

In [None]:
# Preprocess data
# One-hot encode the categorical `traffic_condition` column. With
# drop_first=True the first category level is dropped, so the remaining levels
# become indicator columns named `traffic_condition_<level>` (the feature
# selection further down reads `traffic_condition_low` and
# `traffic_condition_medium`).
#
# The guard keeps this cell idempotent: `data` is reassigned and the source
# column is consumed by the encoding, so re-running the cell without the check
# would raise a KeyError because `traffic_condition` no longer exists.
if 'traffic_condition' in data.columns:
    data = pd.get_dummies(data, columns=['traffic_condition'], drop_first=True)

In [None]:
# Re-inspect the first five rows now that `traffic_condition` has been encoded
print(data.head(n=5))

   segment_id   length    road_type  \
0  1047478826  220.635  residential   
1    62061377   41.547  residential   
2  1101397654   41.120  residential   
3  1101397654   15.434  residential   
4    62061382  122.148  residential   

                                            geometry  start_lat  start_lon  \
0  LINESTRING (76.8448867 30.7503347, 76.8445799 ...  30.750335  76.844887   
1  LINESTRING (76.7914222 30.77316, 76.7914546 30...  30.773160  76.791422   
2  LINESTRING (76.7914222 30.77316, 76.7918511 30...  30.773160  76.791422   
3  LINESTRING (76.7914222 30.77316, 76.7912613 30...  30.773160  76.791422   
4  LINESTRING (76.7935265 30.7729959, 76.7931338 ...  30.772996  76.793526   

     end_lat    end_lon  temperature weather_type  visibility  wind_speed  \
0  30.749922  76.842673        23.85        Clear       10000        3.17   
1  30.773533  76.791455        24.79        Clear       10000        3.31   
2  30.773129  76.791851        24.79        Clear       10000    

In [None]:
# Target (y): the `travel_time` column
y = data.loc[:, 'travel_time']
# Features (X): the `length` and `average_speed` columns plus the two
# traffic-condition indicator columns
X = data.loc[:, ['length', 'average_speed', 'traffic_condition_low', 'traffic_condition_medium']]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter candidates: 3 x 3 x 3 = 27 combinations in total
param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
)

# Exhaustive search over the grid with 3-fold cross-validation on the training
# split, scored by negative mean absolute error
grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
)
grid_search.fit(X_train, y_train)

# Keep the estimator built from the best-scoring parameter combination
best_model = grid_search.best_estimator_


In [None]:
# Create an unfitted gradient-boosting regressor; only the seed is set, so every
# other hyperparameter stays at its library default.
# NOTE(review): the tuned `best_model` from the grid-search cell is not used
# here — confirm that fitting a default-configured model is intentional.
model = GradientBoostingRegressor(
    random_state=42,
)

In [None]:
# Train the regressor on the training split
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test rows
y_pred = model.predict(X=X_test)

In [None]:
# Pair each importance score from the fitted model with its feature name,
# then print them from most to least important
feature_importances = pd.Series(
    data=model.feature_importances_,
    index=X.columns,
)
print(
    "Feature importances:\n",
    feature_importances.sort_values(ascending=False),
)

Feature importances:
 length                      0.684461
average_speed               0.315023
traffic_condition_low       0.000514
traffic_condition_medium    0.000001
dtype: float64
