<a href="https://colab.research.google.com/github/mohan22iitk/Uber-SURGE-Pricing/blob/main/Uber_SURGE_pricing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Read the CSV files
cab_df = pd.read_csv("/content/cab_rides.csv", delimiter=',', on_bad_lines='skip')
weather_df = pd.read_csv("/content/weather.csv", delimiter=',')

# Proceed with your analysis
print(cab_df.head())
print(weather_df.head())


   distance cab_type     time_stamp    destination            source  price  \
0      0.44     Lyft  1544952607890  North Station  Haymarket Square    5.0   
1      0.44     Lyft  1543284023677  North Station  Haymarket Square   11.0   
2      0.44     Lyft  1543366822198  North Station  Haymarket Square    7.0   
3      0.44     Lyft  1543553582749  North Station  Haymarket Square   26.0   
4      0.44     Lyft  1543463360223  North Station  Haymarket Square    9.0   

   surge_multiplier                                    id    product_id  \
0               1.0  424553bb-7174-41ea-aeb4-fe06d4f4b9d7     lyft_line   
1               1.0  4bd23055-6827-41c6-b23b-3c491f24e74d  lyft_premier   
2               1.0  981a3613-77af-4620-a42a-0c0866077d1e          lyft   
3               1.0  c2d88af2-d278-4bfd-a8d0-29ca77cc5512   lyft_luxsuv   
4               1.0  e0126e1f-8ca9-4f2e-82b3-50505a09db9a     lyft_plus   

           name  
0        Shared  
1           Lux  
2          Lyft  
3 

In [4]:
# Convert timestamps to datetime format
cab_df['date_time'] = pd.to_datetime(cab_df['time_stamp'] / 1000, unit='s')
weather_df['date_time'] = pd.to_datetime(weather_df['time_stamp'], unit='s')

# Check the first few rows of cab_df
print(cab_df.head())

# Ensure there is a common column for merging, such as 'merge_date'
cab_df['merge_date'] = cab_df['date_time'].dt.floor('H')
weather_df['merge_date'] = weather_df['date_time'].dt.floor('H')

# Merge the dataframes on 'merge_date'
final_dataframe = cab_df.merge(weather_df, on='merge_date', suffixes=('', '_w'))

# Drop rows with null values
final_dataframe = final_dataframe.dropna(axis=0)

# Create 'day' and 'hour' columns
final_dataframe['day'] = final_dataframe['date_time'].dt.dayofweek
final_dataframe['hour'] = final_dataframe['date_time'].dt.hour

# Check the final dataframe
print(final_dataframe.head())


   distance cab_type     time_stamp    destination            source  price  \
0      0.44     Lyft  1544952607890  North Station  Haymarket Square    5.0   
1      0.44     Lyft  1543284023677  North Station  Haymarket Square   11.0   
2      0.44     Lyft  1543366822198  North Station  Haymarket Square    7.0   
3      0.44     Lyft  1543553582749  North Station  Haymarket Square   26.0   
4      0.44     Lyft  1543463360223  North Station  Haymarket Square    9.0   

   surge_multiplier                                    id    product_id  \
0               1.0  424553bb-7174-41ea-aeb4-fe06d4f4b9d7     lyft_line   
1               1.0  4bd23055-6827-41c6-b23b-3c491f24e74d  lyft_premier   
2               1.0  981a3613-77af-4620-a42a-0c0866077d1e          lyft   
3               1.0  c2d88af2-d278-4bfd-a8d0-29ca77cc5512   lyft_luxsuv   
4               1.0  e0126e1f-8ca9-4f2e-82b3-50505a09db9a     lyft_plus   

           name                     date_time  
0        Shared 2018-12-16

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataframe['day'] = final_dataframe['date_time'].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataframe['hour'] = final_dataframe['date_time'].dt.hour


In [5]:
#make a coloumn of merge date containing date merged with the location so that we can join the two
cab_df['merge_date'] = cab_df.source.astype(str) +" - "+ cab_df.date_time.dt.date.astype("str") +" - "+ cab_df.date_time.dt.hour.astype("str")
weather_df['merge_date'] = weather_df.location.astype(str) +" - "+ weather_df.date_time.dt.date.astype("str") +" - "+ weather_df.date_time.dt.hour.astype("str")

# change the index to merge_date column so joining the two datasets will not generate any error.

weather_df.index = weather_df['merge_date']

# we ignored surge value of more than 3 because the samples are very less for surge_multiplier›3 surge_dataframe = final_dataframe[final_dataframe.surge_multiplier ‹ 3]
surge_dataframe = final_dataframe[final_dataframe.surge_multiplier < 3]

In [6]:
# feature selection-→› we are selecting the most relevant features from the dataset

x = surge_dataframe[['distance', 'day', 'hour', 'temp' , 'clouds', 'pressure', 'humidity', 'wind', 'rain']]

y = surge_dataframe['surge_multiplier']

In [7]:
le = LabelEncoder()

#ignoring multiplier of 3 as there are only 2 values in our dataset
le.fit([1,1.25,1.5,1.75,2.,2.25,2.5])
y = le.transform(y)

feature_list = list(x.columns)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3 ,random_state=42)

In [8]:
unique, counts = np. unique(y_train, return_counts=True)
print(dict(zip(unique, counts)))

{0: 93710, 1: 1600, 2: 796, 3: 416, 4: 463, 6: 8}


In [9]:
from imblearn.over_sampling import SMOTE
sm = SMOTE( random_state=42)

train_features,train_labels = sm.fit_resample(x_train,y_train)

In [10]:
model = RandomForestClassifier(n_jobs=-1, random_state = 42, class_weight = "balanced")

model.fit(x_train,y_train)
y_pred = model.predict(x_test)

In [11]:
# Get numerical feature importances
importances = list(model. feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature,importance in zip(feature_list,importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

Variable: distance             Importance: 0.52
Variable: wind                 Importance: 0.14
Variable: pressure             Importance: 0.1
Variable: hour                 Importance: 0.06
Variable: temp                 Importance: 0.05
Variable: rain                 Importance: 0.04
Variable: clouds               Importance: 0.03
Variable: humidity             Importance: 0.03
Variable: day                  Importance: 0.02


[None, None, None, None, None, None, None, None, None]

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
f1_score = f1_score(y_test,y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)


print("Accuracy Score:", accuracy)
print("F1 Score:", f1_score)

Accuracy Score: 0.958839519834492
F1 Score: 0.955130502965452
