# NYC Cab Trip Duration Prediction

### Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from warnings import filterwarnings
filterwarnings("ignore")


In [2]:
# Load the January-June 2022 yellow-taxi trip files and draw the same number
# of rows from every month, so each month is equally represented in `data`.
ROWS_PER_MONTH = 121553
monthly_frames = [
    pd.read_parquet(f'yellow_tripdata_2022-{month:02d}.parquet')
    for month in range(1, 7)
]
df1, df2, df3, df4, df5, df6 = monthly_frames  # keep the original per-month names

# A fixed seed makes the sample, and everything derived from it, reproducible.
data=pd.concat([frame.sample(ROWS_PER_MONTH, random_state=21) for frame in monthly_frames])

FileNotFoundError: [Errno 2] No such file or directory: 'yellow_tripdata_2022-01.parquet'

Reading the data from the Parquet files for the months of January to June 2022

In [None]:
data

### Data preprocessing/Munging

In [None]:

data = data.reset_index()

In [None]:
data

In [None]:
data=data.drop(labels=['index'],axis=1)

In [None]:
data

Read another dataset that contains NYC Taxi Zones

In [None]:
df=pd.read_csv("taxi_zones.csv")

In [None]:
df

In [None]:
from geopy.geocoders import ArcGIS
import folium as f

In [None]:

loc=ArcGIS()

In [None]:
nyc=loc.geocode("New York") 

Using geopy.geocoders we fetch the geocodes of New York

In [None]:
print(nyc.latitude)
print(nyc.longitude)

Above is the latitude and longitude of New York

In [None]:
# data =df['zone'].sample(50)
#data.apply(loc.geocode)

In [None]:
# Look up every taxi-zone name with the ArcGIS geocoder held in `loc`; the
# result of each lookup is stored in the new "Location" column.
# NOTE(review): presumably this issues one remote request per zone, so the cell
# needs network access and can be slow — confirm against the geopy docs.
df["Location"]=df["zone"].apply(loc.geocode)

Apply geocode to the 'zone' column of the taxi-zone dataset to look up a location for every zone

In [None]:
# A zone the geocoder could not resolve is stored as None, which has no
# .longitude/.latitude attribute; map those rows to NaN instead of raising.
df["longitude"]=df["Location"].apply(lambda x: x.longitude if x is not None else np.nan)
df["latitude"]=df["Location"].apply(lambda x: x.latitude if x is not None else np.nan)

Separating longitude and latitude from the Location column

In [None]:
df

In [None]:
df=df.drop(columns=["OBJECTID","Shape_Leng","the_geom","Shape_Area","borough","Location"])
df

Drop the columns that are not needed for the analysis

In [None]:
data

In [None]:
data1=data.merge(df, left_on='PULocationID',right_on='LocationID')
data1

Merging two datasets to make as one whole dataset

In [None]:
data1 = data1.rename(columns={'longitude': 'pickup_longitude', 'latitude': 'pickup_latitude'})


In [None]:
data1=data1.drop(['LocationID','zone'],axis=1)
data1

In [None]:
data1=data1.merge(df, left_on='DOLocationID',right_on='LocationID')


In [None]:
data1 = data1.rename(columns={'longitude': 'dropoff_longitude', 'latitude': 'dropoff_latitude'})
data1=data1.drop('LocationID',axis=1)
data1

In [None]:
data1.to_csv("final_yellow_tripdata.csv")

converted parquet data to csv

In [None]:
# Trip length in whole minutes: the absolute gap between drop-off and pick-up,
# expressed in seconds, divided by 60 and rounded.
pk_time = pd.to_datetime(data1["tpep_pickup_datetime"])
dk_time = pd.to_datetime(data1["tpep_dropoff_datetime"])
elapsed_seconds = (dk_time - pk_time).abs() / np.timedelta64(1, "s")
D = (elapsed_seconds / 60).round()

data1["trip_duration"] = D
data1

Converting 'tpep_dropoff_datetime' and 'tpep_pickup_datetime' columns from string to date-time format.
Also calculate 'trip_duration' based on pickup and dropoff time

In [None]:
# Work on a random subset to keep the later steps fast. The seed makes the
# subset reproducible; the cap avoids an error when fewer rows are available.
df=data1.sample(min(300000, len(data1)), random_state=21)

Restricting data to 300,000 rows for faster calculations

In [None]:
df.head(20)    # first 20 rows: the top view that pairs with the tail() cell below

In [None]:
df.tail(20) #bottom view

Drop statistically unimportant columns

In [None]:
df=df.drop(labels=['VendorID','RatecodeID','store_and_fwd_flag'],axis=1)

In [None]:
df.columns  #columns overview

In [None]:
df.shape

In [None]:
df.isna().sum()     

In [None]:
df=df.dropna()

In [None]:
df.describe()    #statastical summary 

### Exploratory Data Analysis

In [None]:
plt.figure(figsize=(10,10))                   #Analysis using heatmap
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps text columns such as 'zone' and the datetime columns from making
# DataFrame.corr() raise on pandas >= 2.0.
sb.heatmap(df.select_dtypes(include="number").corr(),annot=True,fmt='.1f')
plt.show()

Plot heat map to understand how the variables behave with each other. 

It is found that 'trip_duration' and 'fare_amount' are highly correlated. We remove 'fare_amount' to avoid multicollinearity.

In [None]:
df=df.drop(labels=["fare_amount"],axis=1)    #Removed the column due to high correlation


**Multicollinearity** occurs when two or more independent variables(also known as predictors) are highly correlated with one another in a regression model. 

This means that an independent variable can be predicted from another independent variable in a regression model. In our use case, there is no multicollinearity left once 'fare_amount' has been removed.


In [None]:
plt.figure(figsize=(20,20))
# Restrict to numeric columns: DataFrame.corr() raises on pandas >= 2.0 when
# the frame still contains text columns such as 'zone'.
sb.heatmap(df.select_dtypes(include="number").corr(),vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},annot=True)
plt.tight_layout()
plt.show()

Heat map after removing multicollinearity 

In [None]:
# sb.distplot is deprecated; histplot with stat="density" draws the same
# density-scaled histogram with a KDE curve on top.
sb.histplot(df['trip_duration'], kde=True, bins=5, stat="density")   #Analysis of target column

In [None]:
# Box plot of trip duration for each passenger count, labelled through the
# Axes object that seaborn returns.
ax = sb.boxplot(data=df, x="passenger_count", y="trip_duration")
ax.set_title("Average Trip Duration for Number of Passengers")
ax.set_xlabel("passenger_count")
ax.set_ylabel("Trip Duration");

The plot for 'Average Trip Duration for Number of Passengers' shows more number of cab trips take place with less number of people on board generally with 1 or 2 passengers

In [None]:
sb.barplot(x="passenger_count", y="trip_duration",data=df);
plt.title("Average Trip Duration for Number of Passengers");
plt.xlabel("passenger_count");
plt.ylabel("Trip Duration");

In [None]:
df['tpep_pickup_datetime']=pd.to_datetime(df["tpep_pickup_datetime"])
df['tpep_dropoff_datetime']=pd.to_datetime(df["tpep_dropoff_datetime"])

In [None]:
# type() of a column is always pandas Series; the dtype is what shows whether
# the conversion to datetime worked.
print(df['tpep_pickup_datetime'].dtype)
print(df['tpep_dropoff_datetime'].dtype)

In [None]:
#Function to convert date-time features into date & time 
def convert_to_date_dtype(Dataframe, col, fmt='%d-%m-%Y %H:%M'):
    """Parse a date-time column and add day/month/hour/minute feature columns.

    The frame is modified in place: ``col`` ends up as a datetime column and
    four columns are added next to it, named ``<col>_day`` (day of week,
    Monday=0 ... Sunday=6), ``<col>_month``, ``<col>_hour`` and
    ``<col>_minute``.

    Parameters
    ----------
    Dataframe : pandas.DataFrame
        Frame holding the column; mutated in place.
    col : str
        Name of the date-time column.
    fmt : str, optional
        ``strftime`` pattern used when the column still has to be parsed.
        Defaults to the day-first pattern this function always used, so
        existing calls behave as before; pass another pattern for text in a
        different layout (for example ISO timestamps read back from a CSV).

    Returns
    -------
    None
    """
    # Parse only when needed: a column that is already datetime64 has nothing
    # to gain from a text pattern and is left as it is.
    if not pd.api.types.is_datetime64_any_dtype(Dataframe[col]):
        Dataframe[col] = pd.to_datetime(Dataframe[col], format=fmt)

    stamps = Dataframe[col].dt
    Dataframe[col+'_day'] = stamps.dayofweek
    Dataframe[col+'_month'] = stamps.month
    Dataframe[col+'_hour'] = stamps.hour
    Dataframe[col+'_minute'] = stamps.minute

Function to convert 'tpep_pickup_datetime' and 'tpep_dropoff_datetime' from object column to datetime column. We can confirm it in the below cell.

In [None]:

convert_to_date_dtype(df, 'tpep_pickup_datetime')
convert_to_date_dtype(df, 'tpep_dropoff_datetime')

df[['tpep_pickup_datetime', 'tpep_dropoff_datetime']].info()

'tpep_pickup_datetime' and 'tpep_dropoff_datetime' columns are further divided into respective tpep_pickup_datetime_day, tpep_pickup_datetime_month, tpep_pickup_datetime_hour, tpep_pickup_datetime_minute columns and tpep_dropoff_datetime_day, tpep_dropoff_datetime_month, tpep_dropoff_datetime_hour, tpep_dropoff_datetime_minute columns for accurate prediction.

In [None]:

df

Map for one pickup and dropoff location based on the latitude and longitude of New york

In [None]:
map=f.Map(location=[40.71455000000003,-74.00713999999994])
map.add_child(f.Marker(location=[40.71455000000003,-74.00713999999994])) #map view of New York city

In [None]:
import folium

location = [40.730610, -73.935242]
location2=[40.708469,-74.017120]

# The keyword was misspelled as 'hight', which is not folium.Map's height
# parameter, so the requested 400px height was never applied.
map=folium.Map(location=location ,width=800,height=400,zoom_start = 10,)


* Map view for Pickup Points

In [None]:
pickup = df[["pickup_longitude", "pickup_latitude"]]
# (latitude, longitude) tuples, one per trip.
# NOTE: this rebinds `data`, which until now held the raw trips DataFrame.
data = list(zip(pickup["pickup_latitude"].values, pickup["pickup_longitude"].values))
type(data[0][0])

In [None]:
pickup = pd.DataFrame(data)
pickup.head()

In [None]:
from folium.plugins import HeatMap    # import library heatmap
pickup_map = folium.Map(location = location, zoom_start = 10,)

hm_wide = HeatMap( pickup.values,
                     min_opacity= 0.2,
                     radius= 5, blur= 8,
                     max_zoom= 1 
                 )

pickup_map.add_child(hm_wide)

pickup_map

* Map view for Dropoff Points

In [None]:
# Build the drop-off points from the drop-off coordinate columns (latitude
# first, the same order as the pickup points). `data` holds the pickup
# coordinates, so reusing it here would draw the pickup map a second time.
dropoff = df[["dropoff_latitude", "dropoff_longitude"]]
dropoff=dropoff.dropna()
dropoff.head()

In [None]:
dropoff_map = f.Map(location = location, zoom_start = 10,)


hm_wide = HeatMap( dropoff.values,
                     min_opacity= 0.2,
                     radius= 5, blur= 8,
                     max_zoom= 1 
                 )

dropoff_map.add_child(hm_wide)

dropoff_map

The blue coloured saturation is for the different areas of New York for both pickup and dropoff  

In [None]:
df=df.drop(labels=["tpep_pickup_datetime","tpep_dropoff_datetime"],axis=1)  #Droping the columns

In [None]:
df.info()  #Overview/Summary of Dataframe

In [None]:
df.head()

In [None]:
df['passenger_count'].value_counts()

In [None]:
df['passenger_count'] = df['passenger_count'].fillna(1)

In [None]:
df['passenger_count']

In [None]:
df.isna().sum()

In [None]:
df["passenger_count"].unique()   #cheking data element in the columns}

In [None]:
plt.figure(figsize=(16, 6)) 
plt.title('tpep_pickup_datetime_hour')
plt.xlabel('Different Times of the Day')

sb.countplot(x='tpep_pickup_datetime_hour', data=df, palette=("crest"))


In [None]:
plt.figure(figsize=(16, 6)) 
plt.title('hour')
plt.xlabel('Different Times of the Day')

sb.countplot(x='tpep_dropoff_datetime_hour', data=df, palette=("crest"))


The graphs show count of passengers using the taxi at each hour of the day. It is clear that the city
has high traffic from 8 am  to 11 pm as most people travel to work at these hours and traffic peaks at evening 5 pm to 7 pm.

In [None]:
plt.figure(figsize=(16, 6)) 
plt.title('Pick-offs on Seven Days of a Week')
plt.xlabel("Days of a Week")

sb.countplot(x='tpep_pickup_datetime_day', data=df, palette=("crest"))


In [None]:
plt.figure(figsize=(16, 6)) 
plt.title('Drop-offs on Seven Days of a Week')
plt.xlabel('Days of a  Week')

sb.countplot(x='tpep_dropoff_datetime_day', data=df, palette=("crest"))


The graphs show how the traffic varies from weekdays to weekends. The day numbers come from pandas `dt.dayofweek`, which counts from Monday:
0 - Monday
1 - Tuesday
2 - Wednesday
3 - Thursday
4 - Friday
5 - Saturday
6 - Sunday

* Checking outliers

In [None]:
sb.boxplot(x=df["passenger_count"])

In [None]:
sb.boxplot(x=df["trip_duration"])

In [None]:
df["trip_duration"].value_counts()

In [None]:
sb.boxplot(x=df["tpep_pickup_datetime_day"])

In [None]:
sb.boxplot(x=df["tpep_pickup_datetime_month"])

In [None]:
sb.boxplot(x=df["tpep_pickup_datetime_hour"])

In [None]:
sb.boxplot(x=df["tpep_pickup_datetime_minute"])

In [None]:
sb.boxplot(x=df["tpep_dropoff_datetime_day"])

In [None]:
sb.boxplot(x=df["tpep_dropoff_datetime_month"])

In [None]:
sb.boxplot(x=df["tpep_dropoff_datetime_hour"])

In [None]:
sb.boxplot(x=df["tpep_dropoff_datetime_minute"])

In [None]:
sb.boxplot(x=df["pickup_longitude"])

In [None]:
sb.boxplot(x=df["pickup_latitude"])

In [None]:
sb.boxplot(x=df["dropoff_latitude"])

In [None]:
sb.boxplot(x=df["dropoff_longitude"])

In [None]:
df.columns

In [None]:
df=df.drop(labels=['trip_distance','PULocationID', 'DOLocationID',
       'payment_type', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount','total_amount',
       'improvement_surcharge','congestion_surcharge',
       'airport_fee'],axis=1)

df

In [None]:
# The frame still holds the text column 'zone'; limit the skewness to numeric
# columns so the call does not raise on pandas >= 2.0.
df.skew(numeric_only=True)

skewness before outlier treatment

In [None]:
def outlier_detect(df):
    """Replace IQR outliers with the column median, in place.

    For every column reported by ``df.describe()``, the quartiles Q1 and Q3
    are read from that summary, and any value below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR`` is overwritten with the median of the column as
    it was before the replacement. Missing values are never treated as
    outliers and stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after the replacement.
    """
    # One summary is enough: the quartiles of a column do not depend on the
    # changes made to the other columns.
    summary = df.describe()
    for column in summary.columns:
        q1 = summary.at['25%', column]
        q3 = summary.at['75%', column]
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr

        values = df[column]
        # Comparisons with NaN are False, so missing values are left alone.
        is_outlier = (values < lower) | (values > upper)
        df[column] = values.mask(is_outlier, values.median())
    return df

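Function for outlier treatment: first we take the 25th percentile of the data as Q1 and the 75th percentile as Q3. Then we define the Inter Quartile Range (IQR) as Q3 - Q1. 
Then we define the Lower Tube Value (LTV) as Q1 - 1.5 * IQR and the Upper Tube Value (UTV) as Q3 + 1.5 * IQR. Values outside this range are replaced by the median of the column.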

In [None]:
new_df=outlier_detect(df)

new_df after replacing the outliers with the column median.

In [None]:
new_df[["trip_duration"]]

In [None]:
new_df["passenger_count"].value_counts()

In [None]:
sb.boxplot(x=new_df["passenger_count"])

In [None]:
sb.boxplot(x=new_df["trip_duration"])

In [None]:
sb.boxplot(x=new_df["tpep_pickup_datetime_month"])

In [None]:
sb.boxplot(x=new_df["tpep_pickup_datetime_day"])

In [None]:
sb.boxplot(x=new_df["tpep_pickup_datetime_hour"])

In [None]:
sb.boxplot(x=new_df["tpep_pickup_datetime_minute"])

In [None]:
sb.boxplot(x=new_df["tpep_dropoff_datetime_day"])

In [None]:
sb.boxplot(x=new_df["tpep_dropoff_datetime_month"])

In [None]:
sb.boxplot(x=new_df["tpep_dropoff_datetime_hour"])

In [None]:
sb.boxplot(x=new_df["tpep_dropoff_datetime_minute"])

In [None]:
sb.boxplot(x=df["pickup_longitude"])

In [None]:
sb.boxplot(x=df["pickup_latitude"])

In [None]:
sb.boxplot(x=df["dropoff_longitude"])

In [None]:
sb.boxplot(x=df["dropoff_latitude"])

In [None]:
# cheking skewness

In [None]:
# Skewness after the outlier treatment, numeric columns only (the text column
# 'zone' would make the call raise on pandas >= 2.0).
new_df.skew(numeric_only=True)

In [None]:
# sb.distplot is deprecated; histplot with stat="density" draws the same
# density-scaled histogram with a KDE curve on top.
sb.histplot(new_df['passenger_count'], kde=True, bins=5, stat="density")

In [None]:
# new_df.shape
new_df.shape

Dividing columns into categorical and continuous

In [None]:
# Split the column names by dtype: object columns are categorical, everything
# else is continuous. The dtype is read from new_df itself, the frame whose
# columns are being listed.
cat = [name for name in new_df.columns if new_df[name].dtype == "object"]
con = [name for name in new_df.columns if new_df[name].dtype != "object"]

In [None]:
cat

In [None]:
new_df.head()

In [None]:
con

### Model Building

In [None]:
Y=new_df[["trip_duration"]]                    #Dependent Variable
X=new_df.drop(labels=["trip_duration","zone"],axis=1)  #Independent Variables

X.columns

### Standardizing the Data


Standardization rescales every feature to zero mean and unit variance, so that the data is internally consistent, all variables are on the same scale, and all of them contribute to the prediction.

In [None]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
# Keep the feature names and the row labels of X, so the scaled frame stays
# aligned with X and Y instead of falling back to 0..n-1 labels.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the scaling — consider fitting on xtrain only.
Xnew=pd.DataFrame(sc.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
Xnew 

In [None]:
# Take the feature names straight from X (the frame that was scaled) rather
# than from a hand-typed list that has to match its column order exactly.
Xnew.columns = X.columns

In [None]:
Xnew

In [None]:
Xnew.columns   #columns overview

Spliting Data into training and testing set

Split the data into training and testing as 67% and 33% respectively.

In [None]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(Xnew,Y,test_size=0.33,random_state=21)

In [None]:
xtest

In [None]:
xtrain

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
Model=lr.fit(xtrain,ytrain)
pred_tr=Model.predict(xtrain)
pred_ts=Model.predict(xtest)

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,accuracy_score
print('MAE:', mean_absolute_error(ytest, pred_ts))
print('MSE:', mean_squared_error(ytest, pred_ts))
print('RMSE:', np.sqrt(mean_squared_error(ytest, pred_ts)))
print('R2', r2_score(ytest,pred_ts))

In [None]:
# Visualize the performance of the model: predicted against actual trip
# duration on the test set. Points on the red diagonal are perfect predictions.
actual = np.ravel(ytest)
predicted = np.ravel(pred_ts)
plt.scatter(actual, predicted, s=2, alpha=0.3, color='b')
bounds = [actual.min(), actual.max()]
plt.plot(bounds, bounds, color='r')
plt.xlabel("Actual trip_duration")
plt.ylabel("Predicted trip_duration")
plt.show()

# Regularization of the model

# Ridge

In [None]:
tg = {"alpha":[0.991,0.992,0.993,0.994,0.995,0.996,0.997,0.998,0.999,1.001,1.002,1.003,1.004,1.005,1.006,1.007,1.008,1.009]}

In [None]:
from sklearn.linear_model import Ridge,Lasso
rr = Ridge()



from sklearn.model_selection import GridSearchCV
cv1 = GridSearchCV(rr,tg,scoring="neg_mean_absolute_error",cv=4)
cvmodel1 = cv1.fit(xtrain,ytrain)
cvmodel1.best_params_

In [None]:
# Refit with the alpha selected by the grid search (cvmodel1) instead of a
# value copied by hand, which goes stale whenever the sample changes.
rr = Ridge(**cvmodel1.best_params_)

model = rr.fit(xtrain,ytrain)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,accuracy_score
print('MAE:', mean_absolute_error(ytest, pred_ts))
print('MSE:', mean_squared_error(ytest, pred_ts))
print('RMSE:', np.sqrt(mean_squared_error(ytest, pred_ts)))
print('R2', r2_score(ytest,pred_ts))


# Predicted against actual trip duration on the test set; the red diagonal
# marks perfect predictions.
actual = np.ravel(ytest)
predicted = np.ravel(pred_ts)
plt.scatter(actual, predicted, s=2, alpha=0.3, color='b')
bounds = [actual.min(), actual.max()]
plt.plot(bounds, bounds, color='r')
plt.xlabel("Actual trip_duration")
plt.ylabel("Predicted trip_duration")
plt.show()

# Lasso

In [None]:
ls = Lasso()
from sklearn.model_selection import GridSearchCV
cv2 = GridSearchCV(ls,tg,scoring="neg_mean_absolute_error",cv=4)
cvmodel2 = cv2.fit(xtrain,ytrain)
cvmodel2.best_params_

In [None]:

# Refit with the alpha selected by the grid search (cvmodel2) instead of a
# value copied by hand, which goes stale whenever the sample changes.
ls = Lasso(**cvmodel2.best_params_)

model = ls.fit(xtrain,ytrain)

pred_tr = model.predict(xtrain)
pred_ts = model.predict(xtest)

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,accuracy_score
print('MAE:', mean_absolute_error(ytest, pred_ts))
print('MSE:', mean_squared_error(ytest, pred_ts))
print('RMSE:', np.sqrt(mean_squared_error(ytest, pred_ts)))
print('R2', r2_score(ytest,pred_ts))


# Predicted against actual trip duration on the test set; the red diagonal
# marks perfect predictions.
actual = np.ravel(ytest)
predicted = np.ravel(pred_ts)
plt.scatter(actual, predicted, s=2, alpha=0.3, color='b')
bounds = [actual.min(), actual.max()]
plt.plot(bounds, bounds, color='r')
plt.xlabel("Actual trip_duration")
plt.ylabel("Predicted trip_duration")
plt.show()

# Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures  
poly_regs= PolynomialFeatures(degree= 3)  
# Fit the feature expansion on the training data only and reuse it, unchanged,
# for the test data.
xtrain_poly= poly_regs.fit_transform(xtrain)
xtest_poly= poly_regs.transform(xtest)
lin_reg_2 =LinearRegression()  
model=lin_reg_2.fit(xtrain_poly,ytrain)  
pred_tr=model.predict(xtrain_poly)
pred_ts=model.predict(xtest_poly)

In [None]:
# r2_score is used below, so it is imported here with the other metrics.
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
print('MAE:', mean_absolute_error(ytest, pred_ts))
print('MSE:', mean_squared_error(ytest, pred_ts))
print('RMSE:', np.sqrt(mean_squared_error(ytest, pred_ts)))
print('R2', r2_score(ytest,pred_ts))


# Predicted against actual trip duration on the test set; the red diagonal
# marks perfect predictions.
actual = np.ravel(ytest)
predicted = np.ravel(pred_ts)
plt.scatter(actual, predicted, s=2, alpha=0.3, color='b')
bounds = [actual.min(), actual.max()]
plt.plot(bounds, bounds, color='r')
plt.xlabel("Actual trip_duration")
plt.ylabel("Predicted trip_duration")
plt.show()

In [None]:
lin_reg_2.score(xtest_poly,ytest)

# Support Vector Regression

In [None]:
# from sklearn.svm import SVR
# svr = SVR(kernel = 'linear')
# model=svr.fit(xtrain,ytrain)
# pred_tr=model.predict(xtrain)
# pred_ts=model.predict(xtest)
# #
# from sklearn.metrics import mean_absolute_error,mean_squared_error
# print('MAE:', mean_absolute_error(ytest, pred_ts))
# print('MSE:', mean_squared_error(ytest, pred_ts))
# print('RMSE:', np.sqrt(mean_squared_error(ytest, pred_ts)))
# print('R2', r2_score(ytest,pred_ts))
# print("------------------------------------------------")
# tr_err=mean_absolute_error(ytrain,pred_tr)
# ts_err=mean_absolute_error(ytest,pred_ts)
# print("Training error=",tr_err)
# print("Testing error=",ts_err)
# print("-------------------------------------------------")

# if(tr_err<ts_err):
#     print("model is overfited")
# else:
#     print("model is underfited")

In [None]:
# plt.plot(xtrain,pred_tr,color='r')
# plt.plot(X,Y,'b.')
# plt.xlabel("xtrain")
# plt.ylabel("pred_tr")
# plt.show()

# Knn Regression

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
knr = KNeighborsRegressor(n_neighbors=5)
model = knr.fit(xtrain,ytrain)
pred = model.predict(xtest)
# Score the KNN predictions (`pred`); `pred_ts` still holds the output of the
# model fitted before this one.
print('MAE:', mean_absolute_error(ytest, pred))
print('MSE:', mean_squared_error(ytest, pred))
print('RMSE:', np.sqrt(mean_squared_error(ytest, pred)))
print('R2', r2_score(ytest,pred))


# Hyperparameter tuning for KNeighborsRegressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error

# Test-set MAE for k = 2..9 neighbours.
neighbor_counts = list(range(2,10,1))
knn_mae = []
for i in neighbor_counts:
    knr = KNeighborsRegressor(n_neighbors=i)
    model = knr.fit(xtrain,ytrain)
    pred = model.predict(xtest)

    knn_mae.append(mean_absolute_error(ytest,pred))
    print(i,knn_mae[-1])

# Plot the tuning result itself: `pred_tr` at this point belongs to a model
# fitted earlier, not to any of the KNN models.
plt.plot(neighbor_counts, knn_mae, color='r', marker='o')
plt.xlabel("n_neighbors")
plt.ylabel("Test MAE")
plt.show()

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
dtr=DecisionTreeRegressor(max_depth=10,random_state=10)
Model=dtr.fit(xtrain,ytrain)
pred_tr=Model.predict(xtrain)
pred_ts=Model.predict(xtest)

from sklearn.metrics import mean_absolute_error,accuracy_score,r2_score,mean_squared_error
print('MAE:', mean_absolute_error(ytest, pred_ts))
print('MSE:', mean_squared_error(ytest, pred_ts))
print('RMSE:', np.sqrt(mean_squared_error(ytest, pred_ts)))
print('R2', r2_score(ytest,pred_ts))


# Predicted against actual trip duration on the test set; the red diagonal
# marks perfect predictions.
actual = np.ravel(ytest)
predicted = np.ravel(pred_ts)
plt.scatter(actual, predicted, s=2, alpha=0.3, color='b')
bounds = [actual.min(), actual.max()]
plt.plot(bounds, bounds, color='r')
plt.xlabel("Actual trip_duration")
plt.ylabel("Predicted trip_duration")
plt.show()

In [None]:
# pip install graphviz

In [None]:
# plot_tree draws with matplotlib, so the graphviz star import is not needed.
from sklearn.tree import DecisionTreeClassifier, plot_tree
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
# dtr is a regressor, so there are no class names to show. The feature names
# are passed as a plain list and the tree is drawn on the axes created above.
plot_tree(dtr,
               feature_names = list(xtrain.columns),
               filled = True,
               ax = axes);
fig.savefig('imagename.png')

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,accuracy_score,r2_score,mean_squared_error
rf=RandomForestRegressor(n_estimators=20,random_state=21)
# ytrain is a one-column frame; flatten it to the 1-D target a single-output
# forest is fitted on.
Model=rf.fit(xtrain,np.ravel(ytrain))
# One pass over the training set is enough; `pred` is kept as an alias because
# it used to be assigned by a second, identical predict call.
pred_tr=Model.predict(xtrain)
pred=pred_tr
pred_ts=Model.predict(xtest)
print('MAE:', mean_absolute_error(ytest, pred_ts))
print('MSE:', mean_squared_error(ytest, pred_ts))
print('RMSE:', np.sqrt(mean_squared_error(ytest, pred_ts)))
print('R2', r2_score(ytest,pred_ts))


We have MAE as 2.488 and RMSE as 3.799.R2 score is 0.7472 which is close to 1. Thus we can say that the model is predicting with minimal errors and good accuracy.

# Adaboost Regressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error,accuracy_score,r2_score,mean_squared_error
adb=AdaBoostRegressor(DecisionTreeRegressor(max_depth=20),n_estimators=500,learning_rate=0.1,random_state=21)
model=adb.fit(xtrain,ytrain)
# Predict with the AdaBoost model fitted on the line above; `Model` (capital
# M) is a different, previously fitted estimator.
pred_tr=model.predict(xtrain)
pred_ts=model.predict(xtest)
print('MAE:', mean_absolute_error(ytest, pred_ts))
print('MSE:', mean_squared_error(ytest, pred_ts))
print('RMSE:', np.sqrt(mean_squared_error(ytest, pred_ts)))
print('R2', r2_score(ytest,pred_ts))


In [None]:
pred_tr

# XGboost

In [None]:
#pip install xgboost

In [None]:
from xgboost import XGBRegressor #XGboostgressor
xgbr=XGBRegressor()
model=xgbr.fit(xtrain,ytrain)


In [None]:
# Predict with the fitted XGBoost regressor itself; `Model` (capital M) is a
# different, previously fitted estimator.
pred_tr1=xgbr.predict(xtrain)
pred_ts1=xgbr.predict(xtest)
from sklearn.metrics import mean_absolute_error,accuracy_score,r2_score,mean_squared_error
print('MAE:', mean_absolute_error(ytest, pred_ts1))
print('MSE:', mean_squared_error(ytest, pred_ts1))
print('RMSE:', np.sqrt(mean_squared_error(ytest, pred_ts1)))
print('R2', r2_score(ytest,pred_ts1))



# BayesianRidge

In [None]:
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_absolute_error,accuracy_score,r2_score,mean_squared_error
bysn=BayesianRidge()
model=bysn.fit(xtrain,ytrain)
# Predict with the Bayesian ridge model fitted on the line above; `Model`
# (capital M) is a different, previously fitted estimator.
pred_tr1=model.predict(xtrain)
pred_ts1=model.predict(xtest)
print('MAE:', mean_absolute_error(ytest, pred_ts1))
print('MSE:', mean_squared_error(ytest, pred_ts1))
print('RMSE:', np.sqrt(mean_squared_error(ytest, pred_ts1)))
print('R2', r2_score(ytest,pred_ts1))


In [None]:
ytest

In [None]:
ytest.value_counts()

In [None]:
pred_ts1

### create a model pickle file

In [None]:
# Save the fitted random forest (`rf`) to disk.
import pickle

# The context manager closes the file even if pickling fails part-way.
with open('R_model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

 Unique values columns

In [None]:
fdf=pd.read_csv("final_yellow_tripdata.csv")
fdf

In [None]:
pickup_longitude=pd.DataFrame(fdf.pickup_longitude.unique())
pickup_longitude.columns=["pickup_longitude"]
pickup_longitude.to_csv('final_taxi_data_pickup_longitude.csv')
pickup_longitude.head()

In [None]:
pickup_latitude=pd.DataFrame(fdf.pickup_latitude.unique())
pickup_latitude.columns=["pickup_latitude"]
pickup_latitude.to_csv('final_taxi_data_pickup_latitude.csv')
pickup_latitude.head()

In [None]:
# Read the values from fdf, the saved full dataset, like the pickup cells do;
# `df` is the outlier-treated subset used for modelling.
dropoff_longitude=pd.DataFrame(fdf.dropoff_longitude.unique())
dropoff_longitude.columns=["dropoff_longitude"]
dropoff_longitude.to_csv('final_taxi_data_dropoff_longitude.csv')
dropoff_longitude.head()

In [None]:
# Read the values from fdf, the saved full dataset, like the pickup cells do;
# `df` is the outlier-treated subset used for modelling.
dropoff_latitude=pd.DataFrame(fdf.dropoff_latitude.unique())
dropoff_latitude.columns=["dropoff_latitude"]
dropoff_latitude.to_csv('final_taxi_data_dropoff_latitude.csv')
dropoff_latitude.head()

In [None]:
# Collect the distinct drop-off times of day, truncated to whole minutes, and
# save them. data1 is the source because `data` was rebound to a list of
# pickup coordinates in the map section and is no longer a DataFrame.
C=pd.DataFrame(pd.to_datetime(data1['tpep_dropoff_datetime']).dropna().dt.time)
C['tpep_dropoff_datetime']=C['tpep_dropoff_datetime'].apply(lambda t: t.replace(second=0))

# Sorted, de-duplicated list of times with a fresh 0..n-1 index.
D=( C.sort_values('tpep_dropoff_datetime')
    .drop_duplicates('tpep_dropoff_datetime',keep = 'first')
    .reset_index(drop=True) )

D.to_csv("dk_time.csv")
D