# Import libraries and data

In [1]:
import pickle

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRFRegressor

from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error

import numpy as np
import pandas as pd
import plotly.express as px
from pandas_profiling import ProfileReport

  from pandas_profiling import ProfileReport


import the 60000 rows from data

In [2]:
data=pd.read_parquet('yellow_tripdata_2023-01.parquet')
data=data.iloc[0:60_000]

# Pre-processing

In [3]:
data.columns=[column.lower().replace(' ','_') for column in data.columns]


# Eda

construct the target variable wich consists on duration of the trip

In [4]:
data['duration']=data.tpep_dropoff_datetime-data.tpep_pickup_datetime
data['duration']=data['duration'].apply(lambda t: t.total_seconds()/60)

the histogram of the duration variable

In [5]:
px.histogram(np.log1p(data.duration))

we can find that we do not have na values in any variable

In [6]:
data.isnull().sum()

vendorid                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
ratecodeid               0
store_and_fwd_flag       0
pulocationid             0
dolocationid             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
airport_fee              0
duration                 0
dtype: int64

In [26]:
profile=ProfileReport(data)
profile.to_file('report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Modeling

## split the data

In [7]:
del data['tpep_pickup_datetime']
del data['tpep_dropoff_datetime']
train,test=train_test_split(data,random_state=123,test_size=.8)
x_train=train.iloc[:,:-1]
x_train=x_train.to_dict(orient='records')
y_train=train.iloc[:,-1].values

x_test=test.iloc[:,:-1]
x_test=x_test.to_dict(orient='records')
y_test=test.iloc[:,-1].values



In [8]:
dv=DictVectorizer(sparse=False)
x_train=dv.fit_transform(x_train)
x_test=dv.transform(x_test)

### Linear regression

In [9]:
lr=LinearRegression()
lr.fit(x_train,y_train)
predict=lr.predict(x_test)
print(f'MAE:{mean_absolute_error(y_test,predict)}')


MAE:7.8481789838313505


### Decision tree

In [10]:
dt = DecisionTreeRegressor()
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)
print(f'MAE:{mean_absolute_error(y_test,y_pred)}')

MAE:8.160279861111112


In [11]:
scores = []

for m in [1, 2, 3, 4, 5, 6, 10, 15, 20, None]:
    #print('depth: %s' % m)
    for s in [1, 5, 10, 15, 20, 50, 100, 200]:
        dt = DecisionTreeRegressor(max_depth=m, min_samples_leaf=s)
        dt.fit(x_train, y_train)
        y_pred = dt.predict(x_test)
        score = mean_absolute_error(y_test, y_pred)
        
        scores.append((m, s, score))
        
columns = ['max_depth', 'min_sample_leaves', 'mse']
df_scores = pd.DataFrame(scores, columns=columns)
df_scores = df_scores.sort_values(by='mse', ascending=True)
df_scores

Unnamed: 0,max_depth,min_sample_leaves,mse
61,15.0,50,6.704009
69,20.0,50,6.704009
77,,50,6.704182
57,15.0,5,6.741458
65,20.0,5,6.769328
...,...,...,...
3,1.0,15,8.597999
9,2.0,5,8.631401
8,2.0,1,8.644103
1,1.0,5,11.436158


### xgboost

In [12]:
scores = []

for m in np.arange(0,1,.01):
        xgb = XGBRFRegressor(learning_rate=m)
        xgb.fit(x_train, y_train)
        y_pred = dt.predict(x_test)
        score = mean_absolute_error(y_test, y_pred)
        
        scores.append((m, score))
        
columns = ['learning_rate', 'mse']
df_scores = pd.DataFrame(scores, columns=columns)
df_scores = df_scores.sort_values(by='mse', ascending=True)
df_scores

Unnamed: 0,learning_rate,mse
0,0.00,6.829218
72,0.72,6.829218
71,0.71,6.829218
70,0.70,6.829218
69,0.69,6.829218
...,...,...
28,0.28,6.829218
27,0.27,6.829218
26,0.26,6.829218
36,0.36,6.829218


it turns out that the best model is decision tree with the 	20.0max_depth and 50 min_sample_leaves

## Best model

In [13]:
dt= DecisionTreeRegressor(max_depth=20, min_samples_leaf=50)
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)
mean_absolute_error(y_test, y_pred)

6.703960746115121

# Saving the model 

using pickle to deciralize the model and the dictvectorizer

In [14]:
with open("model.bin","wb") as model :
    pickle.dump(dt,model)
    
with open("dv.bin","wb") as dvv :
    pickle.dump(dv,dvv)

In [24]:
list(x_test[88])


[1.25,
 2.5,
 79.0,
 5.0,
 40.1,
 1.0,
 0.5,
 1.0,
 1.0,
 264.0,
 1.0,
 1.0,
 0.0,
 9.82,
 0.0,
 60.17,
 9.98,
 2.0]