In [1]:
import pandas as pd
import sklearn
sklearn.__version__
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import pickle
import math
import mlflow

# Creating read dataset function
Accepts 4 parameters:
- filename
- min_lim
- max_lim
- threshold

If threshold is setup to true, min_lim and max_lim allows to filter the dataset based on minimun and maximum values of duration column.

In [40]:
def read_dataset(filename,min_lim=0,max_lim=0,threshold=False):
    df=pd.read_parquet(filename)
    df.tpep_dropoff_datetime=pd.to_datetime(df.tpep_dropoff_datetime)
    df.tpep_pickup_datetime=pd.to_datetime(df.tpep_pickup_datetime)
    df['duration']=df.tpep_dropoff_datetime-df.tpep_pickup_datetime
    df['duration'] = df['duration'].dt.total_seconds().div(60).astype(float)
    if threshold==False:
        pass
    else:
        df=df[(df.duration>=min_lim)&(df.duration<=max_lim)]
    
    categorical=['PULocationID','DOLocationID']
    numerical=['trip_distance']
    df[categorical]=df[categorical].astype(str)
    return df

# Reading datasets and comparing before and after filtering by duration
we have 98.1220282212598 % left of the original dataset rows after filtering by duration >1 && duration<60 minutes

In [44]:
df_train_1_60=read_dataset(
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
        threshold=True,
        min_lim=1,
        max_lim=60)
df_train=read_dataset(
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
        threshold=False,
        min_lim=1,
        max_lim=60)
df_test=read_dataset(
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
        threshold=False,
        min_lim=1,
        max_lim=60)
df_test_1_60=read_dataset(
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
        threshold=True,
        min_lim=1,
        max_lim=60)
# df_train_1_60=read_dataset_2('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')
# df_test_1_60=read_dataset_2(filename='https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')


In [4]:
len(df_train_1_60)/len(df_train)*100,len(df_test_1_60)/len(df_test)*100


(98.1220282212598, 98.00944077722545)

In [5]:
categorical=['PULocationID','DOLocationID']
numerical=['trip_distance']
target='duration'
dv=DictVectorizer()
train_dicts=df_train_1_60[categorical+numerical].to_dict(orient='records')
X_train=dv.fit_transform(train_dicts)

In [6]:
test_dicts=df_test_1_60[categorical+numerical].to_dict(orient='records')
X_test=dv.transform(test_dicts)

# Counting the feature_names obtained with dictionary vectorizer
we have 516 columns after applying one hot encoding

In [46]:
len(dv.feature_names_)


516

In [10]:
target='duration'
y_train=df_train_1_60[target].values
y_test=df_test_1_60[target].values


# Training model and validating against train and test set
we obtain a MSE of 7.658396474418433 against train set vs 7.820267421957621 against test set

In [26]:
lr=LinearRegression()
lr.fit(X_train,y_train)
y_pred=lr.predict(X_train)
mean_squared_error(y_train,y_pred,squared=False)

7.658396474418433

In [27]:
y_pred=lr.predict(X_test)
mean_squared_error(y_test,y_pred,squared=False)

7.820267421957621

# Saving Model in .bin

In [12]:
with open ('models/lin_reg.bin','wb') as f_out:
    pickle.dump((dv,lr),f_out)

FileNotFoundError: [Errno 2] No such file or directory: 'models/lin_reg.bin'

# Ammount of columns : 20 

In [13]:
len(df_train.columns)

20

# Standard deviation of duration col : 42.594351241920904

In [14]:
df_train['duration'].std()

42.594351241920904

# Ammount of features after one hot encoding: 516

In [16]:
len(dv.feature_names_)

516

In [17]:
dv.feature_names_

['DOLocationID=1',
 'DOLocationID=10',
 'DOLocationID=100',
 'DOLocationID=101',
 'DOLocationID=102',
 'DOLocationID=106',
 'DOLocationID=107',
 'DOLocationID=108',
 'DOLocationID=109',
 'DOLocationID=11',
 'DOLocationID=111',
 'DOLocationID=112',
 'DOLocationID=113',
 'DOLocationID=114',
 'DOLocationID=115',
 'DOLocationID=116',
 'DOLocationID=117',
 'DOLocationID=118',
 'DOLocationID=119',
 'DOLocationID=12',
 'DOLocationID=120',
 'DOLocationID=121',
 'DOLocationID=122',
 'DOLocationID=123',
 'DOLocationID=124',
 'DOLocationID=125',
 'DOLocationID=126',
 'DOLocationID=127',
 'DOLocationID=128',
 'DOLocationID=129',
 'DOLocationID=13',
 'DOLocationID=130',
 'DOLocationID=131',
 'DOLocationID=132',
 'DOLocationID=133',
 'DOLocationID=134',
 'DOLocationID=135',
 'DOLocationID=136',
 'DOLocationID=137',
 'DOLocationID=138',
 'DOLocationID=139',
 'DOLocationID=14',
 'DOLocationID=140',
 'DOLocationID=141',
 'DOLocationID=142',
 'DOLocationID=143',
 'DOLocationID=144',
 'DOLocationID=145',

In [None]:
df_train['duration'].std()