In [72]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

In [73]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [74]:
df = pd.read_parquet('/content/drive/MyDrive/mlops_course/fhv_tripdata_2021-01.parquet')

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154112 entries, 0 to 1154111
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1154112 non-null  object        
 1   pickup_datetime         1154112 non-null  datetime64[ns]
 2   dropOff_datetime        1154112 non-null  datetime64[ns]
 3   PUlocationID            195845 non-null   float64       
 4   DOlocationID            991892 non-null   float64       
 5   SR_Flag                 0 non-null        object        
 6   Affiliated_base_number  1153227 non-null  object        
dtypes: datetime64[ns](2), float64(2), object(3)
memory usage: 61.6+ MB


In [76]:
df.shape

(1154112, 7)

In [77]:
df['duration'] = df.dropOff_datetime - df.pickup_datetime

In [78]:
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

In [54]:
from statistics import mean
def Average(lst):
    return mean(lst)

In [79]:
average = Average(df.duration)
average

19.167224093791013

In [80]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

In [81]:
percent_missing

dispatching_base_num        0.000000
pickup_datetime             0.000000
dropOff_datetime            0.000000
PUlocationID               83.030676
DOlocationID               14.055828
SR_Flag                   100.000000
Affiliated_base_number      0.076682
duration                    0.000000
dtype: float64

In [82]:
missing_value_df

Unnamed: 0,column_name,percent_missing
dispatching_base_num,dispatching_base_num,0.0
pickup_datetime,pickup_datetime,0.0
dropOff_datetime,dropOff_datetime,0.0
PUlocationID,PUlocationID,83.030676
DOlocationID,DOlocationID,14.055828
SR_Flag,SR_Flag,100.0
Affiliated_base_number,Affiliated_base_number,0.076682
duration,duration,0.0


In [83]:
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667


In [84]:
object_list=pd.get_dummies(df["PUlocationID"], prefix="PUlocationID")

In [85]:
df_1=df.join(object_list)

In [86]:
df_1.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration,PUlocationID_1.0,PUlocationID_2.0,...,PUlocationID_256.0,PUlocationID_257.0,PUlocationID_258.0,PUlocationID_259.0,PUlocationID_260.0,PUlocationID_261.0,PUlocationID_262.0,PUlocationID_263.0,PUlocationID_264.0,PUlocationID_265.0
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333,0,0,...,0,0,0,0,0,0,0,0,0,0
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
object_list2=pd.get_dummies(df["DOlocationID"], prefix="DOlocationID")

In [88]:
df_2=df_1.join(object_list2)
df_2.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration,PUlocationID_1.0,PUlocationID_2.0,...,DOlocationID_256.0,DOlocationID_257.0,DOlocationID_258.0,DOlocationID_259.0,DOlocationID_260.0,DOlocationID_261.0,DOlocationID_262.0,DOlocationID_263.0,DOlocationID_264.0,DOlocationID_265.0
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333,0,0,...,0,0,0,0,0,0,0,0,0,0
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154112 entries, 0 to 1154111
Data columns (total 8 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1154112 non-null  object        
 1   pickup_datetime         1154112 non-null  datetime64[ns]
 2   dropOff_datetime        1154112 non-null  datetime64[ns]
 3   PUlocationID            195845 non-null   float64       
 4   DOlocationID            991892 non-null   float64       
 5   SR_Flag                 0 non-null        object        
 6   Affiliated_base_number  1153227 non-null  object        
 7   duration                1154112 non-null  float64       
dtypes: datetime64[ns](2), float64(3), object(3)
memory usage: 70.4+ MB


In [111]:
data=df[["PUlocationID","DOlocationID","duration"]]

In [112]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154112 entries, 0 to 1154111
Data columns (total 3 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   PUlocationID  195845 non-null   float64
 1   DOlocationID  991892 non-null   float64
 2   duration      1154112 non-null  float64
dtypes: float64(3)
memory usage: 26.4 MB


In [113]:
data.dropna(axis=0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [114]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184404 entries, 39 to 1154110
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   PUlocationID  184404 non-null  float64
 1   DOlocationID  184404 non-null  float64
 2   duration      184404 non-null  float64
dtypes: float64(3)
memory usage: 5.6 MB


In [115]:

y=data["duration"]
X=data[["PUlocationID","DOlocationID"]]

In [116]:
lr=LinearRegression().fit(X,y)
y_pred=lr.predict(X)
MSE=mean_squared_error(y,y_pred,squared=False)
MSE


43.12784346427104

In [117]:
import numpy as np

RMSE=np.sqrt(MSE)
RMSE

6.567179262382826