In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [5]:
# Load the January 2022 yellow-taxi trip records.
df = pd.read_parquet('/content/drive/MyDrive/duration/data/yellow_tripdata_2022-01.parquet')

print('number of columns:', df.shape[1])

# Trip duration in minutes. The vectorised `.dt` accessor replaces a per-row
# Python lambda, which is much slower on a large frame.
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

print('duration in a min', df['duration'].describe()['std'])

# Build the 1-60 minute mask once; it drives both the report and the filter.
in_range = (df['duration'] >= 1) & (df['duration'] <= 60)

num_records = int(in_range.sum())
fraction = num_records / df.shape[0]
percentage = fraction * 100
print('percentage of our data under the given range', percentage)

# `.copy()` detaches the filtered frame from the unfiltered one, so the column
# assignment below writes to a real frame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
df = df[in_range].copy()

categorical = ['PULocationID', 'DOLocationID']

# Cast the location IDs to strings so they are handled as categories rather
# than as numeric quantities by the DictVectorizer used for the features.
df[categorical] = df[categorical].astype(str)

number of columns: 19
duration in a min 46.44530513776802
percentage of our data under the given range 98.27547930522405


In [7]:
# One feature dict per trip, holding only the categorical location columns.
train_dicts = df[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

print('number of columns',X_train.shape[1])

target = 'duration'
y_train = df[target].values

lr = LinearRegression()
model = lr.fit(X_train, y_train)

y_pred = model.predict(X_train)

# RMSE computed as sqrt(MSE). The `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so taking the root explicitly keeps
# this cell working on both old and new versions.
rmse = mean_squared_error(y_train, y_pred) ** 0.5
print('root_mean_square on training ',rmse)

number of columns 515
root_mean_square on training  6.986190742248472


# Validation: train on January 2022 trips, evaluate on February 2022 trips

In [8]:
def read_dataframe(filename):
  """Load a yellow-taxi trip parquet file and prepare it for modelling.

  Steps:
    1. Read the parquet file and coerce the pickup/dropoff columns to datetimes.
    2. Add a ``duration`` column: dropoff minus pickup, in minutes.
    3. Keep only trips whose duration is between 1 and 60 minutes (inclusive).
    4. Cast ``PULocationID`` and ``DOLocationID`` to strings.

  Args:
    filename: Path of the parquet file, passed straight to ``pd.read_parquet``.

  Returns:
    A new DataFrame with the filtered rows, the added ``duration`` column and
    string-typed location ID columns.
  """
  df = pd.read_parquet(filename)

  df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
  df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])

  # Vectorised conversion to minutes; avoids a Python-level call per row.
  trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
  df['duration'] = trip_time.dt.total_seconds() / 60

  # `.copy()` so the assignment below targets an independent frame rather than
  # a slice of the unfiltered one (avoids pandas' SettingWithCopyWarning).
  df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

  categorical = ['PULocationID', 'DOLocationID']
  df[categorical] = df[categorical].astype(str)

  return df

In [12]:
# January 2022 is the training month, February 2022 the validation month.
train_path = '/content/drive/MyDrive/duration/data/yellow_tripdata_2022-01.parquet'
val_path = '/content/drive/MyDrive/duration/data/yellow_tripdata_2022-02.parquet'

df_train = read_dataframe(train_path)
df_val = read_dataframe(val_path)

In [13]:
# Row counts of the prepared training and validation frames.
df_train.shape[0], df_val.shape[0]

(2421440, 2918187)

In [15]:
categorical = ['PULocationID', 'DOLocationID']

# One feature dict per trip for each split.
train_dicts = df_train[categorical].to_dict(orient='records')
val_dicts = df_val[categorical].to_dict(orient='records')

# Fit the vectorizer on the training split only, then reuse the fitted
# instance to encode the validation split.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [21]:
target = 'duration'

# Fit the model here, on the matrix produced by the same DictVectorizer that
# encoded X_val. Relying on a `model` left over from an earlier cell (fitted on
# features from a different vectorizer instance) only works while both happen
# to see identical data, and breaks silently if either cell is changed.
y_train = df_train[target].values
model = LinearRegression().fit(X_train, y_train)

y_val = df_val[target].values
y_pred_val = model.predict(X_val)

# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6.
rmse = mean_squared_error(y_val, y_pred_val) ** 0.5
print('root mean squared error for validation data' , rmse)

root mean squared error for validation data 7.78640662117552
