In [1]:

#!pip uninstall numpy -y     # uninstall existing numpy
#!pip install "numpy<2.0"

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.feature_extraction import DictVectorizer
import pickle

In [2]:
def read_dataframe(filename):
    """Load a trip file and return the columns used for modelling.

    The file is read with pandas according to its extension, a ``duration``
    column (drop-off minus pick-up, in minutes) is derived, rows whose
    duration is not in the interval (0, 60] are discarded, and the pick-up
    hour and day of week are added as features.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file containing at least the
        columns ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``trip_distance``, ``PULocationID``, ``DOLocationID`` and
        ``total_amount``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``trip_distance``, ``pickup_hour``,
        ``pickup_day``, ``PULocationID``, ``DOLocationID``,
        ``total_amount``, ``duration``.

    Raises
    ------
    ValueError
        If ``filename`` does not end with ``.csv`` or ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Without this branch `df` would be unbound and the failure would
        # surface later as a confusing UnboundLocalError.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

    # Trip duration in minutes.
    df['duration'] = (df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']).dt.total_seconds() / 60.0

    # Keep trips lasting more than 0 and at most 60 minutes. The explicit
    # copy makes the column assignments below write to an independent frame
    # instead of a slice of the original (avoids SettingWithCopyWarning).
    df = df[(df.duration > 0) & (df.duration <= 60)].copy()

    df['pickup_hour'] = df['lpep_pickup_datetime'].dt.hour
    df['pickup_day'] = df['lpep_pickup_datetime'].dt.dayofweek

    # `duration` is kept last; it is the target the later cells split off.
    features = ['trip_distance', 'pickup_hour', 'pickup_day', 'PULocationID', 'DOLocationID', 'total_amount', 'duration']

    return df[features]
    
    

In [6]:
# Build the training and validation frames with read_dataframe; going by the
# file names these are the 2021-01 and 2021-02 green trip files.
df_train = read_dataframe(filename='green_tripdata_2021-01.csv')
df_val = read_dataframe(filename='green_tripdata_2021-02.csv')

  df = pd.read_csv(filename)


In [10]:
# Split features from the target. The target is removed by name rather than
# by position so a change in column order can never leak `duration` into
# the feature matrix.
X_train = df_train.drop(columns=['duration'])
y_train = df_train['duration']

X_val = df_val.drop(columns=['duration'])
y_val = df_val['duration']

In [11]:
# Display the training target (trip duration in minutes) as a sanity check.
y_train

0         3.933333
1         8.750000
2         5.966667
3         7.083333
4         0.066667
           ...    
76513    38.000000
76514    38.000000
76515    11.000000
76516    27.000000
76517    10.000000
Name: duration, Length: 75290, dtype: float64

In [14]:
### training data

# Convert the DataFrame to a list of dictionaries (one dict per row)
X_dict = X_train.to_dict(orient='records')

# Initialize the DictVectorizer
dict_vectorizer = DictVectorizer(sparse=False)

# Fit on the training records only, then transform them
X_encoded_dict = dict_vectorizer.fit_transform(X_dict)


### Validation data

X_val_dict = X_val.to_dict(orient='records')

# `dv` is an alias of the fitted vectorizer, so any cell that uses `dv`
# gets a fitted object rather than a fresh, unfitted DictVectorizer.
dv = dict_vectorizer

# Only transform here: refitting on validation data would let the validation
# set define the feature space and could misalign columns with the model.
X_val_encoded_dict = dv.transform(X_val_dict)



In [17]:
# Initialize and train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_encoded_dict, y_train)

# Predict on the validation features
y_pred_lr = lr_model.predict(X_val_encoded_dict)

# Validation RMSE. Taking the square root explicitly avoids the `squared=False`
# argument of mean_squared_error, which is deprecated and no longer accepted
# by recent scikit-learn releases; the value is the same.
np.sqrt(mean_squared_error(y_val, y_pred_lr))



7.701738647933279

In [19]:
# Save the vectorizer together with the model. `dict_vectorizer` is the
# instance that was actually fitted; a vectorizer that was never fitted
# cannot transform new records after being loaded back.
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dict_vectorizer, lr_model), f_out)