In [1]:
import pandas as pd
import seaborn as sns 
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

ModuleNotFoundError: No module named 'pandas'

In [5]:
import numpy

In [11]:
def create_dataframe(filename: str) -> pd.DataFrame:
    """Read a trip-record parquet file and prepare it for modelling.

    Steps performed:
      1. read the parquet file with ``pd.read_parquet``;
      2. coerce ``tpep_pickup_datetime`` / ``tpep_dropoff_datetime`` to datetime;
      3. add a ``duration`` column (drop-off minus pick-up, in minutes);
      4. keep only rows whose duration lies in the closed range [1, 60] minutes;
      5. cast ``PULocationID`` and ``DOLocationID`` to ``str`` so they can be
         treated as categorical features downstream.

    Args:
        filename: path of the parquet file to load.

    Returns:
        A new, independent DataFrame holding the filtered rows, with the
        extra ``duration`` column and string-typed location ids.
    """
    # read dataframe
    df = pd.read_parquet(filename)

    # convert from str to datetime (a no-op when the columns already are datetimes)
    df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])
    df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])

    # calculate duration in minutes; the vectorised .dt accessor replaces a
    # per-row Python lambda
    trip_time = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = trip_time.dt.total_seconds() / 60

    # filter only in the range [1, 60] minutes (both bounds inclusive).
    # .copy() detaches the result from the unfiltered frame, so the column
    # assignment below writes to a real frame instead of a slice
    # (avoids SettingWithCopyWarning).
    df = df[df["duration"].between(1, 60)].copy()

    # convert categorical data to str type
    categorial = ["PULocationID", "DOLocationID"]
    df[categorial] = df[categorial].astype(str)

    return df

In [10]:
def preprocess(df: pd.DataFrame, dv: DictVectorizer, dv_fit: bool=False) -> tuple[object, DictVectorizer]:
    """Turn a prepared trip DataFrame into a feature matrix via ``dv``.

    The features are the pick-up/drop-off pair, encoded as the string
    ``"<PULocationID>|<DOLocationID>"`` under the key ``'PU|DO'``, and the
    numerical ``trip_distance``. The input frame is left untouched: the
    combined column is built on a temporary frame rather than written back
    into ``df``.

    Args:
        df: frame with ``PULocationID`` and ``DOLocationID`` columns that
            support ``+`` with a string (i.e. already string-typed) and a
            ``trip_distance`` column.
        dv: the vectorizer to use; it is fitted here when ``dv_fit`` is true.
        dv_fit: if true call ``dv.fit_transform`` (use for training data),
            otherwise call ``dv.transform`` with the already-fitted ``dv``.

    Returns:
        ``(X, dv)`` where ``X`` is whatever ``dv`` returns for the records
        (not a DataFrame) and ``dv`` is the same vectorizer object passed in.
    """
    categorical = ['PU|DO']
    numerical = ['trip_distance']

    # combine categorical data without mutating the caller's frame:
    # .assign() works on a copy of the selected columns
    pickup_dropoff = df['PULocationID'] + '|' + df['DOLocationID']
    features = df[numerical].assign(**{categorical[0]: pickup_dropoff})

    # vectorize the data (keys ordered categorical first, then numerical)
    dicts = features[categorical + numerical].to_dict(orient='records')
    if dv_fit:
        X = dv.fit_transform(dicts)
    else:
        X = dv.transform(dicts)

    return X, dv

In [13]:
# build the two modelling frames: the first file feeds training,
# the second one is held out for evaluation
df_train, df_test = map(
    create_dataframe,
    (
        "../data/yellow_tripdata_2023-01.parquet",
        "../data/yellow_tripdata_2023-02.parquet",
    ),
)

In [16]:
# learn the feature vocabulary on the training frame only ...
X_train, dv = preprocess(df=df_train, dv=DictVectorizer(), dv_fit=True)
# ... then reuse the fitted vectorizer, unchanged, for the held-out frame
X_test, _ = preprocess(df=df_test, dv=dv)

In [17]:
# regression target: the column the model is trained to predict
target = 'duration'
y_train, y_test = df_train[target].values, df_test[target].values

In [18]:
# ordinary least-squares model on the vectorized training features
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [19]:
# NOTE: despite the "mse_" prefix (kept because the reporting cell reads these
# names), both values are *root* mean squared errors, i.e. in the same unit as
# the target. The square root is taken explicitly because the `squared=False`
# keyword of mean_squared_error is deprecated/removed in newer scikit-learn.

# rmse for train dataset
y_pred = lr.predict(X_train)
mse_train = numpy.sqrt(mean_squared_error(y_train, y_pred))

# rmse for test dataset
y_pred = lr.predict(X_test)
mse_test = numpy.sqrt(mean_squared_error(y_test, y_pred))

In [20]:
# the reported values are square roots of the MSE (RMSE), so label them as such
print("Root Mean Squared Error (Train):", mse_train)
print("Root Mean Squared Error (Test):", mse_test)

Mean Squared Error (Train): 5.133254051206866
Mean Squared Error (Test): 5.2474256957603975
