# mlops-zoomcamp 01-intro

In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [3]:
def read_trip_data(month: str, converters: dict, year: int = 2023) -> pd.DataFrame:
    """Download one month of ``yellow_tripdata`` records as a DataFrame.

    Args:
        month: Month exactly as it appears in the remote file name,
            e.g. ``"01"`` for January.
        converters: Mapping of column name to dtype; applied to the loaded
            frame with ``DataFrame.astype``.
        year: Year part of the remote file name. Defaults to 2023, the value
            that used to be hard-coded in the URL, so existing callers are
            unaffected.

    Returns:
        The contents of the parquet file with ``converters`` applied.
    """
    data_url = (
        "https://d37ci6vzurychx.cloudfront.net/trip-data/"
        f"yellow_tripdata_{year}-{month}.parquet"
    )
    return pd.read_parquet(data_url).astype(converters)

In [4]:
MINUTE_IN_SEC = 60

def compute_duration(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with a ``duration`` column expressed in minutes.

    The duration is ``tpep_dropoff_datetime - tpep_pickup_datetime``,
    converted to seconds and divided by ``MINUTE_IN_SEC``.

    Args:
        df: Frame holding the ``tpep_pickup_datetime`` and
            ``tpep_dropoff_datetime`` datetime columns.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    """
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    # ``assign`` builds a new frame instead of writing the column into the
    # caller's object, so the raw data stays reusable after preprocessing.
    return df.assign(duration=elapsed.dt.total_seconds().div(MINUTE_IN_SEC))

In [5]:
MIN_DURATION_MIN = 1
MAX_DURATION_MIN = 60

def drop_outlayers(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows whose ``duration`` lies in the accepted range.

    Both bounds (``MIN_DURATION_MIN`` and ``MAX_DURATION_MIN``) are inclusive.
    """
    # ``between`` is inclusive on both ends by default.
    within_bounds = df['duration'].between(MIN_DURATION_MIN, MAX_DURATION_MIN)
    return df.loc[within_bounds]

In [6]:
def preprocess(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """Add the trip duration and discard rows outside the accepted range.

    Args:
        df: Raw trip records, as expected by ``compute_duration``.
        verbose: When true, print the standard deviation of the duration
            (before filtering) and the percentage of rows that survive the
            filter.

    Returns:
        The frame returned by ``drop_outlayers``.
    """
    df = compute_duration(df)

    if verbose:
        print(f"Q2 - The standard deviation of the trips duration is {df['duration'].std():.2f} .")

    before_drop = len(df)
    df = drop_outlayers(df)
    after_drop = len(df)

    if verbose:
        # An empty input has no meaningful ratio; report NaN rather than
        # failing with ZeroDivisionError.
        kept_pct = (after_drop / before_drop) * 100 if before_drop else float("nan")
        print(f"Q3 - {kept_pct:.2f} % of the records left after droping the outliers.")

    return df

In [7]:
# Column the model is trained to predict.
target_column = 'duration'

# Model inputs (presumably the pickup / drop-off location IDs - confirm
# against the data dictionary).
feature_columns = ['PULocationID', 'DOLocationID']

# dtype mapping that casts every feature column to ``str``.
feature_columns_converter = dict.fromkeys(feature_columns, str)

In [8]:
# Download the raw January and February frames; the feature columns are cast
# on load according to feature_columns_converter.
df_january_raw = read_trip_data(month="01", converters=feature_columns_converter)
df_february_raw = read_trip_data(month="02", converters=feature_columns_converter)

# Report the column count of the January frame as loaded.
print(f"Q1 - There are {len(df_january_raw.columns)} columns for January data.")

Q1 - There are 19 columns for January data.


In [9]:
# January is used as the training split and February as the validation split.
# The Q2/Q3 summary lines are printed for the training data only.
df_train = preprocess(df_january_raw, verbose=True)
df_val = preprocess(df_february_raw, verbose=False)

Q2 - The standard deviation of the trips duration is 42.59 .
Q3 - 98.12 % of the records left after droping the outliers.


In [10]:
# A single vectorizer serves both splits: it is fitted on the training rows
# and only applied (never re-fitted) to the validation rows.
dv = DictVectorizer()

# train
y_train = df_train[target_column].values
train_dicts = df_train[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

print(f"Q4 - The dimensionality of this matrix is {X_train.shape} with {X_train.shape[1]} number of columns.")

# validation
y_val = df_val[target_column].values
val_dicts = df_val[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)

Q4 - The dimensionality of this matrix is (3009173, 515) with 515 number of columns.


In [11]:
# ``fit`` returns the estimator itself, so construction and training chain.
lr = LinearRegression().fit(X_train, y_train)

# Predict on both splits with the same fitted model.
y_train_pred = lr.predict(X_train)
y_val_pred = lr.predict(X_val)

# Score each split against its own ground truth.
rmse_train = root_mean_squared_error(y_train, y_train_pred)
rmse_val = root_mean_squared_error(y_val, y_val_pred)

print(f"Q5 - The RMSE of a training set: {rmse_train:.5f}.")
print(f"Q6 - The RMSE of a validation set: {rmse_val:.5f}.")

Q5 - The RMSE of a training set: 7.64926.
Q6 - The RMSE of a validation set: 7.81182.
