# mlops-zoomcamp
### 01-intro

In [93]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

#### Q1. Downloading the data

In [75]:
df_january = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")

In [76]:
df_january.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [77]:
df_february = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")

In [78]:
df_february.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.3,1.0,N,142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
1,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0
2,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0
3,1,2023-02-01 00:29:33,2023-02-01 01:01:38,0.0,18.8,1.0,N,132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25
4,2,2023-02-01 00:12:28,2023-02-01 00:25:46,1.0,3.22,1.0,N,161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0


In [79]:
print(f"There are {len(df_january.columns)} columns in January")

There are 19 columns in January


#### Q2. Computing duration

In [80]:
df_january['duration_date_time'] = (df_january['tpep_dropoff_datetime'] - df_january['tpep_pickup_datetime'])
df_january['duration_minutes'] = df_january['duration_date_time'].dt.total_seconds().div(60)

In [81]:
duration_std = df_january['duration_minutes'].std()
print(f"The standard deviation of the trips duration in January is {duration_std:.2f}")

The standard deviation of the trips duration in January is 42.59


#### Q3. Dropping outliers

In [82]:
MIN_DURATION_MIN = 1
MAX_DURATION_MIN = 60

df_january_no_outlayers = df_january.loc[
     (df_january['duration_minutes'] >= MIN_DURATION_MIN)
     & 
     (df_january['duration_minutes'] <= MAX_DURATION_MIN)
]

In [83]:
trips_count_all = len(df_january)
trips_count_without_outlayers = len(df_january_no_outlayers)

fraction_left = (trips_count_without_outlayers / trips_count_all) * 100
print(f"{fraction_left:.2f} fraction of the records left after droping the outliers.")

98.12 fraction of the records left after droping the outliers.


#### Q4. One-hot encoding

In [84]:
feature_columns = ['PULocationID',	'DOLocationID']

df_january_no_outlayers[feature_columns] = df_january_no_outlayers[feature_columns].astype(str)
df_january_no_outlayers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_january_no_outlayers[feature_columns] = df_january_no_outlayers[feature_columns].astype(str)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration_date_time,duration_minutes
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,...,1.00,0.5,0.00,0.0,1.0,14.30,2.5,0.00,0 days 00:08:26,8.433333
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.10,1.0,N,43,237,1,...,1.00,0.5,4.00,0.0,1.0,16.90,2.5,0.00,0 days 00:06:19,6.316667
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,...,1.00,0.5,15.00,0.0,1.0,34.90,2.5,0.00,0 days 00:12:45,12.750000
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.90,1.0,N,138,7,1,...,7.25,0.5,0.00,0.0,1.0,20.85,0.0,1.25,0 days 00:09:37,9.616667
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,...,1.00,0.5,3.28,0.0,1.0,19.68,2.5,0.00,0 days 00:10:50,10.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3066761,2,2023-01-31 23:58:34,2023-02-01 00:12:33,,3.05,,,107,48,0,...,0.00,0.5,3.96,0.0,1.0,23.76,,,0 days 00:13:59,13.983333
3066762,2,2023-01-31 23:31:09,2023-01-31 23:50:36,,5.80,,,112,75,0,...,0.00,0.5,2.64,0.0,1.0,29.07,,,0 days 00:19:27,19.450000
3066763,2,2023-01-31 23:01:05,2023-01-31 23:25:36,,4.67,,,114,239,0,...,0.00,0.5,5.32,0.0,1.0,26.93,,,0 days 00:24:31,24.516667
3066764,2,2023-01-31 23:40:00,2023-01-31 23:53:00,,3.15,,,230,79,0,...,0.00,0.5,4.43,0.0,1.0,26.58,,,0 days 00:13:00,13.000000


In [85]:
train_dicts = df_january_no_outlayers[feature_columns].to_dict(orient='records')
train_dicts[:5]

[{'PULocationID': '161', 'DOLocationID': '141'},
 {'PULocationID': '43', 'DOLocationID': '237'},
 {'PULocationID': '48', 'DOLocationID': '238'},
 {'PULocationID': '138', 'DOLocationID': '7'},
 {'PULocationID': '107', 'DOLocationID': '79'}]

In [86]:
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [87]:
print(f"The dimensionality of this matrix is {X_train.shape}  with {X_train.shape[1]} number of columns.")


The dimensionality of this matrix is (3009173, 515)  with 515 number of columns.


In [88]:
target = 'duration_minutes'
y_train = df_january_no_outlayers[target].values
y_train

array([ 8.43333333,  6.31666667, 12.75      , ..., 24.51666667,
       13.        , 14.4       ])

In [92]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [94]:
y_pred = lr.predict(X_train)

rmse = root_mean_squared_error(y_train, y_pred)
rmse

7.6492619241381785