In [1]:
import os
import pandas as pd
import numpy as np
from joblib import dump
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from test_model import df_to_X_y, score_predictions

# Данные

## Загрузка

In [2]:
if not os.path.exists('data/raw'):
    print("Downloading data.")
    !kaggle competitions download -p data/raw -c nyc-taxi-trip-duration
    !cd data/raw &&\
    unzip nyc-taxi-trip-duration.zip &&\
    unzip test.zip && unzip train.zip &&\
    rm nyc-taxi-trip-duration.zip sample_submission.zip test.zip train.zip
    !mkdir data/processed models

Downloading data.
Downloading nyc-taxi-trip-duration.zip to data/raw
 99%|█████████████████████████████████████▋| 85.0M/85.8M [00:10<00:00, 10.5MB/s]
100%|██████████████████████████████████████| 85.8M/85.8M [00:10<00:00, 8.89MB/s]
Archive:  nyc-taxi-trip-duration.zip
  inflating: sample_submission.zip   
  inflating: test.zip                
  inflating: train.zip               
Archive:  test.zip
  inflating: test.csv                
Archive:  train.zip
  inflating: train.csv               


In [3]:
# Load the raw training set extracted by the download cell and preview its first rows.
df = pd.read_csv('data/raw/train.csv')
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [4]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


## Наборы данных

In [5]:
# Shuffle once with a fixed seed so the four subsets are reproducible.
df_shuffled = shuffle(df, random_state=42)

# Split positional indices instead of handing the DataFrame to np.array_split:
# the latter goes through the deprecated DataFrame.swapaxes path and emits a
# warning. The resulting partition (sizes and row order) is the same.
# .copy() gives every subset its own data, so mutating df_noise later cannot
# touch df_shuffled or trigger SettingWithCopyWarning.
split_positions = np.array_split(np.arange(len(df_shuffled)), 4)
df1, df2, df3, df_noise = (df_shuffled.iloc[pos].copy() for pos in split_positions)
df1.shape, df2.shape, df3.shape, df_noise.shape

  return bound(*args, **kwds)


((364661, 11), (364661, 11), (364661, 11), (364661, 11))

## "Зашумленный" набор данных

In [6]:
# Names of the four coordinate columns: pickup first, then dropoff,
# longitude before latitude within each pair.
coord_cols = [
    f'{endpoint}_{axis}'
    for endpoint in ('pickup', 'dropoff')
    for axis in ('longitude', 'latitude')
]

In [7]:
# Coordinate columns of the fourth subset before any noise is added.
df_noise[coord_cols]

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
622538,-73.987762,40.732353,-73.981674,40.690212
198550,-73.969460,40.757278,-73.959099,40.769081
631248,-73.983398,40.749378,-73.987289,40.759529
753623,-73.977699,40.748699,-73.861633,40.768219
1429325,-73.993401,40.762383,-73.980843,40.760277
...,...,...,...,...
259178,-73.955032,40.777328,-74.006203,40.749424
1414414,-73.973618,40.763920,-73.983849,40.749874
131932,-74.006195,40.734283,-73.949608,40.785282
671155,-73.962341,40.767323,-73.969757,40.768669


In [8]:
# Noise amplitude as a multiple of each column's own standard deviation.
noise_scale = 1

# Dedicated seeded generator: the noisy dataset (and every score computed on
# it) is now reproducible across kernel restarts instead of depending on the
# global NumPy RNG state.
noise_rng = np.random.default_rng(42)

# NOTE: this cell modifies df_noise in place, so running it twice stacks a
# second layer of noise; re-run the split cell first to start from clean data.
for col in coord_cols:
    std = df_noise[col].std()
    df_noise[col] += noise_scale * std * noise_rng.normal(0, 1, len(df_noise))

In [9]:
# Same coordinate columns after the in-place noise injection, for comparison.
df_noise[coord_cols]

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
622538,-73.927175,40.749647,-73.930878,40.662804
198550,-74.087609,40.728989,-73.856518,40.768779
631248,-74.030694,40.798671,-74.150025,40.709730
753623,-74.071769,40.728885,-73.860678,40.772276
1429325,-74.024237,40.789404,-73.925647,40.748661
...,...,...,...,...
259178,-73.949086,40.733806,-73.885819,40.763626
1414414,-74.057195,40.757560,-73.974243,40.726597
131932,-73.736061,40.742237,-73.950497,40.787885
671155,-73.865736,40.703123,-74.123399,40.765768


## Сохранение

In [10]:
# Write each subset to its CSV file without the (shuffled) index column.
csv_targets = (
    (df1, 'data/processed/df1.csv'),
    (df2, 'data/processed/df2.csv'),
    (df3, 'data/processed/df3.csv'),
    (df_noise, 'data/processed/df_noise.csv'),
)
for subset, csv_path in csv_targets:
    subset.to_csv(csv_path, index=False)

# Предобработка

In [11]:
# Preprocessing pipeline with a single min-max scaling step.
coord_pipe = Pipeline(steps=[('norm', MinMaxScaler())])

# df_to_X_y comes from test_model; presumably it returns the coordinate
# features and the target (the output shows four coordinate columns) — verify there.
X, y = df_to_X_y(df1)

# Fit on df1 and wrap the scaled values in a DataFrame labelled with the
# pipeline's output feature names, purely to inspect the result.
scaled_values = coord_pipe.fit_transform(X, y)
df1_transformed = pd.DataFrame(
    scaled_values,
    columns=coord_pipe.get_feature_names_out(),
)
df1_transformed

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,0.360615,0.692987,0.419353,0.701337
1,0.361186,0.696616,0.418542,0.700005
2,0.360343,0.694066,0.419853,0.699858
3,0.360243,0.692373,0.418568,0.699887
4,0.376118,0.681586,0.418864,0.701181
...,...,...,...,...
364656,0.360817,0.692325,0.419434,0.698278
364657,0.362999,0.698554,0.418311,0.697258
364658,0.359947,0.696273,0.419311,0.701226
364659,0.360645,0.695748,0.419176,0.702723


# Тренировка модели линейной регрессии

In [12]:
# Training data comes from the first subset only; df2/df3/df_noise stay held out.
X, y = df_to_X_y(df1)

# Full model: the coordinate preprocessing pipeline followed by ordinary
# least-squares regression.
linear_pipe = Pipeline(steps=[
    ('preprocessors', coord_pipe),
    ('model', LinearRegression()),
])
linear_pipe.fit(X, y)

# Persist the fitted pipeline; dump returns the list of written file names,
# which the notebook shows as the cell output.
dump(linear_pipe, 'models/linear_pipe.joblib')

['models/linear_pipe.joblib']

In [13]:
# Score the fitted pipeline on the held-out subset df2.
# The metric is presumably RMSLE (pytest compares it to THRESHOLD_RMSLE) — confirm in test_model.
score_predictions(df2, linear_pipe)

0.8497831001596202

In [14]:
# Same score on the second held-out subset df3.
score_predictions(df3, linear_pipe)

0.8544601681743956

In [15]:
# Same score on the subset whose coordinates were perturbed with noise.
score_predictions(df_noise, linear_pipe)

1.0222885997616282

# Тестирование модели

In [16]:
# Run the project's pytest suite (test_model.py), which reads the CSVs saved above.
!pytest

platform linux -- Python 3.9.19, pytest-8.2.0, pluggy-1.5.0
rootdir: /home/nomad/projects/urfu-mlops/lab5
plugins: hydra-core-1.3.2
collected 3 items                                                              [0m[1m

test_model.py [32m.[0m[32m.[0m[31mF[0m[31m                                                        [100%][0m

[31m[1m_____________________________ test_mse_on_df_noise _____________________________[0m

init_pipeline = None

    [0m[94mdef[39;49;00m [92mtest_mse_on_df_noise[39;49;00m(init_pipeline):[90m[39;49;00m
        df = pd.read_csv([33m'[39;49;00m[33mdata/processed/df_noise.csv[39;49;00m[33m'[39;49;00m)[90m[39;49;00m
>       [94massert[39;49;00m score_predictions(df, linear_pipe) < THRESHOLD_RMSLE[90m[39;49;00m
[1m[31mE       AssertionError: assert 1.0222885997616293 < 0.9[0m
[1m[31mE        +  where 1.0222885997616293 = score_predictions(               id  vendor_id  ... store_and_fwd_flag trip_duration\n0       id1419212        

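На зашумлённом датасете проблема обнаружена: значение метрики ≈ 1.02 превышает порог `THRESHOLD_RMSLE` = 0.9, поэтому тест `test_mse_on_df_noise` не проходит.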