# Starter and Homework

## Starter

In [14]:
import pickle
from pathlib import Path

import pandas as pd

In [28]:
# Load the two pickled objects stored in model.bin: `dv` is later used via
# .transform() and `lr` via .predict() (presumably a fitted DictVectorizer and
# a linear regression model — confirm against the training code).
# NOTE: unpickling can execute arbitrary code; only load a model.bin you trust.
with open('model.bin', 'rb') as f_in:
    dv, lr = pickle.load(f_in)

In [16]:
categorical = ['PUlocationID', 'DOlocationID']


def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Steps:
      1. Read the parquet file at ``filename`` into a DataFrame.
      2. Add a ``duration`` column: minutes between ``pickup_datetime`` and
         ``dropOff_datetime``.
      3. Keep only trips lasting between 1 and 60 minutes (both inclusive).
      4. Encode the columns listed in the module-level ``categorical`` as
         strings of integers, with missing values mapped to ``'-1'``.

    Parameters
    ----------
    filename : path or URL accepted by ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered copy; the original row index labels are kept.
    """
    trips = pd.read_parquet(filename)

    # Trip length in minutes.
    elapsed = trips['dropOff_datetime'] - trips['pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends, i.e. 1 <= duration <= 60.
    in_range = trips['duration'].between(1, 60)
    trips = trips.loc[in_range].copy()

    # Missing location ids become -1 before the int -> str conversion.
    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

In [21]:
# Run parameters: which month of which taxi dataset to score.
year = 2021
month = 2
taxi_type = 'fhv'

# Both paths are derived from the same three parameters so the input URL and
# the output location cannot drift apart.
# NOTE: read_data reads the columns dropOff_datetime, pickup_datetime,
# PUlocationID and DOlocationID, so any other taxi_type must provide them.
input_file = f'https://nyc-tlc.s3.amazonaws.com/trip+data/{taxi_type}_tripdata_{year:04d}-{month:02d}.parquet'
output_file = f'output/{taxi_type}/{year:04d}-{month:02d}.parquet'

# Fetch the trips and apply read_data's duration filter and id encoding.
df = read_data(input_file)

In [18]:
# Turn the categorical columns into one dict per row, encode them with the
# loaded `dv`, and get one prediction per row from the loaded `lr`
# (presumably the ride duration in minutes — confirm against the training code).
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = lr.predict(X_val)

## Homework

### Q1. Notebook

In [19]:
# Store the predictions next to the rides and display their mean.
df['pred'] = y_pred
df['pred'].mean()

16.191691679979066

### Q2. Preparing the output

In [23]:
# Build one identifier per ride as '<year>/<month>_<index label>'. read_data
# filters rows without resetting the index, so the labels are those of the
# frame as it was read, not consecutive positions in the filtered frame.
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')
# Keep only the identifier and the prediction for the output file.
df_result = df[['ride_id', 'pred']].copy()

In [25]:
# Make sure the destination directory exists first: writing to a path whose
# parent directory is missing fails, which is what happens on a fresh checkout.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Write the ride_id/pred table uncompressed and without the DataFrame index.
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

### Q3. Creating the scoring script

In [27]:
# Export the notebook starter.ipynb to a plain Python script.
!jupyter nbconvert --to script starter.ipynb

[NbConvertApp] Converting notebook starter.ipynb to script
[NbConvertApp] Writing 1411 bytes to starter.py


### Q4. Virtual environment

"scikit-learn": { <br>
            "hashes": [ <br>
                "sha256:08ef968f6b72033c16c479c966bf37ccd49b06ea91b765e1cc27afefe723920b", <br>

### Q5. Parametrize the script

In [29]:
# Read back the stored predictions from output/fhv/2021-03.parquet and display their mean.
# NOTE(review): the notebook itself is configured for month = 2, so this file
# is presumably produced by running the parametrized starter.py — confirm.
df_q5 = pd.read_parquet('output/fhv/2021-03.parquet')
df_q5['pred'].mean()

16.298821614015107

### Q6. Docker container

In [None]:
# Base image. NOTE(review): presumably provides Python 3.9.7 and the pickled
# model.bin that starter.py loads — confirm, since no model file is copied below.
FROM agrigorev/zoomcamp-model:mlops-3.9.7-slim

# Upgrade pip, then install pipenv to install the locked dependencies.
RUN pip install -U pip
RUN pip install pipenv

WORKDIR /app

# Copy only the dependency manifests first so the install layer below can be
# served from the build cache when just the script changes.
COPY [ "Pipfile", "Pipfile.lock",  "./" ]

# --system installs into the image's Python instead of a virtualenv;
# --deploy aborts the build if Pipfile.lock is out of date with Pipfile.
RUN pipenv install --system --deploy

COPY [ "starter.py", "starter.py" ]

# Create the directory the fhv results are presumably written to by starter.py.
RUN mkdir -p output/fhv

# NOTE(review): this runs the scoring for fhv / 2021 / 04 at image build time,
# so the result is baked into the image rather than produced by `docker run`.
RUN python starter.py fhv 2021 04