# Entrenamiento

## Librerías necesarias

In [14]:
# Data Processing
import pandas as pd

# Modelling
from sklearn.ensemble import RandomForestClassifier
import joblib
import pickle

## Cargo datos de entrenamiento

In [15]:
# Load the January 2020 yellow-taxi trip records that serve as the training set
taxi = pd.read_parquet(path='./data/yellow_tripdata_2020-01.parquet')

In [18]:
# Preview the first five rows to sanity-check the loaded columns
taxi.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2020-01-01 00:28:15,2020-01-01 00:33:03,1.0,1.2,1.0,N,238,239,1,6.0,3.0,0.5,1.47,0.0,0.3,11.27,2.5,
1,1,2020-01-01 00:35:39,2020-01-01 00:43:04,1.0,1.2,1.0,N,239,238,1,7.0,3.0,0.5,1.5,0.0,0.3,12.3,2.5,
2,1,2020-01-01 00:47:41,2020-01-01 00:53:52,1.0,0.6,1.0,N,238,238,1,6.0,3.0,0.5,1.0,0.0,0.3,10.8,2.5,
3,1,2020-01-01 00:55:23,2020-01-01 01:00:14,1.0,0.8,1.0,N,238,151,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,0.0,
4,2,2020-01-01 00:01:58,2020-01-01 00:04:16,1.0,0.0,1.0,N,193,193,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8,0.0,


## Características para la clasificación

In [4]:
# Columns fed to the model as plain numeric values. Everything except
# passenger_count and trip_distance is derived inside preprocess().
numeric_feat = [
    # pickup-time derived
    "pickup_weekday",
    "pickup_hour",
    "work_hours",
    "pickup_minute",
    # raw trip attributes
    "passenger_count",
    "trip_distance",
    # trip duration / speed derived
    "trip_time",
    "trip_speed",
]

# Identifier-like columns taken straight from the raw trip records.
categorical_feat = ["PULocationID", "DOLocationID", "RatecodeID"]

# Full, ordered list of model inputs: numeric columns first, then categorical.
features = [*numeric_feat, *categorical_feat]

# Small offset added to the trip duration so the speed division never hits zero.
EPS = 1e-7

## Defino el preprocesamiento de los datos

In [5]:
def preprocess(df, target_col, feature_cols=None, eps=None, tip_threshold=0.2):
    """Clean raw taxi trips and build the model-ready feature/target frame.

    Rows with a non-positive ``fare_amount`` are dropped, the binary target is
    derived from the tip-to-fare ratio, and the time-based features are computed
    from the pickup/dropoff timestamps. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trips. Must contain ``fare_amount``, ``tip_amount``,
        ``trip_distance``, ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime`` (both datetime-typed), plus every non-derived
        column named in ``feature_cols``.
    target_col : str
        Name of the binary target column to create.
    feature_cols : list of str, optional
        Feature columns to keep. Defaults to the notebook-level ``features``.
    eps : float, optional
        Offset added to the trip duration before dividing, to avoid a
        division by zero. Defaults to the notebook-level ``EPS``.
    tip_threshold : float, default 0.2
        A trip is labelled 1 when ``tip_amount / fare_amount`` is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        Columns ``tpep_dropoff_datetime``, then ``feature_cols``, then
        ``target_col``. Features are cast to float32 with missing values
        replaced by -1.0; the target is int32. The index is a fresh RangeIndex.
    """
    # Fall back to the notebook-level configuration only when not overridden.
    if feature_cols is None:
        feature_cols = features
    if eps is None:
        eps = EPS
    feature_cols = list(feature_cols)

    # Basic cleaning: a positive fare is required for the tip fraction below.
    # Work on an explicit copy so the caller's frame is left untouched.
    trips = df.loc[df['fare_amount'] > 0].reset_index(drop=True).copy()

    # Target: did the passenger tip more than `tip_threshold` of the fare?
    trips['tip_fraction'] = trips['tip_amount'] / trips['fare_amount']
    trips[target_col] = trips['tip_fraction'] > tip_threshold

    # Pickup-time features.
    pickup = trips['tpep_pickup_datetime']
    trips['pickup_weekday'] = pickup.dt.weekday
    trips['pickup_hour'] = pickup.dt.hour
    trips['pickup_minute'] = pickup.dt.minute
    # Monday-Friday (weekday 0..4), pickup hour between 8 and 18 inclusive.
    trips['work_hours'] = (
        trips['pickup_weekday'].between(0, 4) & trips['pickup_hour'].between(8, 18)
    )

    # Use total_seconds(): `.dt.seconds` is only the seconds *component* of the
    # timedelta, so it silently drops whole days and wraps negative durations.
    duration = trips['tpep_dropoff_datetime'] - pickup
    trips['trip_time'] = duration.dt.total_seconds()
    trips['trip_speed'] = trips['trip_distance'] / (trips['trip_time'] + eps)

    # Keep only what the model needs; copy so the assignments below write to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    model_cols = feature_cols + [target_col]
    trips = trips[['tpep_dropoff_datetime'] + model_cols].copy()
    trips[model_cols] = trips[model_cols].astype("float32").fillna(-1.0)

    # The target only holds 0s and 1s, so store it as int32.
    trips[target_col] = trips[target_col].astype("int32")

    return trips.reset_index(drop=True)


## Ejecuto el preprocesamiento de los datos

In [14]:
# Name of the binary label column that preprocess() will create
target_col = "high_tip"

# Run the preprocessing function on the raw trips to get the training frame
taxi_train = preprocess(taxi, target_col)

## Entrenamiento del modelo

In [13]:
# Build a Random Forest of 100 trees, each capped at a maximum depth of 10.
# A fixed random_state pins the forest's internal randomness, so re-running
# the notebook on the same data reproduces the same model.
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# Fit the forest on the engineered feature columns against the binary target
rfc.fit(taxi_train[features], taxi_train[target_col])

CPU times: total: 1min 35s
Wall time: 13min 38s


## Exportamos el modelo a un archivo joblib

In [15]:
# Serialize the fitted forest with joblib; the call returns the list of files written
joblib.dump(rfc, filename="./random_forest.joblib")

['./random_forest.joblib']

## Guardamos el modelo en un archivo pickle

In [16]:
# Also store the fitted forest as a plain pickle file.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# artifact from a trusted location.
with open('./random_forest.pkl', mode='wb') as model_file:
    pickle.dump(rfc, model_file)