In [1]:
!pip -q install jcopml 

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

In [4]:
df = pd.read_csv('/kaggle/input/taxi-fare/taxi_fare.csv', parse_dates=['pickup_datetime']) # di lakukan parse tanggal dan waktu agar lebih mudah memecah ke data tanggal dan waktu

pickup_dt = df.pickup_datetime.dt # terdapat .dt karena sudah dilakukan parse pada saat melakukan read data
df['year'] = pickup_dt.year
df['month'] = pickup_dt.month
df['day'] = pickup_dt.dayofweek
df['hour'] = pickup_dt.hour
df.drop(columns='pickup_datetime', inplace=True)

df['distance'] = np.abs(df.pickup_longitude - df.dropoff_longitude) + np.abs(df.pickup_latitude - df.dropoff_latitude)
df.drop(columns=['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'], inplace=True)

df.head()

Unnamed: 0,fare_amount,passenger_count,year,month,day,hour,distance
0,4.5,1.0,2009,6,0,17,0.011742
1,16.9,1.0,2010,1,1,16,0.107481
2,5.7,2.0,2011,8,3,0,0.019212
3,7.7,1.0,2012,4,5,4,0.029386
4,5.3,1.0,2010,3,1,7,0.027194


# Feature Engineering

### Menghapus data yang aneh

In [6]:
df = df[(df.distance > 0) & (df.fare_amount > 0)] # menghapus jarak dan tarif di bawah nol

# Correlation Matrix -> For Numeric Data

1. **pearson** 

untuk machine learning **disarankan** menggunakan ini karena merupakan korelasi linear khususnya pada model regresi
- linear relationship
- syarat: berdistribusi normal

2. **pearson**

`pearson` yang sudah di `normalize`, **sangat disarankan** untuk digunakan pada machine learning

2. **spearman** 

**tidak disarankan** untuk machine learning, harusnya digunakan pada kasus statistik lain
- rank relationship
- syarat: subjek yang berbeda
- contoh: Nilai yang diberikan oleh Guru A vs Guru B
- contoh: Harga saham A vs Harga saham B

3. **kendall** 

**tidak disarankan** untuk machine learning, harusnya digunakan pada kasus statistik lain
- rank relationship
- syarat: subjek yang sama
- contoh: korelasi nilai quiz vs nilai ujian untuk murid A

In [7]:
from jcopml.plot import plot_correlation_matrix

In [8]:
plot_correlation_matrix(df, 'fare_amount', numeric_col=['passenger_count', 'year', 'distance'])

interactive(children=(ToggleButtons(description='method', options=('spearman', 'kendall', 'pearson', 'pearson_…

### Cara Kerja

**Feature Correlation** = Jika terdapat `dua feature` yang saling `berkorelasi kuat`, salah satu boleh di `buang`

**Target Correlation** = Jika terdapat feature yang `berkorelasi kuat` dengan `target`, maka feature tersebut `digunakan`

**Range Korelasi** yaitu antara `-1 sampai 1`, semakin `mendekati` -1 atau 1 maka semakin `kuat` korelasi, `minus` berarti `berkorelasi terbalik`

# Dataset splitting

In [10]:
X = df.drop(columns=['fare_amount'])
y = df['fare_amount']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
X_train.shape, X_test.shape, X_train.shape, y_train.shape

((14706, 6), (3677, 6), (14706, 6), (14706,))

# Training

In [12]:
from xgboost import XGBRegressor

from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp

In [13]:
# Preprocessor
preprocessor = ColumnTransformer([
    ('numeric', num_pipe(), ['passenger_count', 'distance', 'year']), # XGBoost tidak perlu di scaling, karena merupakan model yang tree-base  
    ('categoric', cat_pipe(encoder='onehot'), ['month', 'day', 'hour']) 
])

# Pipeline
xgb_pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', XGBRegressor(n_jobs=-1, random_state=42))
])

# Parameter Tunning
model = RandomizedSearchCV(xgb_pipeline, rsp.xgb_params, cv=3, n_iter=50, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

# Evaluation
print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
{'algo__colsample_bytree': 0.5261245937025092, 'algo__gamma': 1, 'algo__learning_rate': 0.08498604636076731, 'algo__max_depth': 5, 'algo__n_estimators': 122, 'algo__reg_alpha': 0.0015568103018717575, 'algo__reg_lambda': 6.2601238645330595, 'algo__subsample': 0.7433401936490238}
0.8550883698455869 0.7938483320321251 0.7634767082333894


Parameter `xgb__reg_alpha` dan `xgb__reg_lambda` merupakan regularization, yaitu **L1** dan **L2** 

Jadi, XGBoost memiliki `default regularization`, jadi jika `overfit` tinggal mengakali regularization tersebut

# Masih banyak yang bisa diimprove

Mari kita belajar dari peserta lain di kaggle

https://www.kaggle.com/code/breemen/nyc-taxi-fare-data-exploration

- Ternyata ada data yang salat (terletak di air)
- Analisis lokasi penjemputan, apakah di landmark tertentu seperti airport
- dll