<a href="https://colab.research.google.com/github/Crazydodo123/code-ml24/blob/main/cn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Importing

In [15]:
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd

cn = pd.read_csv('participant_data.csv')

cn_to_predict = cn[cn['Units'] == 0]
cn = cn[cn['Units'] != 0]
cn = cn.dropna(subset=["Units"])

X, y = cn.drop(['id', 'Date', 'Country', 'Commodity Type', 'Commodity Category', 'Units'], axis=1), cn[['Units']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [16]:
X_train.columns

Index(['Year', 'Quarter', 'Month', 'Week', 'Commodity', 'Movement'], dtype='object')

### Preprocessing

In [17]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [18]:
num_pipeline = make_pipeline(
  StandardScaler()
)

cat_pipeline = make_pipeline(
  OneHotEncoder()
)
preprocessing = ColumnTransformer([
    ('cat', cat_pipeline, ['Commodity', 'Movement']),
    ('num', num_pipeline, ['Year', 'Quarter', 'Month', 'Week'])
  ], remainder='drop')

In [19]:
X_train_prep = preprocessing.fit_transform(X_train)
X_test_prep = preprocessing.transform(X_test)

### Training

In [20]:
y_train

Unnamed: 0,Units
2445,339.0
6297,166.0
4093,44251.0
362,2313.0
5128,413.0
...,...
1463,380.0
3210,1839.0
3275,594.0
1192,1095.0


In [21]:
from xgboost import XGBRegressor

bst = XGBRegressor(learning_rate=.3, objective='reg:squarederror')
bst.fit(X_train_prep, y_train)


y_pred = bst.predict(X_test_prep)
root_mean_squared_error(y_pred, y_test)

3372.408127407826

### Tuning

In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],           # Number of boosting rounds
    'learning_rate': [0.01, 0.1, 0.2],         # Step size shrinkage
    'max_depth': [3, 5, 7],                    # Maximum depth of the trees
    'min_child_weight': [1, 3, 5],             # Minimum sum of instance weight needed in a child
    'subsample': [0.6, 0.8, 1.0],              # Subsample ratio of the training instances
    'colsample_bytree': [0.6, 0.8, 1.0],       # Subsample ratio of columns when constructing each tree
    'gamma': [0, 0.1, 0.2],                    # Minimum loss reduction to make a further partition on a leaf node
}

rnd_search = GridSearchCV(bst,
    param_grid=param_grid, verbose=3, cv=3, n_jobs=-1,
    scoring='neg_root_mean_squared_error')

rnd_search.fit(X_train_prep, y_train)

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits


In [26]:
rnd_search.best_params_

{'colsample_bytree': 1.0,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 5,
 'n_estimators': 100,
 'subsample': 0.8}

In [24]:
y_pred_best = rnd_search.best_estimator_.predict(X_test_prep)

roo(y_pred_best, y_test)

3135.1495639646946

In [30]:
from sklearn.metrics import r2_score, smape_score
r2 = r2_score(y_test, y_pred_best)
smape = smape_score(y_test, y_pred_best)
r2, smape

ImportError: cannot import name 'smape_score' from 'sklearn.metrics' (/usr/local/lib/python3.10/dist-packages/sklearn/metrics/__init__.py)

In [35]:
list(y_pred_best)

[3245.2947,
 640.6395,
 35.87559,
 1369.5433,
 2950.4177,
 17092.113,
 605.61475,
 26970.959,
 21126.055,
 2147.3943,
 18960.873,
 41936.39,
 32997.277,
 1558.4031,
 229.30125,
 26911.443,
 644.5301,
 29872.54,
 493.78656,
 -342.9595,
 2800.0527,
 937.67957,
 299.9068,
 2573.7263,
 -286.77063,
 23090.857,
 47798.176,
 408.9267,
 925.7604,
 102.799416,
 2452.2954,
 2426.255,
 1657.2063,
 3080.1301,
 1674.581,
 311.06543,
 -4179.83,
 30684.271,
 695.88153,
 1578.1,
 17954.021,
 181.0748,
 331.9168,
 749.8673,
 17948.436,
 17092.113,
 659.14087,
 40266.29,
 -53.995937,
 1211.5077,
 1019.9394,
 3172.4502,
 193.52316,
 2460.6606,
 1392.468,
 2075.7942,
 749.8673,
 504.03024,
 48779.176,
 266.81165,
 2318.9998,
 390.88388,
 2667.619,
 1788.4998,
 320.08612,
 368.95114,
 1292.3705,
 1307.7555,
 1866.8147,
 432.3011,
 618.9714,
 234.47491,
 2466.1604,
 1611.9335,
 49606.773,
 2437.6843,
 635.3587,
 255.19044,
 471.13907,
 2667.391,
 49321.664,
 38293.664,
 947.1691,
 20296.19,
 20418.105,
 547

In [36]:
import numpy as np

def smape(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))
smape(y_test, list(y_pred_best))

ValueError: Unable to coerce to Series, length must be 1: given 510

In [29]:
# prompt: check smape

def smape(y_true, y_pred):
  """
  Calculate the Symmetric Mean Absolute Percentage Error (SMAPE).

  Args:
    y_true: The true values.
    y_pred: The predicted values.

  Returns:
    The SMAPE value.
  """
  denominator = (abs(y_true) + abs(y_pred)) / 2.0
  diff = abs(y_true - y_pred) / denominator
  diff[denominator == 0] = 0.0
  return 200 * (diff.mean())

smape_value = smape(y_test, y_pred_best)
print(f"SMAPE: {smape_value}")


ValueError: Unable to coerce to Series, length must be 1: given 510

### Predicting

In [39]:
X_to_predict, y_to_predict = cn.drop(['Date', 'Country', 'Commodity Type', 'Commodity Category', 'Units'], axis=1), cn_to_predict[['Units']]
X_to_predict_prep = preprocessing.transform(X_to_predict)
y_to_predict_pred = rnd_search.best_estimator_.predict(X_to_predict_prep)

pd.DataFrame(y_to_predict_pred).to_csv('cn_predictions.csv', index=False)

### Saving

In [None]:
import joblib

joblib.dump(rnd_search.best_estimator_, "./estimators/boost.pkl")