In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

df = pd.read_feather('mined_data.feather')

# Drop columns not needed
df.drop(['order_id', 'order_number', 'order_dow', 'order_hour_of_day',
         'reordered', 'product_name', 'days_since_user_first_order'], axis=1, inplace=True)

df = df[df['days_since_user_ordered_product'] >= 0]

df['reordered_within_30_days'] = (df['days_since_user_ordered_product'] <= 30).astype(int)

df.drop(['days_since_user_ordered_product'], axis=1, inplace=True)

X = df.drop('reordered_within_30_days', axis=1)
y = df['reordered_within_30_days']

# Creating a smaller validation set to try different models
validation_df = df.sample(frac=0.1, random_state=96)
val_X = validation_df.drop('reordered_within_30_days', axis=1)
val_y = validation_df['reordered_within_30_days']

X_train, X_test, y_train, y_test = train_test_split(val_X, val_y, test_size=0.2, random_state=21)

In [2]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.32      0.15      0.21    101411
           1       0.74      0.88      0.81    281120

    accuracy                           0.69    382531
   macro avg       0.53      0.52      0.51    382531
weighted avg       0.63      0.69      0.65    382531



In [3]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.37      0.28      0.32    101411
           1       0.76      0.83      0.79    281120

    accuracy                           0.68    382531
   macro avg       0.56      0.55      0.56    382531
weighted avg       0.66      0.68      0.67    382531



In [4]:
from sklearn.linear_model import LogisticRegression

lg = LogisticRegression()
lg.fit(X_train, y_train)
y_pred = lg.predict(X_test)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.52      0.09      0.15    101411
           1       0.75      0.97      0.84    281120

    accuracy                           0.74    382531
   macro avg       0.63      0.53      0.50    382531
weighted avg       0.69      0.74      0.66    382531



In [5]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.46      0.17      0.25    101411
           1       0.76      0.93      0.83    281120

    accuracy                           0.73    382531
   macro avg       0.61      0.55      0.54    382531
weighted avg       0.68      0.73      0.68    382531



In [6]:
from xgboost import XGBClassifier

xgb = XGBClassifier(device='cuda')
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


              precision    recall  f1-score   support

           0       0.56      0.12      0.20    101411
           1       0.75      0.97      0.85    281120

    accuracy                           0.74    382531
   macro avg       0.66      0.54      0.53    382531
weighted avg       0.70      0.74      0.68    382531



XGBoost and Logistic Regression have the best accuracy. Since XGBoost was faster, I'm going to use that.