In [1]:
import numpy as np  
import pandas as pd

In [2]:
aisles = pd.read_csv("aisles.csv")
departments = pd.read_csv("departments.csv")
products = pd.read_csv("products.csv")
orders = pd.read_csv("orders.csv")
order_products = pd.read_csv("order_products__train.csv")

print("aisles:", aisles.shape)
print("departments:", departments.shape)
print("products:", products.shape)
print("orders:", orders.shape)
print("order_products__train:", order_products.shape)

# Quick sanity check
display(order_products.head())
display(orders.head())
display(products.head())


aisles: (134, 2)
departments: (21, 2)
products: (49688, 4)
orders: (3421083, 7)
order_products__train: (1384617, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [3]:
df_order_product_ordprod = order_products.merge(orders, on="order_id", how="left")
df_order_product_ordprod = df_order_product_ordprod.merge( products[["product_id", "aisle_id", "department_id"]], on="product_id", how="left")

current_df = df_order_product_ordprod.copy()
current_df.head(5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle_id,department_id
0,1,49302,1,1,112108,train,4,4,10,9.0,120,16
1,1,11109,2,1,112108,train,4,4,10,9.0,108,16
2,1,10246,3,0,112108,train,4,4,10,9.0,83,4
3,1,49683,4,0,112108,train,4,4,10,9.0,83,4
4,1,43633,5,1,112108,train,4,4,10,9.0,95,15


### Cleaning and Handling data ###

In [15]:
current_df["days_since_prior_order"] = current_df["days_since_prior_order"].fillna(0)

# user ft. 1: user reorder ratio
user_reorder_ratio = current_df.groupby("user_id")["reordered"].mean().reset_index()
user_reorder_ratio.rename(columns={"reordered":"user_reorder_ratio"}, inplace=True)

current_df = current_df.merge(user_reorder_ratio, on="user_id", how="left")

# user ft. 2: product reorder probability
product_reorder_prob = current_df.groupby("product_id")["reordered"].mean().reset_index()
product_reorder_prob.rename(columns={"reordered":"product_reorder_prob"}, inplace=True)


current_df = current_df.merge(product_reorder_prob, on="product_id", how="left")

# we will add more features for our prediction task
X = current_df.drop(columns=["reordered","eval_set"])
y = current_df["reordered"].astype(int)

X = X.astype(float).to_numpy()
y = y.astype(int).to_numpy()

### Splitting on Train/test and sampling

In [16]:
def train_test_split(X,y, test_size = 0.2, random_state = 42):
    random_gen = np.random.default_rng(seed=random_state)
    n_samples = X.shape[0]
    #shuffling the ds without changing the og array
    index = random_gen.permutation(n_samples)
    test_sample = int(n_samples * 0.2)
    test_index = index[:test_sample]
    train_index = index[test_sample:]

    return X[train_index], X[test_index], y[train_index], y[test_index]

n_sample = 40000

if X.shape[0] > n_sample:
    random_gen = np.random.default_rng(seed=42)
    sample_index = random_gen.choice(X.shape[0], size=n_sample, replace=False)
    X_small = X[sample_index]
    y_small = y[sample_index]
else:
    X_small, y_small = X,y

print("Currently using the subset with the size: ", X_small.shape[0])

X_train, X_test, y_train, y_test = train_test_split(X_small, y_small, test_size=0.2, random_state=42)
print("Train size: ", X_train.shape[0])
print("Test size: ", X_test.shape[0])


Currently using the subset with the size:  40000
Train size:  32000
Test size:  8000


### Feature scaling : We will do the scaling as its imp. that we will not be dependent on one of the feature.


In [17]:
def min_max_normalization(train, test):
    min_vals = train.min(axis=0)
    max_vals = train.max(axis=0)
    denom = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)

    train_scaled = (train - min_vals) / denom
    test_scaled = (test - min_vals) / denom

    return train_scaled, test_scaled, min_vals, max_vals

X_train_scaled, X_test_scaled, min_vals, max_vals = min_max_normalization(X_train, X_test)

print("Scaled feature example (first row):")
print(X_train_scaled[0])

Scaled feature example (first row):
[0.89389991 0.79050763 0.05128205 0.29401466 0.0625     1.
 1.         0.06666667 0.91729323 0.15       0.75       0.68143375]


In [18]:
class KNNClassifier:

    def __init__(self, k):
        self.k = k
        self.X_train = None
        self.y_train = None

    # for storing the training data
    def fit(self, X, y):
        self.X_train = np.array(X)
        self.y_train = np.array(y)
    
    def euclid_dist(self, x1,x2):
        return np.linalg.norm(x1-x2)
    
    def predict_one(self,x):
        # predicting the label for a single eg. x

        distance = np.linalg.norm(self.X_train - x, axis=1)

        knn_index = np.argsort(distance)[:self.k]
        knn_labels = self.y_train[knn_index]

        #Voting for the majority 0 or 1 
        one_count = np.sum(knn_labels)
        zeroes_count = len(knn_labels) - one_count

        return 1 if one_count >= zeroes_count else 0
    
    def predict(self, X):
        X = np.array(X)
        predictn = [self.predict_one(i) for i in X]
        return np.array(predictn, dtype=int)
    
    def predict_proba(self, X): #for roc_auc
        X = np.array(X)
        probabilities = []
        for x in X:
            distances = np.linalg.norm(self.X_train - x, axis=1)
            knn_indices = np.argsort(distances)[:self.k]
            knn_labels = self.y_train[knn_indices]
            # Probability = proportion of positive neighbors
            prob = np.mean(knn_labels)
            probabilities.append(prob)
        return np.array(probabilities)



### Accuracy of our model

In [19]:
def precision_score(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    
    if tp + fp == 0:
        return 0
    return (tp / (tp + fp))

def recall_score(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    if tp + fn == 0:
        return 0.0
    return tp / (tp + fn)

def f1_score(y_true, y_pred):
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    if prec + rec == 0:
        return 0.0
    return 2 * (prec * rec) / (prec + rec)

def roc_auc_score(y_true, y_scores):
    sorted_indices = np.argsort(y_scores)[::-1]
    y_true_sorted = y_true[sorted_indices]
    
    # Count positives and negatives
    n_pos = np.sum(y_true == 1)
    n_neg = np.sum(y_true == 0)
    
    if n_pos == 0 or n_neg == 0:
        return 0.0
    
    tp = 0
    fp = 0
    auc = 0
    prev_tp = 0
    prev_fp = 0
    
    for i in range(len(y_true_sorted)):
        if y_true_sorted[i] == 1:
            tp += 1
        else:
            fp += 1
            auc += tp 
 
        if i < len(y_true_sorted) - 1 and y_scores[sorted_indices[i]] != y_scores[sorted_indices[i+1]]:
            prev_tp = tp
            prev_fp = fp
    
    return auc / (n_pos * n_neg)


In [20]:
def accuracy_score(y_true, y_pred):
    # acc = correct/total
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    return np.mean(y_true == y_pred)

### Training and Predicting the Train and Test Dataset respectively, then after evaluating it  ###

In [44]:
knn = KNNClassifier(9)
knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test_scaled)
y_proba = knn.predict_proba(X_test_scaled)


acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print(f"The accuracy on the test subset by using our KNN model is: {acc:.4f}")
print("=" * 50)
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"ROC-AUC:   {auc:.4f}")
print("=" * 50)


The accuracy on the test subset by using our KNN model is: 0.7508
Accuracy:  0.7508
Precision: 0.7642
Recall:    0.8448
F1 Score:  0.8025
ROC-AUC:   0.8121


In [42]:
baseline_acc = np.mean(y_test == 1)
print(f"Baseline 'always 1' accuracy: {baseline_acc:.4f}")

Baseline 'always 1' accuracy: 0.5994


### Cross Validation for best possible k value ###

In [26]:
from sklearn.model_selection import StratifiedKFold

# This is just for finding the best possible k value and also performing stratified k fold
# croos validation so we can test it on different training and testing dataset. This will return our mean accuracy, f1 score
# and the ROC area under curve value to see how good the model is.
def crossValidation(k, X_train, y_train, n_splits=5):
    
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    acc_scores = []
    f1_scores = []
    auc_scores = []

    for train_idx, val_idx in kfold.split(X_train, y_train):
        
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        
        y_tr, y_val = y_train[train_idx], y_train[val_idx]

        knn = KNNClassifier(k=k)
        
        knn.fit(X_tr, y_tr)
        
        y_val_pred = knn.predict(X_val)

        y_val_scores = y_val_pred.astype(float)

        acc = accuracy_score(y_val, y_val_pred)
        f1  = f1_score(y_val, y_val_pred)

        
        acc_scores.append(acc)
        f1_scores.append(f1)
        
        try:
            auc = roc_auc_score(y_val, y_val_scores)
        except ValueError:
            auc = 0.0

        auc_scores.append(auc)

    return np.mean(acc_scores), np.mean(f1_scores), np.mean(auc_scores)


# This is some of the k values our team has considered (the list is short because of computation cost, it increases with k value increase)
k_values = [1, 3, 5, 7, 9, 11, 13, 15, 21, 31]

results = []

print("For the best k we will take the mean of that best k's evaluation metrics (for each) through k fold \n")
for k in k_values:
    mean_acc, mean_f1, mean_auc = crossValidation(k, X_train_scaled, y_train)
    results.append((k, mean_acc, mean_f1, mean_auc))
    print(f"best k = {k:2d} | "f"mean Accuracy = {mean_acc:.4f} | "f"mean F1 = {mean_f1:.4f} | "f"mean ROC-AUC = {mean_auc:.4f}")


# we will take the best evaluation metrics based on our 5 fold cross validation for each of our k values
# which is then computed by taking mean of it.
best_k, best_acc, best_f1, best_auc = max(results, key=lambda x: x[3])

print("\n###############################################")
print(f"Best k value based on cross validation: k = {best_k}")
print(f"Mean Accuracy:  {best_acc:.4f}")
print(f"Mean F1:        {best_f1:.4f}")
print(f"Mean ROC-AUC:   {best_auc:.4f}")
print("#################################################\n")


# Now that we got the best k value, we will train our model with this k value on our train set
# and then predict for our test dataset.
best_knn = KNNClassifier(k=best_k)
best_knn.fit(X_train_scaled, y_train)

y_test_pred = best_knn.predict(X_test_scaled)

y_test_scores = y_test_pred.astype(float)
y_test_proba = best_knn.predict_proba(X_test_scaled)


test_acc  = accuracy_score(y_test, y_test_pred)
test_prec = precision_score(y_test, y_test_pred)
test_rec  = recall_score(y_test, y_test_pred)
test_f1   = f1_score(y_test, y_test_pred)

try:
    test_auc = roc_auc_score(y_test, y_test_scores)
except ValueError:
    test_auc = 0.0

test_auc_proba = roc_auc_score(y_test, y_test_proba)

print("Evaluation of our model with best possible k value")
print("---------------------------------------------------------")
print(f"Best k value: {best_k}")
print(f"Accuracy:               {test_acc:.4f}")
print(f"Precision:              {test_prec:.4f}")
print(f"Recall:                 {test_rec:.4f}")
print(f"F1 Score:               {test_f1:.4f}")
print(f"ROC-AUC:                {test_auc:.4f}")
print(f"ROC-AUC (probability scores):   {test_auc_proba:.4f}")
print("---------------------------------------------------------")


Tuning k for scratch KNN using cross-validation (best ROC-AUC)...

k =  1 | CV Accuracy = 0.6849 | CV F1 = 0.7386 | CV ROC-AUC = 0.6701
k =  3 | CV Accuracy = 0.7198 | CV F1 = 0.7727 | CV ROC-AUC = 0.7011
k =  5 | CV Accuracy = 0.7368 | CV F1 = 0.7883 | CV ROC-AUC = 0.7164
k =  7 | CV Accuracy = 0.7469 | CV F1 = 0.7976 | CV ROC-AUC = 0.7256
k =  9 | CV Accuracy = 0.7498 | CV F1 = 0.8005 | CV ROC-AUC = 0.7279
k = 11 | CV Accuracy = 0.7543 | CV F1 = 0.8048 | CV ROC-AUC = 0.7317
k = 13 | CV Accuracy = 0.7564 | CV F1 = 0.8070 | CV ROC-AUC = 0.7332
k = 15 | CV Accuracy = 0.7571 | CV F1 = 0.8077 | CV ROC-AUC = 0.7336
k = 21 | CV Accuracy = 0.7602 | CV F1 = 0.8110 | CV ROC-AUC = 0.7358
k = 31 | CV Accuracy = 0.7639 | CV F1 = 0.8147 | CV ROC-AUC = 0.7386

Best k based on mean CV ROC-AUC: k = 31
Mean CV Accuracy:  0.7639
Mean CV F1:        0.8147
Mean CV ROC-AUC:   0.7386

Final evaluation of scratch KNN with best_k on TEST set
---------------------------------------------------------
Best k (b

In [45]:
# The below output cell is included above, you can see this in the above cell if you rerun the whole code, the below is the main ROC-AUC 
# probability scores for our model

ROC-AUC (probability scores):   0.8354


### Other Predictive Models for Comparison ###

In [49]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# this is for training the models and printing their evaluation metrics
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    
    print(f"Training model: {name}")    
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    
    acc  = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec  = recall_score(y_test, y_pred)
    f1   = f1_score(y_test, y_pred)
    
    auc = roc_auc_score(y_test, y_proba)
    
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"ROC-AUC:   {auc:.4f}")
    
    print("=" * 60)
    print()

lr = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
    class_weight="balanced"
)

rf = RandomForestClassifier(
    n_estimators=600,
    random_state=42,
    n_jobs=-1
)

xgb = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.6,
    random_state=42,
    n_jobs=-1,
)

models = {
    "Logistic Regression": lr,
    "Random Forest": rf,
    "XGBoost": xgb
}


for name, model in models.items():
    evaluate_model(name, model, X_train_scaled, X_test_scaled, y_train, y_test)

Training model: Logistic Regression
Accuracy:  0.7734
Precision: 0.8342
Recall:    0.7762
F1 Score:  0.8041
ROC-AUC:   0.8607

Training model: Random Forest
Accuracy:  0.7800
Precision: 0.8019
Recall:    0.8407
F1 Score:  0.8208
ROC-AUC:   0.8594

Training model: XGBoost
Accuracy:  0.7825
Precision: 0.8052
Recall:    0.8405
F1 Score:  0.8224
ROC-AUC:   0.8658

