In [49]:
# Importing essential libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

In [34]:
# Load datasets

orders = pd.read_csv('orders.csv')
order_products = pd.read_csv('order_products__prior.csv')
order_products_train = pd.read_csv('order_products__train.csv')
products = pd.read_csv('products.csv')
aisles = pd.read_csv('aisles.csv')
departments = pd.read_csv('departments.csv')

In [3]:
# Merge datasets

merged_df = orders.merge(order_products, on='order_id').merge(products, on='product_id').merge(aisles, on='aisle_id').merge(departments, on='department_id')
merged_df

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,2539329,1,prior,1,2,8,,196,1,0,Soda,77,7,soft drinks,beverages
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,soft drinks,beverages
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,soft drinks,beverages
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,soft drinks,beverages
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,soft drinks,beverages
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32434484,2939884,130327,prior,3,1,13,0.0,27414,5,0,Organic Emmer Farro,68,10,bulk grains rice dried goods,bulk
32434485,3267360,150683,prior,1,5,16,,27414,13,0,Organic Emmer Farro,68,10,bulk grains rice dried goods,bulk
32434486,414077,159238,prior,9,6,12,8.0,27414,1,0,Organic Emmer Farro,68,10,bulk grains rice dried goods,bulk
32434487,813243,171675,prior,13,2,6,13.0,27414,34,0,Organic Emmer Farro,68,10,bulk grains rice dried goods,bulk


In [4]:
# Feature Engineering

customer_features1 = merged_df.groupby('user_id').agg({
    'order_id': 'nunique',  # Order Frequency
    'product_id': 'count',  # Total products purchased
    'reordered': 'mean',    # Reorder ratio
    'aisle_id': 'nunique',  # Aisle diversity
}).rename(columns={
    'order_id': 'order_frequency',
    'product_id': 'total_products',
    'reordered': 'reorder_ratio',
    'aisle_id': 'aisle_diversity',
})

customer_features1

Unnamed: 0_level_0,order_frequency,total_products,reorder_ratio,aisle_diversity
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,10,59,0.694915,12
2,14,195,0.476923,33
3,12,88,0.625000,16
4,5,18,0.055556,14
5,4,37,0.378378,16
...,...,...,...,...
206205,3,32,0.250000,17
206206,67,285,0.473684,50
206207,16,223,0.587444,46
206208,49,677,0.707533,63


In [5]:
# Choose valuable features

customer_features2 = merged_df.groupby('user_id').agg({
    'days_since_prior_order': 'mean',  # Average days between orders
    'order_number': lambda x: np.mean(x.value_counts()),  # Average order size
    'product_id': 'nunique',  # Unique products ordered
}).rename(columns={
    'days_since_prior_order': 'avg_days_between_orders',
    'order_number': 'avg_order_size',
    'product_id': 'unique_products_ordered'
})

customer_features2

Unnamed: 0_level_0,avg_days_between_orders,avg_order_size,unique_products_ordered
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,20.259259,5.900000,18
2,15.967033,13.928571,102
3,11.487179,7.333333,33
4,15.357143,3.600000,17
5,14.500000,9.250000,23
...,...,...,...
206205,20.666667,10.666667,24
206206,4.042705,4.253731,150
206207,14.879397,13.937500,92
206208,7.442105,13.816327,198


In [6]:
# Joined the two dataframes
customer_features_combined = customer_features1.join(customer_features2)

# Displaying the resulting dataframe
customer_features_combined

Unnamed: 0_level_0,order_frequency,total_products,reorder_ratio,aisle_diversity,avg_days_between_orders,avg_order_size,unique_products_ordered
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,10,59,0.694915,12,20.259259,5.900000,18
2,14,195,0.476923,33,15.967033,13.928571,102
3,12,88,0.625000,16,11.487179,7.333333,33
4,5,18,0.055556,14,15.357143,3.600000,17
5,4,37,0.378378,16,14.500000,9.250000,23
...,...,...,...,...,...,...,...
206205,3,32,0.250000,17,20.666667,10.666667,24
206206,67,285,0.473684,50,4.042705,4.253731,150
206207,16,223,0.587444,46,14.879397,13.937500,92
206208,49,677,0.707533,63,7.442105,13.816327,198


In [7]:
# Defined function to standarize(normalise) the data

def standard_scaler(data):
    mean = np.mean(data, axis=0)
    std_dev = np.std(data, axis=0)
    scaled_data = (data - mean) / std_dev
    return scaled_data, mean, std_dev

customer_features_scaled, mean, std_dev = standard_scaler(customer_features_combined.values)

print('Scaled_customer_features:', customer_features_scaled, sep = '\n')
#print('\nmean:',mean, end='\n')
#print('\nstandard deviation:',std_dev)

Scaled_customer_features:
[[-0.33566236 -0.48132062  1.23815226 ...  0.66453607 -0.69097766
  -0.82230831]
 [-0.0954904   0.18466785  0.21058293 ...  0.06900713  0.67825489
   0.66199514]
 [-0.21557638 -0.33930838  0.90858644 ... -0.55255432 -0.44652986
  -0.55725412]
 ...
 [ 0.02459559  0.32178313  0.73155509 ... -0.08189794  0.67977761
   0.48529235]
 [ 2.00601431  2.54500936  1.29763077 ... -1.11379192  0.65911209
   2.35834195]
 [-0.15553339 -0.13853244  0.19146918 ...  0.66683964 -0.00486209
   0.06120565]]


In [8]:
# Check mean and standard deviation of scaled data

scaled_mean = np.mean(customer_features_scaled, axis=0)
scaled_std_dev = np.std(customer_features_scaled, axis=0)

print('Scaled Data Mean:\n', scaled_mean)
print('\nScaled Data Standard Deviation:', scaled_std_dev)

Scaled Data Mean:
 [-3.10116659e-18  3.08049215e-17  1.59055389e-16  1.46788552e-17
 -1.10194786e-16  6.65889383e-17 -8.21809147e-17]

Scaled Data Standard Deviation: [1. 1. 1. 1. 1. 1. 1.]


In [9]:
# Defined function for KMeans with pre-assigned k

def kmeans_clustering(data, k, max_iter=300):
    centroids = data[np.random.choice(data.shape[0], k, replace=False)]
    for _ in range(max_iter):
        clusters = np.argmin(np.linalg.norm(data[:, np.newaxis] - centroids, axis=2), axis=1)
        new_centroids = np.array([data[clusters == i].mean(axis=0) for i in range(k)])
        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids
    return clusters

In [10]:
# Defined function for PCA

def pca_manual(data, n_components):
    # Center the data by subtracting the mean of each feature
    data_meaned = data - np.mean(data, axis=0)
    
    # Calculate the covariance matrix of the centered data
    covariance_matrix = np.cov(data_meaned, rowvar=False)
    
    # Calculate the eigenvalues and eigenvectors of the covariance matrix
    eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
    
    # Sort the eigenvalues in descending order and get the indices
    sorted_indices = np.argsort(eigenvalues)[::-1]
    # Sort the eigenvalues using the indices
    sorted_eigenvalues = eigenvalues[sorted_indices]
    # Sort the eigenvectors using the indices
    sorted_eigenvectors = eigenvectors[:, sorted_indices]
    
    # Select the first n_components eigenvectors
    eigenvector_subset = sorted_eigenvectors[:, :n_components]
    
    # Transform the data to the new space defined by the selected eigenvectors
    transformed_data = -np.dot(data_meaned, eigenvector_subset)
    
    return transformed_data

In [11]:
customer_features_pca = pca_manual(customer_features_scaled, 2)
customer_features_pca

array([[ 1.11369793,  1.03055661],
       [-0.68888321, -0.66158306],
       [ 0.47089578,  1.1430851 ],
       ...,
       [-1.27087451, -0.55741057],
       [-4.86858633,  0.04568733],
       [-0.1173058 , -0.50680289]])

In [12]:
k = 4
clusters = kmeans_clustering(customer_features_pca, k)

# Create a copy of the original dataframe
customer_features_with_clusters = customer_features_combined.copy()

# Add cluster labels to the customer features
customer_features_with_clusters['cluster'] = clusters
customer_features_with_clusters

Unnamed: 0_level_0,order_frequency,total_products,reorder_ratio,aisle_diversity,avg_days_between_orders,avg_order_size,unique_products_ordered,cluster
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,10,59,0.694915,12,20.259259,5.900000,18,1
2,14,195,0.476923,33,15.967033,13.928571,102,2
3,12,88,0.625000,16,11.487179,7.333333,33,3
4,5,18,0.055556,14,15.357143,3.600000,17,1
5,4,37,0.378378,16,14.500000,9.250000,23,1
...,...,...,...,...,...,...,...,...
206205,3,32,0.250000,17,20.666667,10.666667,24,1
206206,67,285,0.473684,50,4.042705,4.253731,150,0
206207,16,223,0.587444,46,14.879397,13.937500,92,2
206208,49,677,0.707533,63,7.442105,13.816327,198,0


In [13]:
customer_features_combined1 = customer_features_with_clusters.merge(merged_df[['user_id', 'product_id', 'reordered', 'order_number']], on='user_id')
customer_features_combined1

Unnamed: 0,user_id,order_frequency,total_products,reorder_ratio,aisle_diversity,avg_days_between_orders,avg_order_size,unique_products_ordered,cluster,product_id,reordered,order_number
0,1,10,59,0.694915,12,20.259259,5.900000,18,1,196,0,1
1,1,10,59,0.694915,12,20.259259,5.900000,18,1,196,1,2
2,1,10,59,0.694915,12,20.259259,5.900000,18,1,196,1,3
3,1,10,59,0.694915,12,20.259259,5.900000,18,1,196,1,4
4,1,10,59,0.694915,12,20.259259,5.900000,18,1,196,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
32434484,206209,13,129,0.472868,41,20.275862,9.923077,68,2,40534,0,1
32434485,206209,13,129,0.472868,41,20.275862,9.923077,68,2,40534,1,3
32434486,206209,13,129,0.472868,41,20.275862,9.923077,68,2,14197,0,6
32434487,206209,13,129,0.472868,41,20.275862,9.923077,68,2,14197,1,13


In [14]:
# Prepare train and test data
train = orders[orders.eval_set == 'train']

In [15]:

train_dropped = train.drop(columns='order_number')
train_dropped

Unnamed: 0,order_id,user_id,eval_set,order_dow,order_hour_of_day,days_since_prior_order
10,1187899,1,train,4,8,14.0
25,1492625,2,train,1,11,30.0
49,2196797,5,train,0,11,6.0
74,525192,7,train,2,11,6.0
78,880375,8,train,1,14,10.0
...,...,...,...,...,...,...
3420838,2585586,206199,train,2,16,30.0
3420862,943915,206200,train,6,19,6.0
3420924,2371631,206203,train,4,19,30.0
3420933,1716008,206205,train,1,16,10.0


In [16]:
test = orders[orders.eval_set == 'test']

In [17]:
test_dropped = test.drop(columns='order_number')
test_dropped

Unnamed: 0,order_id,user_id,eval_set,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,5,15,11.0
44,329954,4,test,3,12,30.0
53,1528013,6,test,3,16,22.0
96,1376945,11,test,6,11,8.0
102,1356845,12,test,1,20,30.0
...,...,...,...,...,...,...
3420918,2728930,206202,test,2,17,6.0
3420929,350108,206204,test,4,14,14.0
3421001,1043943,206206,test,0,20,0.0
3421018,2821651,206207,test,2,13,14.0


In [18]:
# Prepare merged DataFrames
train_merged = pd.merge(train_dropped, customer_features_combined1, on='user_id')
train_merged

Unnamed: 0,order_id,user_id,eval_set,order_dow,order_hour_of_day,days_since_prior_order,order_frequency,total_products,reorder_ratio,aisle_diversity,avg_days_between_orders,avg_order_size,unique_products_ordered,cluster,product_id,reordered,order_number
0,1187899,1,train,4,8,14.0,10,59,0.694915,12,20.259259,5.900000,18,1,196,0,1
1,1187899,1,train,4,8,14.0,10,59,0.694915,12,20.259259,5.900000,18,1,196,1,2
2,1187899,1,train,4,8,14.0,10,59,0.694915,12,20.259259,5.900000,18,1,196,1,3
3,1187899,1,train,4,8,14.0,10,59,0.694915,12,20.259259,5.900000,18,1,196,1,4
4,1187899,1,train,4,8,14.0,10,59,0.694915,12,20.259259,5.900000,18,1,196,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20641986,272231,206209,train,6,14,30.0,13,129,0.472868,41,20.275862,9.923077,68,2,40534,0,1
20641987,272231,206209,train,6,14,30.0,13,129,0.472868,41,20.275862,9.923077,68,2,40534,1,3
20641988,272231,206209,train,6,14,30.0,13,129,0.472868,41,20.275862,9.923077,68,2,14197,0,6
20641989,272231,206209,train,6,14,30.0,13,129,0.472868,41,20.275862,9.923077,68,2,14197,1,13


In [19]:
test_merged = pd.merge(test_dropped, customer_features_combined1, on='user_id')
test_merged

Unnamed: 0,order_id,user_id,eval_set,order_dow,order_hour_of_day,days_since_prior_order,order_frequency,total_products,reorder_ratio,aisle_diversity,avg_days_between_orders,avg_order_size,unique_products_ordered,cluster,product_id,reordered,order_number
0,2774568,3,test,5,15,11.0,12,88,0.625000,16,11.487179,7.333333,33,3,1005,0,10
1,2774568,3,test,5,15,11.0,12,88,0.625000,16,11.487179,7.333333,33,3,23650,0,2
2,2774568,3,test,5,15,11.0,12,88,0.625000,16,11.487179,7.333333,33,3,23650,1,12
3,2774568,3,test,5,15,11.0,12,88,0.625000,16,11.487179,7.333333,33,3,17668,0,1
4,2774568,3,test,5,15,11.0,12,88,0.625000,16,11.487179,7.333333,33,3,17668,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11792493,803273,206208,test,5,11,4.0,49,677,0.707533,63,7.442105,13.816327,198,0,39578,0,8
11792494,803273,206208,test,5,11,4.0,49,677,0.707533,63,7.442105,13.816327,198,0,39578,1,12
11792495,803273,206208,test,5,11,4.0,49,677,0.707533,63,7.442105,13.816327,198,0,39578,1,14
11792496,803273,206208,test,5,11,4.0,49,677,0.707533,63,7.442105,13.816327,198,0,48865,0,42


In [20]:
# Define the features to use
features_to_use = [
    'order_frequency', 'total_products', 'reorder_ratio', 'aisle_diversity',
    'avg_days_between_orders', 'avg_order_size', 'unique_products_ordered',
    'cluster', 'order_number',
    'order_hour_of_day', 'days_since_prior_order'
]

In [21]:
X_train = train_merged[features_to_use]
X_train

Unnamed: 0,order_frequency,total_products,reorder_ratio,aisle_diversity,avg_days_between_orders,avg_order_size,unique_products_ordered,cluster,order_number,order_hour_of_day,days_since_prior_order
0,10,59,0.694915,12,20.259259,5.900000,18,1,1,8,14.0
1,10,59,0.694915,12,20.259259,5.900000,18,1,2,8,14.0
2,10,59,0.694915,12,20.259259,5.900000,18,1,3,8,14.0
3,10,59,0.694915,12,20.259259,5.900000,18,1,4,8,14.0
4,10,59,0.694915,12,20.259259,5.900000,18,1,5,8,14.0
...,...,...,...,...,...,...,...,...,...,...,...
20641986,13,129,0.472868,41,20.275862,9.923077,68,2,1,14,30.0
20641987,13,129,0.472868,41,20.275862,9.923077,68,2,3,14,30.0
20641988,13,129,0.472868,41,20.275862,9.923077,68,2,6,14,30.0
20641989,13,129,0.472868,41,20.275862,9.923077,68,2,13,14,30.0


In [22]:
y_train = train_merged['reordered']
y_train

0           0
1           1
2           1
3           1
4           1
           ..
20641986    0
20641987    1
20641988    0
20641989    1
20641990    0
Name: reordered, Length: 20641991, dtype: int64

In [25]:
# Convert to DMatrix
d_train = xgb.DMatrix(X_train, y_train)
d_test = xgb.DMatrix(test_merged[features_to_use])

In [26]:
# Set parameters
xgb_params = {
    "objective": "reg:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "max_depth": 6,
    "min_child_weight": 10,
    "gamma": 0.70,
    "subsample": 0.76,
    "colsample_bytree": 0.95,
    "alpha": 2e-05,
    "lambda": 10
}

# Train model
watchlist = [(d_train, "train")]
num_round = 80

bst = xgb.train(params=xgb_params, dtrain=d_train, num_boost_round=num_round, evals=watchlist, verbose_eval=10)

[0]	train-logloss:0.65339
[10]	train-logloss:0.55631
[20]	train-logloss:0.53601
[30]	train-logloss:0.53047
[40]	train-logloss:0.52847
[50]	train-logloss:0.52775
[60]	train-logloss:0.52739
[70]	train-logloss:0.52718
[79]	train-logloss:0.52709


In [46]:
# Predict
preds = bst.predict(d_test)

# Apply threshold
threshold = 0.5
test_merged['pred'] = preds

# Generate submission file
d = dict()
for row in test_merged.itertuples():
    if row.pred > threshold:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in test.order_id:
    if order not in d:
        d[order] = 'None'
    else:
        d[order] = ' '.join(sorted(set(d[order].split()), key=int))

In [47]:
# Prepare submission
sub = pd.DataFrame.from_dict(d, orient='index')
sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']

# Save submission
sub.to_csv('sub1.csv', index=False)

In [48]:
sub1 = pd.read_csv('sub1.csv')
sub1.head(10)

Unnamed: 0,order_id,products
0,2774568,1005 1819 7503 9387 12845 14992 16797 16965 17...
1,2161313,196 1747 10441 11266 12427 14715 27839 30292 3...
2,1980631,6184 9387 13575 13914 22362 41400 46061
3,139655,2452 4217 5212 5450 7088 7948 8518 13176 14678...
4,2940603,4091 9422 10339 14947 17859 18531 19894 22849 ...
5,1192143,34 3162 5077 5479 5973 6292 8424 8503 8518 852...
6,3222866,455 2514 2516 7969 8501 13187 14283 14947 1571...
7,707453,210 694 2838 2846 4605 4942 5212 6111 6384 796...
8,1320132,1090 1529 1654 2426 4406 7367 7673 11079 11223...
9,882556,95 1870 3376 3583 5373 5450 6567 6860 7371 817...
