# Clustering analysis using PCA


What you will find in here:
- [Clustering Analysis](#Clustering)
- [Variance](#Variance)

In [1]:
#Load all my imports necessary 

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import random

from xgboost import XGBClassifier

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, f1_score
from sklearn.cluster import KMeans, DBSCAN

%matplotlib inline

In [None]:
# Read the raw CSV tables from ../data.
# Lookup tables first, then orders and their line items.
aisles = pd.read_csv("../data/aisles.csv")
departments = pd.read_csv("../data/departments.csv")
products = pd.read_csv("../data/products.csv")

orders = pd.read_csv("../data/orders.csv")
order_products_prior = pd.read_csv("../data/order_products__prior.csv")
order_products_train = pd.read_csv("../data/order_products__train.csv")

# presumably the frame holding the engineered features used further down — TODO confirm
orders_train_all = pd.read_csv("../data/orders_train_new.csv")

# Randomly selecting some of the data

PCA wouldn't run on the original dataframe because of its number of rows and columns, so I took a smaller random subset of the dataframe that holds all my engineered features.

In [None]:
# Draw a reproducible random subset of users (and their orders) so that the
# PCA further down runs on a manageable number of rows.
SAMPLE_SIZE = 25000
SAMPLE_SEED = 42

#subsetting from the prior eval set
priors = orders.loc[orders['eval_set'] == 'prior']

#looking at unique user ID's
prior_users = priors['user_id'].unique().tolist()

#gathering a smaller, random sample from the original set
#in order to conduct a PCA and have it not take 10 million years
# A dedicated seeded generator makes the sample identical on every run without
# touching the global `random` state. min() guards against the ValueError that
# sample() raises when asked for more items than the population contains.
sampler = random.Random(SAMPLE_SEED)
random_prior = sampler.sample(range(len(prior_users)),
                              min(SAMPLE_SIZE, len(prior_users)))

#new users and the random subset mapped onto each other
# random_prior holds positions into prior_users; users_prior holds the IDs.
users_prior = [prior_users[i] for i in random_prior]

#keeping only the orders that belong to the sampled users
# The filter must use the sampled user IDs (users_prior). Filtering on
# random_prior would compare user_id against list positions instead.
sample_orders = orders[orders['user_id'].isin(users_prior)]

#creating a list of orders within this new dataframe
orders_list = sample_orders['order_id'].unique().tolist()

In [None]:
# (rows, columns) of the full frame read from orders_train_new.csv, before any subsetting
orders_train_all.shape

In [None]:
# Keep only the rows of the full frame whose order_id appears in orders_list,
# then show the size of the resulting subset.
priors_sample = orders_train_all.loc[
    lambda frame: frame["order_id"].isin(orders_list)
]
priors_sample.shape

In [None]:
# Write the sampled subset to disk.
# NOTE(review): to_csv also writes the DataFrame index as an extra unnamed
# column by default; pass index=False if readers of this file should not get
# it — confirm with whatever consumes priors_sample.csv.
priors_sample.to_csv('../data/priors_sample.csv')

<a id='Clustering'></a>
# Now to the clustering analysis

In [None]:
# Columns used as model inputs, one per line for easier diffing.
features = [
    'product_id',
    'aisle_id',
    'department_id',
    'order_id',
    'add_to_cart_order',
    'user_id',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
    'average_orders',
    'average_order_dow',
    'average_order_hour',
    'weekend',
    'rate_reordered',
    'aisle_cat',
    'dept_cat',
]

# Feature matrix and target taken from the sampled frame
X = priors_sample[features]
y = priors_sample['reordered']

In [None]:
# Split into train and test sets. Stratifying on y keeps the target's class
# proportions the same in both sets; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test data never influences the scaling.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [None]:
#instantiate and fit a PCA model
# PCA() is created with no arguments, so the number of components is whatever
# the library default gives.
pca = PCA()
# Fit the components on the scaled training split and project it in one step...
X_train_pca = pca.fit_transform(X_train_sc)
# ...then project the scaled test split with those same fitted components.
X_test_pca = pca.transform(X_test_sc)

In [None]:
# Train XGBoost on the PCA-transformed training split
xg = XGBClassifier(
    max_depth=12,
    min_child_weight=3,
    random_state=42,
)
xg.fit(X_train_pca, y_train)

# Predictions for the PCA-transformed test split
y_pred = xg.predict(X_test_pca)

In [None]:
# Report the test-set F1 score as a percentage with two decimals.
# The value is deliberately not passed through round() first: rounding to a
# whole number before "%.2f" made the output always end in ".00".
print("F1 Score: %.2f%%" % (f1_score(y_test, y_pred) * 100))

<a id='Variance'></a>
# Explained Variance

- 83.3% of our variance can be explained by the first 10 principal components

In [None]:
# Fraction of the variance attributed to each fitted PCA component,
# and its running total across components.
var_exp = pca.explained_variance_ratio_
cum_var_exp = var_exp.cumsum()

print(f'Explained variance: {np.round(var_exp,3)}')
print(f'Cumulative explained variance: {np.round(cum_var_exp,3)}')

So, it turns out that running XGBoost on the PCA components doesn't do much better than XGBoost alone.

In [None]:
#from lesson 8.4
# Line plot of cumulative explained variance against component index.
fig, ax = plt.subplots(figsize=(9, 7))

# Plot the cumulative explained variance (components are numbered from 0)
component_number = range(len(cum_var_exp))
ax.plot(component_number, cum_var_exp, lw=3)

# Add dashed horizontal reference lines at y=0 and y=1 (0% and 100% of the variance)
ax.axhline(y=0, linewidth=1, color='grey', ls='dashed')
ax.axhline(y=1, linewidth=1, color='grey', ls='dashed')

# Set the x and y axis limits
ax.set_xlim([-1, 26])
ax.set_ylim([-0.05, 1.05])

# Label the axes
ax.set_ylabel('cumulative variance explained', fontsize=16)
ax.set_xlabel('component', fontsize=16)

# Make the tick labels bigger.
# tick_params replaces the per-tick `tick.label.set_fontsize(12)` loops:
# `Tick.label` was deprecated and is gone from newer Matplotlib releases,
# where those loops raise AttributeError.
ax.tick_params(axis='both', which='major', labelsize=12)

# Add title
ax.set_title('Component vs Cumulative variance explained\n', fontsize=20)

plt.show()