In [1]:
import time

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from matplotlib import rc
from mpl_toolkits.mplot3d import Axes3D
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.metrics import silhouette_score, mean_squared_error
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant

In [2]:
%matplotlib inline
pd.set_option('max_columns',500)
font = {'size': 20}
rc('font', **font)
plt.style.use('seaborn-bright')

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Functions

In [7]:
def kmeans(X_km, clusters, random_state=None):
    """Fit K-Means once per candidate cluster count and score every fit.

    Parameters
    ----------
    X_km : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    clusters : iterable of int
        Candidate values for ``n_clusters``, tried in the given order.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``KMeans``. Pass an int to make the sweep reproducible;
        the default keeps the previous (unseeded) behaviour.

    Returns
    -------
    SSE_arr : list of float
        For each entry of ``clusters``, the sum of squared distances from
        every sample to the centroid of the cluster it was assigned to.
    ss_arr : list of float
        Silhouette score for each entry of ``clusters`` greater than 1
        (the score is skipped for a single cluster), so it can be shorter
        than ``SSE_arr``.
    """
    SSE_arr = []
    ss_arr = []
    for n_clusters in clusters:
        # `n_jobs` is intentionally not passed: it was removed from KMeans in
        # scikit-learn 1.0 and raises TypeError there. `n_init=10` pins the
        # historical default so results do not shift with the library version.
        model = KMeans(n_clusters=n_clusters, n_init=10,
                       random_state=random_state)
        clust_dist = model.fit_transform(X_km)  # (n_samples, n_clusters) distances
        clust_num = model.predict(X_km)         # assigned cluster per sample

        # Pick, for every sample, the distance to its own centroid and
        # accumulate the squares in one vectorised step.
        own_dist = clust_dist[np.arange(clust_dist.shape[0]), clust_num]
        SSE_arr.append(float(np.sum(own_dist ** 2)))

        if n_clusters > 1:
            ss_arr.append(silhouette_score(X_km, clust_num))
    return SSE_arr, ss_arr

In [8]:
def silhouette(clusters, ss_arr):
    """Draw a line chart of silhouette score versus number of clusters.

    Parameters
    ----------
    clusters : sequence of int
        Cluster counts; used both as x values and as the x tick positions.
    ss_arr : sequence of float
        Silhouette scores, one per entry of ``clusters``.

    Returns
    -------
    None
        The chart is drawn on a new 12x8 matplotlib figure.
    """
    _, ax = plt.subplots(figsize=(12, 8))
    ax.plot(clusters, ss_arr)
    ax.set_title('Silhouette Scores')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Silhouette Score')
    ax.set_xticks(clusters)
    ax.grid(alpha=0.3)

## Create Customer Table

In [3]:
# Load the pickled DataFrame that the customer table below is built from.
# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
df = pd.read_pickle('data/SRP/clean_data_public_no_crime_lag0.pkl')

In [6]:
# Per-address averages of the continuous metrics. The columns are selected
# *before* aggregating so non-numeric columns of `df` never reach .mean()
# (pandas >= 2.0 raises on them instead of silently dropping them).
metric_cols = ['qty_shrink_per_day', 'shrink_value_per_day', 'POP2010',
               'FD_ratio', 'unemp_rate', 'dens_sq_mile']
cust_table = df.groupby('address1')[metric_cols].mean()

# Descriptive attributes are taken from the first row seen for each address,
# exactly as the previous row-by-row loop did. drop_duplicates keeps that first
# row (including any missing values in it), and the index join lines it up with
# the aggregated metrics. This replaces DataFrame.set_value, which was removed
# in pandas 1.0, and avoids re-filtering `df` once per address.
# The attribute columns keep the dtypes they have in `df`.
attr_cols = ['city', 'state', 'zip_code', 'customer_id']
first_rows = (
    df.drop_duplicates(subset='address1')
      .set_index('address1')[attr_cols]
)
cust_table = cust_table.join(first_rows)



In [None]:
# One-hot encode the two categorical identifiers; other columns pass through.
dummy_cust = pd.get_dummies(cust_table, columns=['customer_id', 'zip_code'])

# Continuous features are the float columns. Indicator columns produced by
# get_dummies are uint8 on older pandas and bool on pandas >= 2.0, so both
# dtypes are accepted; matching uint8 only would silently drop every dummy.
float_mask = (dummy_cust.dtypes == float)
indicator_mask = (dummy_cust.dtypes == np.uint8) | (dummy_cust.dtypes == bool)

# including shrink and not including dummies
shrink_cust_mask = float_mask
shrink_cust_cols = dummy_cust.columns[shrink_cust_mask]

# including dummies but not shrink
dummy_cust_mask = float_mask | indicator_mask
dummy_cust_cols = dummy_cust.columns[dummy_cust_mask]
dummy_cust_cols = list(dummy_cust_cols)
dummy_cust_cols.remove('qty_shrink_per_day')
dummy_cust_cols.remove('shrink_value_per_day')

# including dummies and shrink
all_cust_mask = float_mask | indicator_mask
all_cust_cols = dummy_cust.columns[all_cust_mask]

# NOTE(review): `std_f` is not defined in any visible cell of this notebook,
# so this line raises NameError on a fresh kernel. Presumably it standardises
# the frame and returns a DataFrame (later cells index the result by column
# name) -- restore its definition or import before relying on Run All.
std_cust = std_f(dummy_cust.copy())

## Feature Importance

In [None]:
# feature importance via lasso regression
# Fit an L1-regularised linear model of shrink value on every non-shrink
# feature, then let SelectFromModel keep the features the fitted model rates
# as important.
print('Shape before regularization: ',std_cust[dummy_cust_cols].shape)
lasso = Lasso(alpha=0.01)
lasso.fit(std_cust[dummy_cust_cols], std_cust['shrink_value_per_day'])
model = SelectFromModel(lasso, prefit=True)
std_cust_reduc = model.transform(std_cust[dummy_cust_cols])
print('Shape after regularization: ',std_cust_reduc.shape)
# Show *which* features were kept; the raw transformed matrix has no column
# labels and so does not answer the feature-importance question.
pd.Index(dummy_cust_cols)[model.get_support()].tolist()

In [None]:
#feature importance via linear SVR
# Same selection procedure as the lasso cell, with a linear support-vector
# regressor supplying the coefficients.
print('Shape before regularization: ',std_cust[dummy_cust_cols].shape)
# random_state fixes the solver's internal shuffling so the selected feature
# set is the same on every run.
lsvr = LinearSVR(C=0.01, loss='epsilon_insensitive', dual=True, random_state=0)
lsvr.fit(std_cust[dummy_cust_cols], std_cust['shrink_value_per_day'])
model = SelectFromModel(lsvr, prefit=True)
std_cust_reduc = model.transform(std_cust[dummy_cust_cols])
print('Shape after regularization: ',std_cust_reduc.shape)
# get_support must be *called*: the bare attribute only displays the bound
# method. Map the boolean mask back to column names for a readable result.
pd.Index(dummy_cust_cols)[model.get_support()].tolist()

## Clustering/Segmentation

In [None]:
# Candidate feature subset for segmentation.
# NOTE(review): the sweep below clusters on `all_cust_cols`, not on this list.
pca_cols = [
    'qty_shrink_per_day',
    'shrink_value_per_day',
    'FD_ratio',
    'dens_sq_mile',
    'POP2010',
    'unemp_rate',
]

# Try k = 1..14 and collect the SSE and silhouette score of each fit.
clusters = np.arange(1, 15)
SSE_arr, ss_arr = kmeans(X_km=std_cust[all_cust_cols], clusters=clusters)

# An elbow plot of SSE_arr was sketched here but no `elbow` helper is defined
# in the visible notebook: elbow(clusters, SSE_arr)

# Silhouette scores exist only for k > 1, so plot against that part of the range.
silhouette(clusters[clusters > 1], ss_arr)

In [None]:
# Final segmentation with 5 clusters.
# - `n_jobs` is no longer passed: it was removed from KMeans in scikit-learn
#   1.0 and raises TypeError there.
# - `n_init=10` pins the historical default number of restarts.
# - `random_state` makes the labels (which are pickled further down)
#   reproducible across kernel restarts.
cust_kmeans = KMeans(n_clusters=5, max_iter=10000, n_init=10, random_state=0)
pred = cust_kmeans.fit_predict(std_cust[all_cust_cols])
# Later cells read the label from dummy_cust, so it is attached there.
dummy_cust['cluster'] = pred

In [None]:
# see breakdown of clusters
# Number of rows with a non-null city in each cluster.
print(dummy_cust.groupby('cluster')['city'].count())
# Per-cluster mean of every modelling column. The columns are selected before
# aggregating so the object columns (city, state) never reach .mean(), which
# raises on non-numeric data in pandas >= 2.0.
dummy_cust.groupby('cluster')[list(all_cust_cols)].mean()

### Plot Clusters

In [None]:
# 2-D plot
# Project the standardised features onto the first two principal components
# and colour every point by its cluster label.
cust_pca = PCA(n_components=2)
pcas = cust_pca.fit_transform(std_cust[all_cust_cols])

fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(pcas[:, 0], pcas[:, 1], c=dummy_cust.cluster)
# Title and axis labels so the figure can be read without the code next to it.
ax.set_title('Customer Clusters (2-D PCA projection)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2');

In [None]:
# 3-D Plot
# Same projection as the 2-D plot, keeping the first three components.
cust_pca = PCA(n_components=3)
pcas = cust_pca.fit_transform(std_cust[all_cust_cols])

fig = plt.figure(figsize=(12,12))
# Create the 3-D axes through the figure. Constructing Axes3D(fig) directly no
# longer attaches the axes to the figure on recent matplotlib releases, which
# leaves the output blank.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pcas[:,0], pcas[:,1], pcas[:,2], s=20, alpha=1, c=dummy_cust.cluster)
# Hand-picked view window; points outside these limits are not shown.
ax.set_xlim(left=-10, right=2)
ax.set_ylim(bottom=0, top=10)
ax.set_zlim(top=5)
ax.set_title('Customer Clusters (3-D PCA projection)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3');

## Append Clusters to Customer Table

In [None]:
# Build a one-column frame holding just the cluster label, cast to str and
# indexed like dummy_cust.
# The label was assigned to dummy_cust (not cust_table) after the KMeans fit,
# which is why the earlier variant below is kept commented out:
#cust_table_clust = cust_table[['cluster']].astype(str)
cust_table_clust = dummy_cust[['cluster']].astype(str)
# Print the index, dtype and non-null count as a quick sanity check.
cust_table_clust.info()

In [None]:
# Write the cluster-label frame to disk.
# NOTE(review): despite the file name, only the `cluster` column is saved,
# not the full customer table.
cust_table_clust.to_pickle('data/SRP/cust_table.pkl')