In [None]:
import pandas as pd
import numpy as np
import re
from scipy.stats import ttest_ind
from sklearn.preprocessing import normalize
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Apply the ggplot look to every matplotlib figure drawn in this notebook.
plt.style.use('ggplot')
# Optional global figure size (disabled):
# plt.rcParams['figure.figsize'] =(26,12)



import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including library deprecation warnings; consider narrowing the filter to
# specific categories.
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import seaborn as sns
from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import Image
import graphviz

import pandas_profiling

In [None]:
# Read the due-diligence output file, using CustomerID as the row index.
df_start = pd.read_csv(
    '../data/modified/data_due_diligence_output.csv',
    index_col='CustomerID',
)

# Select Columns to use in Analysis

In [None]:
# Work under the shorter name `df`; this is a second name for the same
# object as df_start, not a copy.
df = df_start
# List the available column labels.
df.columns

In [None]:
# Display the loaded frame for a visual check.
df

In [None]:
# df.TVWatchingHours.value_counts(bins=10)

In [None]:
# Variables that feed the clustering, kept in one list so the selection is
# easy to read and adjust.
clustering_columns = [
    'AgeRanges',
    'PhoneCoTenure',
    'RevenueType',
    'TotalServicesUsed',
    'Top50_Bottom50',
]
cust_df_kmeans = df[clustering_columns]

# Standardize the Data
## Ensures each variable is weighted consistently in the analysis

In [None]:
# Standardize every clustering variable
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Learn the per-column scaling parameters, then apply them to the same data.
scaler.fit(cust_df_kmeans)
cust_kmeans_scaled = scaler.transform(cust_df_kmeans)

# Wrap the scaled array in a DataFrame that reuses the original row index
# (columns are left with their default integer labels).
cust_kmeans_scaled_df = pd.DataFrame(
    data=cust_kmeans_scaled,
    index=cust_df_kmeans.index,
)
cust_kmeans_scaled_df.head()

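# Plot Dendrogram
## Shows how observations merge into clusters, to help judge how many clusters are meaningful
### The longer the vertical lines in the dendrogram, the larger the distance between the clusters being merged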

In [None]:
# Summary statistics of the standardized variables
cust_kmeans_scaled_df.describe()

In [None]:
# # Plot inertia for multiple cluster solutions against the number of clusters
# # fitting multiple k-means algorithms and storing the values in an empty list
# SSE = []
# for cluster in range(2,10):
#     kmeans = KMeans(n_jobs = -1, n_clusters = cluster, init='k-means++')
#     kmeans.fit(cust_kmeans_scaled_df)
#     SSE.append(kmeans.inertia_)

# # converting the results into a dataframe and plotting them
# cust_df_frame = pd.DataFrame({'Cluster':range(2,10), 'SSE':SSE})
# plt.figure(figsize=(12,6))
# plt.plot(cust_df_frame['Cluster'], cust_df_frame['SSE'], marker='o')
# plt.xlabel('Number of clusters')
# plt.ylabel('Inertia')

In [None]:
# Show the scaled values as returned by the scaler (array form, before they
# were wrapped in a DataFrame).
cust_kmeans_scaled

In [None]:
# plt.figure(figsize=(10, 7))  
# plt.title("Dendrograms - Standardized Population")  
# dendr = shc.dendrogram(shc.linkage(cust_kmeans_scaled, method='ward'))

In [None]:
# Configure k-means with k-means++ initialisation and THREE clusters.
# The variable keeps the name `kmeans_5` so that later cells referring to it
# continue to work; the model itself has n_clusters = 3.
# random_state is fixed (same value as the elbow-analysis model further down)
# so the cluster assignments are reproducible from run to run.
kmeans_5 = KMeans(n_clusters=3, init='k-means++', random_state=0)

# Fit the k-means model on the standardized data
kmeans_5.fit(cust_kmeans_scaled_df)

In [None]:
# Inertia is the sum of squared distances from each sample to the centroid of
# the cluster it was assigned to (lower means tighter clusters).
# Read it from the fitted model
kmeans_5.inertia_

In [None]:
# Assign every (scaled) observation to its nearest fitted centroid.
pred_clus5 = kmeans_5.predict(cust_kmeans_scaled_df)

In [None]:
# Work on an independent copy: wrapping with pd.DataFrame(...) alone may share
# data with cust_kmeans_scaled_df, so adding the label column here could leak
# into the scaled feature table that the model is fitted on.
cust_df_frame_2 = cust_kmeans_scaled_df.copy()
cust_df_frame_2['cluster'] = pred_clus5
# Number of observations assigned to each cluster
cust_df_frame_2['cluster'].value_counts()

In [None]:
# head() shows the structure of the resulting data frame:
# the scaled clustering variables (integer-named columns) come first and
# the last column, `cluster`, holds the k-means label of each observation.
# The model was built with n_clusters = 3, so the labels run from 0 to 2.
cust_df_frame_2.head(5)

In [None]:
# Add cluster5_2: the same cluster indicator shifted to be 1-based
# (cluster + 1), i.e. 1 to 3 for the three-cluster model.
cust_df_frame_2['cluster5_2'] = cust_df_frame_2['cluster'] + 1
cust_df_frame_2.head(5)

In [None]:
# Print the label the fitted model stored for each training observation
# (0-based cluster ids).
print(kmeans_5.labels_)

In [None]:
# Cluster sizes, this time using the 1-based indicator.
cust_df_frame_2['cluster5_2'].value_counts()

In [None]:
# Plot the cluster centres as red crosses. Only the first two coordinates of
# each centre (columns 0 and 1) are drawn; the remaining dimensions are not shown.
plt.scatter(kmeans_5.cluster_centers_[:, 0], kmeans_5.cluster_centers_[:, 1], c='red', marker='x')

In [None]:
# Calculate silhouette_score
from sklearn.metrics import silhouette_score
# Score the fitted model (`kmeans_5`) on the scaled features only.
# cust_df_frame_2 also carries the cluster-label columns, which would feed the
# labels back into the distance computation, and no model named `kmeans`
# is defined in the active cells.
print(silhouette_score(cust_kmeans_scaled, kmeans_5.labels_))

In [None]:
# Import the KElbowVisualizer method
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Seeded, unfitted K-Means estimator that the visualizer refits for each k
model = KMeans(random_state=0)

# Compare candidate cluster counts taken from k=(2,6) using the silhouette
# metric; fit timings are not drawn
visualizer = KElbowVisualizer(model, k=(2,6), metric='silhouette', timings=False)

# Fit on the scaled features only: cust_df_frame_2 also contains the
# `cluster` / `cluster5_2` label columns, and clustering on the labels
# themselves would bias the scores towards the current solution.
visualizer.fit(cust_kmeans_scaled)
visualizer.poof()

In [None]:
# Put the clustering frame (scaled values plus cluster labels) side by side
# with the full data set, aligned on the shared row index
cust_df_clus5_ALL = pd.concat(
    objs=[cust_df_frame_2, df],
    axis='columns',
)
cust_df_clus5_ALL.shape

In [None]:
# cust_df_clus5_ALL.info()

In [None]:
# Cross-tabulate cluster membership (rows) against RevenueType (columns).
# normalize='columns' expresses each column as proportions; margins=True adds
# an 'All' margin.
pd.crosstab(cust_df_clus5_ALL['cluster5_2'],cust_df_clus5_ALL['RevenueType'],margins=True, normalize = 'columns')

In [None]:
# Pull the 1-based cluster label out of the clustering frame as a one-column
# DataFrame that keeps the original row index
cust_df_kmeans_cluster5_2 = cust_df_frame_2['cluster5_2'].to_frame()
cust_df_kmeans_cluster5_2.head()

In [None]:
# plt.figure(figsize=(10, 7))  
# plt.title("Dendrograms - Non-Normalized Population")  
# dendr = shc.dendrogram(shc.linkage(cust_kmeans_scaled, method='ward'))

In [None]:
# Build the modelling table: the cluster label first, followed by the
# (unscaled) variables that were used for clustering
cust_df_clus5_rf = pd.concat(
    [cust_df_kmeans_cluster5_2, cust_df_kmeans],
    axis='columns',
)

In [None]:
# Check the cluster distribution after extracting and concatenating
cust_df_clus5_rf['cluster5_2'].value_counts()

In [None]:
def split_data(df, train_perc = 0.8, random_state=None):
    """Randomly partition ``df`` into a training part and a test part.

    Every row is assigned to the training part independently with probability
    ``train_perc``, so the split sizes are approximate rather than exact.
    The input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split.
    train_perc : float, default 0.8
        Probability that a row lands in the training part.
    random_state : int or None, default None
        Seed for a private random generator. ``None`` keeps the previous
        behaviour of drawing from numpy's global random state.

    Returns
    -------
    dict
        ``{'train': DataFrame, 'test': DataFrame}``. Both frames carry a
        boolean ``train`` column marking the part the row belongs to.
    """
    if random_state is None:
        draws = np.random.rand(len(df))
    else:
        draws = np.random.RandomState(random_state).rand(len(df))

    # assign() returns a new frame, so the caller's DataFrame does not gain a
    # 'train' column as a side effect.
    flagged = df.assign(train=draws < train_perc)
    in_train = flagged['train']
    return {'train': flagged[in_train], 'test': flagged[~in_train]}

In [None]:
# Try the helper with an 80/20 split of the clustering variables.
# NOTE(review): the returned dict is only displayed, not stored; the split
# used for modelling is built separately in the following cell.
split_data(cust_df_kmeans,0.8)

In [None]:
# Draw one uniform number per row from a seeded generator so that the 80/20
# split, and therefore the accuracy and confusion matrix computed from it,
# is the same on every run.
split_rng = np.random.RandomState(0)
probs = split_rng.rand(len(cust_df_clus5_rf))
# Rows with a draw below 0.8 go to training, the rest to testing
training_set = probs < 0.8
test_set = (probs>=0.8)

cust_df_kmeans_clus_5_trg = cust_df_clus5_rf[training_set]
cust_df_kmeans_clus_5_tst = cust_df_clus5_rf[test_set]

In [None]:
# Separate predictors from the target (cluster5_2) in the training data.
# `columns=` is used instead of a positional axis argument, which
# DataFrame.drop no longer accepts in pandas 2.0.
cust_df_kmeans_clus_5_trg2 = cust_df_kmeans_clus_5_trg.drop(columns='cluster5_2')
# Target kept as a one-column DataFrame
cust_df_kmeans_clu_5_Y = pd.DataFrame(cust_df_kmeans_clus_5_trg['cluster5_2'])
cust_df_kmeans_clus_5_trg2.head()

In [None]:
# Separate predictors from the target (cluster5_2) in the testing data.
# `columns=` is used instead of a positional axis argument, which
# DataFrame.drop no longer accepts in pandas 2.0.
cust_df_kmeans_clus_5_tst2 = cust_df_kmeans_clus_5_tst.drop(columns='cluster5_2')
# Target kept as a one-column DataFrame
cust_df_kmeans_clu_5_Y_tst = pd.DataFrame(cust_df_kmeans_clus_5_tst['cluster5_2'])
cust_df_kmeans_clus_5_tst2.head()

In [None]:
# Peek at the training target (one column: cluster5_2).
cust_df_kmeans_clu_5_Y.head()

In [None]:
# (rows, columns) of the training predictors, target column already removed.
cust_df_kmeans_clus_5_trg2.shape

In [None]:
# (rows, columns) of the test split.
# NOTE(review): this is the frame that still includes the cluster5_2 target
# column, so its column count is one higher than the training predictors';
# confirm whether cust_df_kmeans_clus_5_tst2 was intended.
cust_df_kmeans_clus_5_tst.shape

In [None]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; random_state is fixed so that the fitted model,
# its accuracy and its feature importances are repeatable.
clf_5 = RandomForestClassifier(n_estimators=100, random_state=0)

# Train on the training predictors. The target is stored as a one-column
# DataFrame while scikit-learn expects 1-d labels, so it is flattened first.
clf_5.fit(cust_df_kmeans_clus_5_trg2, cust_df_kmeans_clu_5_Y.values.ravel())

# Predict the cluster of every held-out row
cust_df_kmeans_clu_5_Y_pred = clf_5.predict(cust_df_kmeans_clus_5_tst2)

In [None]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Share of held-out rows whose predicted cluster matches the k-means label
test_accuracy = metrics.accuracy_score(
    cust_df_kmeans_clu_5_Y_tst,
    cust_df_kmeans_clu_5_Y_pred,
)
print("Accuracy:", test_accuracy)

In [None]:
# Column labels of the training predictors, as a plain Python list
feature_names = cust_df_kmeans_clus_5_trg2.columns.tolist()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Importance of each predictor in the fitted random forest, largest first
feature_imp = pd.Series(clf_5.feature_importances_,index = feature_names).sort_values(ascending=False)
print(feature_imp)
# Horizontal bar chart: one bar per feature, length = importance score
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
# No plt.legend() here: none of the bars carries a label, so a legend would be
# empty and matplotlib would only emit a "no handles with labels" warning.
plt.show()

In [None]:
# Confusion matrix on the held-out rows: actual cluster vs. predicted cluster
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(
    y_true=cust_df_kmeans_clu_5_Y_tst,
    y_pred=cust_df_kmeans_clu_5_Y_pred,
)
print(conf_mat)

In [None]:
# Rebind `df` to the combined frame (scaled values, cluster labels and the
# original columns). From here on `df` no longer refers to the raw input.
df = cust_df_clus5_ALL

In [None]:
# Load the due-diligence output again as the base for the exported results,
# indexed by CustomerID
df_end = pd.read_csv(
    '../data/modified/data_due_diligence_output.csv',
    index_col='CustomerID',
)

In [None]:
# Take only the 1-based cluster label and attach it to the freshly loaded
# data, matching rows on the index
cluster_labels = df['cluster5_2']
df = df_end.join(cluster_labels)
df

In [None]:
# Write the original data plus the cluster label to the results folder
df.to_csv(path_or_buf='../data/results/segmentation_results.csv')