

# CASE STUDY - Unsupervised Learning


In [1]:
import os
import joblib
import time
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import silhouette_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import BayesianGaussianMixture
from sklearn.svm import SVC
#from imbalanced-learn import imblearn
import imblearn.pipeline as pl
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, SVMSMOTE
    
    
plt.style.use('seaborn')
%matplotlib inline




  
## Outline

1. Create a churn prediction baseline model
2. Use clustering as part of your prediction pipeline
3. Run and experiment to see if re-sampling techniques improve your model


In [2]:
DATA_DIR = os.path.join("D:\\data_science\\Clustering-Case-Study-Local","data")
df = pd.read_csv(os.path.join(DATA_DIR, r"aavail-target.csv"))

## pull out the target and remove uneeded columns
_y = df.pop('is_subscriber')
y = np.zeros(_y.size)
y[_y==0] = 1 
df.drop(columns=['customer_id', 'customer_name'], inplace=True)
df.head()

Unnamed: 0,country,age,subscriber_type,num_streams
0,united_states,21,aavail_premium,23
1,singapore,30,aavail_unlimited,12
2,united_states,21,aavail_premium,22
3,united_states,20,aavail_basic,19
4,singapore,21,aavail_premium,23



Using the train_test_split() function, create a stratified train test split of the data

In [3]:

X_train, X_test, y_train, y_test = train_test_split(df,pd.Series(y), test_size = 0.2, random_state = 10)

In [6]:
import warnings
warnings.filterwarnings('ignore')

## Baseline model

Create a baseline model.  We are going to test whether clustering followed by a model improves the results.  Then, we will test whether re-sampling techniques provide improvements.  Use a pipeline or another method, but create a baseline model given the data. Here is the ColumnTransformer we have used before:

In [4]:
## preprocessing pipeline
numeric_features = ['age', 'num_streams']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_features = ['country', 'subscriber_type']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encod', OrdinalEncoder())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [7]:
# YOUR CODE HERE (Replace the #<> symbols with your code)

from sklearn.linear_model import SGDClassifier

from numpy.random import RandomState as rs

np.random.seed(13)


# Create an instance of a binary classifier. 
clf = SGDClassifier(max_iter=1000, learning_rate = 'constant', eta0 = 0.1, alpha = 0, loss = 'log')


# Create a Pipeline that binds the preprocessing transformer and the classifier estimator.
pipe = Pipeline(steps=[('preprocessor', preprocessor),('classifier', clf)])


# Fit the pipeline to the training data.
pipe.fit(X_train,y_train)

# predict the dependent variable of the test set.
y_pred = pipe.predict(X_test)

#pr = pipe.predict_proba(X_test)
#pr

# Print the f1_score of the prediction.
print("f1_score", round(f1_score(y_test, y_pred, average='binary'), 3))

f1_score 0.626


### KMeans & GMM

In [8]:
class KmeansTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=4):
        self.n_clusters = n_clusters
        self.km = KMeans(n_clusters=self.n_clusters, n_init=20)
        
    def transform(self, X, *_):
        labels = self.km.predict(X)
        return np.hstack((X, labels.reshape(-1, 1)))

    def fit(self, X, y=None, *_):
        self.km.fit(X)
        labels = self.km.predict(X)
        self.silhouette_score = round(silhouette_score(X, labels, metric='mahalanobis'), 3)
        return self

class GmmTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=4):
        self.n_clusters = n_clusters
        self.gmm = BayesianGaussianMixture(n_components=self.n_clusters, covariance_type='full',
                                           max_iter=500, n_init=10, warm_start=True)        
    def transform(self, X,*_):
        probs = self.gmm.predict_proba(X) + np.finfo(float).eps
        return np.hstack((X, probs))
        
    def fit(self, X, y=None, *_):
        self.gmm.fit(X)
        labels = self.gmm.predict(X)
        self.silhouette_score = round(silhouette_score(X, labels, metric='mahalanobis'), 3)
        return self
    

    
## example for kmeans
preprocessor.fit(X_train)
X_train_pre = preprocessor.transform(X_train)    
kt = KmeansTransformer(4)
kt.fit(X_train_pre)
X_train_kmeans = kt.transform(X_train_pre)
print(X_train_pre.shape)
print(X_train_kmeans.shape)   
    
## example for GMM  
preprocessor.fit(X_train)
X_train_pre = preprocessor.transform(X_train)    
gt = GmmTransformer(4)
gt.fit(X_train_pre)
X_train_gmm = gt.transform(X_train_pre)
print(X_train_pre.shape)  
print(X_train_gmm.shape)

(800, 4)
(800, 5)
(800, 4)
(800, 8)


In [9]:

def run_clustering_pipeline(umodel):
    """
    This function evaluates different Pipelines comprised of the preprocessing transfomer,
    a clustering transformer and a classifier estimator.
    INPUT : The name of the clustering transformer : 'gmm' or 'kmeans'
    OUTPUT : The list of f1_scores of the pipeline on the test set for the different number of clusters
    """
    fscores= [] # this list will store the f1_score of the different models that we will train
    for n_clusters in np.arange(3, 8):

        # Create an instance of a binary classifier (The same as the one you trained in the previous question)
        clf = SGDClassifier(max_iter=1000, learning_rate = 'constant', eta0 = 0.1, alpha = 0, loss = 'log')
        
        if umodel == 'gmm':
            # Create an instance of the Gmm transformer with n_clusters clusters
            cluster =   GmmTransformer(n_clusters)
        elif umodel == 'kmeans':
            # Create an instance of the Kmean transformer with n_clusters clusters
            cluster =  KmeansTransformer(n_clusters)
        else:
            raise Exception("invalid unsupervised learning model")
        
        # Create a Pipeline that binds the preprocessing transformer, the clustering transformer and the classifier estimator
        pipe = pipe = Pipeline(steps = [
                    ('preprocessor', preprocessor),
                    ('clustering', cluster),
                     ('classifier', clf)
                    ])
    
        # Fit the pipeline on training set
        pipe.fit(X_train, y_train)
        
        # Predict the test set
        y_pred = pipe.predict(X_test)
        
        # Compute the f1 score and add this score to the fscores list.
        score = round(f1_score(y_test, y_pred, average='binary'), 3)
        fscores.append(score)
        
    return fscores

## run the different iteration of the model
cp_results = {}
cp_results['kmeans'] = run_clustering_pipeline('kmeans')
cp_results['gmm'] = run_clustering_pipeline('gmm')

## display table of results
df_cp = pd.DataFrame(cp_results)
df_cp["n_clusters"] = [str(i) for i in np.arange(3,8)]
df_cp.set_index("n_clusters", inplace=True)
df_cp.head(n=10)

Unnamed: 0_level_0,kmeans,gmm
n_clusters,Unnamed: 1_level_1,Unnamed: 2_level_1
3,0.563,0.579
4,0.49,0.559
5,0.596,0.5
6,0.514,0.557
7,0.639,0.574


## Add SMOTE

Run an experiment to see if you can you improve on your workflow with the addition of re-sampling techniques? For instance, you can copy the structure of the function created in the previous question and add a re-sampling transformer to the pipeline.

In [12]:

def run_clustering_pipeline(umodel):
    """
    This function evaluates different Pipelines constituated of the preprocessing transfomer,
    a clustering transformer, a re-sampling transformer and a classifier estimator.
    INPUT : The name of the clustering transformer : 'gmm' or 'kmeans'
    OUTPUT : The list of f1_scores of the pipeline on the test set for the different number of clusters.
    """
    fscores= [] # this list will store the f1_score of the different models that we will train
    for n_clusters in np.arange(3, 8):

        # Create an instance of a binary classifier (The same as the one you trained in the previous question)
        clf = SGDClassifier(max_iter=1000, learning_rate = 'constant', eta0 = 0.1, alpha = 0, loss = 'log')
        
        if umodel == 'gmm':
            # Create an instance of the Gmm transformer with n_clusters clusters
            cluster =   GmmTransformer(n_clusters)
        elif umodel == 'kmeans':
            # Create an instance of the Kmean transformer with n_clusters clusters
            cluster =  KmeansTransformer(n_clusters)
        else:
            raise Exception("invalid unsupervised learning model")
        
        smote = SMOTE(random_state=42)
        # Create a Pipeline that binds the preprocessing transformer, the clustering transformer and the classifier estimator
        pipe = pl.Pipeline(steps = [
                    ('preprocessor', preprocessor),
                    ('clustering', cluster),
                    ('smote', smote),
                     ('classifier', clf) ])
    
        # Fit the pipeline on training set
        pipe.fit(X_train, y_train)
        
        # Predict the test set
        y_pred = pipe.predict(X_test)
        
        # Compute the f1 score and add this score to the fscores list.
        score = round(f1_score(y_test, y_pred, average='binary'), 3)
        fscores.append(score)
      
    return(fscores)

## run the different iteration of the model
cp_results = {}
cp_results['kmeans'] = run_clustering_pipeline('kmeans')
cp_results['gmm'] = run_clustering_pipeline('gmm')


## display table of results
df_cp = pd.DataFrame(cp_results)
df_cp["n_clusters"] = [str(i) for i in np.arange(3,8)]
df_cp.set_index("n_clusters",inplace=True)
df_cp.head(n=10)

Unnamed: 0_level_0,kmeans,gmm
n_clusters,Unnamed: 1_level_1,Unnamed: 2_level_1
3,0.571,0.54
4,0.626,0.626
5,0.569,0.615
6,0.621,0.626
7,0.58,0.603
