<a href="https://colab.research.google.com/github/mvfolino68/NLP_CFPB_complaints_notebook/blob/master/CFPB%20Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import modules

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [0]:
# Standard library
import itertools
import os
import string

# Plotting / numerical stack
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from mpl_toolkits.mplot3d import Axes3D

# scikit-learn
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
# ENGLISH_STOP_WORDS is taken from ``sklearn.feature_extraction.text``: the
# ``sklearn.feature_extraction.stop_words`` module used previously was
# deprecated and is no longer importable in current scikit-learn releases.
from sklearn.feature_extraction.text import (
    ENGLISH_STOP_WORDS,
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.metrics import confusion_matrix, silhouette_samples, silhouette_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, Normalizer

# Plotly (offline mode)
from plotly.offline import download_plotlyjs, init_notebook_mode, plot
# Star import kept so that every graph object name that was previously in
# scope stays available to the rest of the notebook.
from plotly.graph_objs import *  # noqa: F401,F403

# NLTK
import nltk
from nltk.stem import WordNetLemmatizer

# Side effects, in the same order as before: enable plotly notebook output,
# then fetch the WordNet corpus needed by WordNetLemmatizer.
init_notebook_mode()
nltk.download('wordnet')

print(__doc__)


Import data

In [0]:
# Download the complaints CSV export from data.consumerfinance.gov and load
# it into a DataFrame, decoding the text as UTF-8 (requires network access).
df = pd.read_csv('https://data.consumerfinance.gov/api/views/s6ew-h6mp/rows.csv?accessType=DOWNLOAD', encoding='utf-8')

Create cleaning function

In [0]:
# Create the word lemmatizer object once; text_process() reuses it for every word
lemmatizer = WordNetLemmatizer() 

def text_process(mess):
    """
    Clean one complaint narrative and return its list of word tokens.

    Takes in a string of text (or an iterable of already-split words),
    then performs the following:
    1. Remove all punctuation characters
    2. Remove every occurrence of the characters 'X', 'M' and 'T'
    3. Remove all stopwords
    4. Lemmatize the remaining words
    5. Remove any non-alphabetic words and any lemma that is a stopword
    6. Return a list of the cleaned words

    Parameters
    ----------
    mess : str or iterable of str
        Raw text, or a sequence of tokens from an earlier call.

    Returns
    -------
    list of str
        Cleaned, lemmatized tokens in their original order.
    """
    # A pre-tokenised document is re-joined with spaces first; joining it
    # with '' (as the character pass below does) would glue every word of
    # the document into one single token.
    if not isinstance(mess, str):
        mess = ' '.join(mess)

    # Filter punctuation AND the letters X/M/T in the same pass.  Previously
    # the second filter re-read the raw text, so punctuation was never
    # actually removed.
    unwanted_chars = set(string.punctuation) | {'X', 'M', 'T'}
    cleaned_text = ''.join(char for char in mess if char not in unwanted_chars)

    clean_words = []
    for word in cleaned_text.split():
        if word.lower() in ENGLISH_STOP_WORDS:
            continue
        lemma = lemmatizer.lemmatize(word)
        lowered = lemma.lower()
        # Lemmatization can itself produce a stopword, hence the second check.
        if lowered.isalpha() and lowered not in ENGLISH_STOP_WORDS:
            clean_words.append(lemma)
    return clean_words

Remove nulls, keep one bank's complaints and clean the narratives

In [0]:
# Company whose complaints are analysed.
bank_name = 'M&T BANK CORPORATION'

# Keep only complaints that carry narrative text.  Stripping alone does not
# turn a blank string into a null, so empty narratives are excluded
# explicitly as well as missing ones.
_narratives = df['Consumer complaint narrative'].str.strip()
df = df[_narratives.notna() & (_narratives != '')]

# regex=False: match the company name literally.
# na=False:    a missing company name counts as "no match" instead of
#              producing an NA mask that cannot be used for indexing.
# .copy():     df2 gets new columns below; working on an independent frame
#              avoids assigning into a slice of df.
df2 = df[df['Company'].str.contains(bank_name, regex=False, na=False)].copy()

# Replace each narrative by its list of cleaned tokens.
df2['Consumer complaint narrative'] = df2['Consumer complaint narrative'].apply(text_process)
df2.head()


In [0]:
# complaint_train is a second name for df2 (same object, not a copy); the
# clustering results are written onto it later.
complaint_train = df2


def _as_tokens(doc):
    """Return the tokens of *doc*, cleaning it only if it is still raw text.

    The narrative column has already been tokenised with text_process(), so
    running text_process() on those lists again would join all words of a
    document into a single token.  Token lists are therefore passed through
    unchanged; raw strings are still cleaned.
    """
    return text_process(doc) if isinstance(doc, str) else list(doc)


# TF-IDF over the cleaned tokens:
#   max_df=0.5        ignore terms present in more than half the documents
#   min_df=2          ignore terms present in fewer than two documents
#   max_features=1500 keep only the 1500 most frequent remaining terms
# ``stop_words`` is not passed: scikit-learn ignores it when ``analyzer`` is
# a callable, and stopwords are already removed by text_process().
vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=1500,
                             min_df=2,
                             analyzer=_as_tokens,
                             use_idf=True)

X = vectorizer.fit_transform(df2['Consumer complaint narrative'].tolist())

Perform Matrix Decomposition

In [0]:
print("Performing dimensionality reduction using LSA")

# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, the normalization is applied again after the SVD step.

# Reduce to 3 dimensions so the documents can be shown in a 3D scatter plot.
dimensions = 3

# random_state is fixed (like the KMeans seed used later) so that the
# projection, and therefore the cluster assignments derived from it, are
# reproducible between runs.
svd = TruncatedSVD(n_components=dimensions, random_state=0)

# copy=False lets the normalizer rescale the SVD output in place.
normalizer = Normalizer(copy=False)

lsa = make_pipeline(svd, normalizer)

# One row per complaint, one column per LSA dimension.
x_transformed = lsa.fit_transform(X)

Plot in 3D

In [0]:
# Scatter plot of the three LSA components.
fig = plt.figure()
# Create the 3D axes through the figure so they are attached to it;
# instantiating Axes3D(fig) directly no longer adds the axes to the figure in
# current matplotlib releases, which leaves the plot empty.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x_transformed[:, 0], x_transformed[:, 1], x_transformed[:, 2])

Do the Clustering

In [0]:
# Candidate numbers of clusters; one figure and one 'Cluster_of_<k>' column
# is produced for each.
range_n_clusters = [2, 3, 4, 5, 6]

# The LSA coordinates do not depend on the number of clusters, so they are
# stored on the frame once instead of on every loop iteration.
complaint_train.loc[:, 'x1'] = x_transformed[:, 0]
complaint_train.loc[:, 'x2'] = x_transformed[:, 1]
complaint_train.loc[:, 'x3'] = x_transformed[:, 2]

# Vocabulary in column order of the TF-IDF matrix.  Newer scikit-learn
# releases provide get_feature_names_out() and dropped get_feature_names();
# older ones only have the latter.
_feature_names = (getattr(vectorizer, 'get_feature_names_out', None)
                  or vectorizer.get_feature_names)
terms = _feature_names()

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot.
    # The silhouette coefficient can range from -1 to 1; the axis starts at
    # -0.1 so that it agrees with the tick at -0.1 and leaves room for the
    # cluster labels drawn at x = -0.05.
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(x_transformed) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a fixed random seed
    clusterer = KMeans(n_clusters=n_clusters, random_state=0, init='k-means++', max_iter=100, n_init=1)
    cluster_labels = clusterer.fit_predict(x_transformed)
    # Stored cluster ids are 1-based (KMeans labels are 0-based).
    complaint_train.loc[:, 'Cluster_of_{}'.format(n_clusters)] = cluster_labels + 1

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(x_transformed, cluster_labels)

    print()
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    print("Top terms per cluster:")

    # Map the centroids from LSA space back to term space and rank the terms
    # of each centroid from highest to lowest weight.
    original_space_centroids = svd.inverse_transform(clusterer.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]

    for i in range(n_clusters):
        print("Cluster " +str(i+1)+":", end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(x_transformed, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the
        # middle; 1-based to match the printed output and the stored
        # 'Cluster_of_<k>' columns.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i + 1))

        # Compute the new y_lower for next plot (10 rows of blank space)
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed (first two LSA components)
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(x_transformed[:, 0], x_transformed[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    # Write the (1-based) cluster number inside each centre marker.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % (i + 1), alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("Customer Complaint Text Data in 2D Space")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("KMeans clustering on M&T Bank Customer Complaints "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

In [0]:
import plotly
import plotly.offline
import plotly.graph_objs as go
import pandas as pd

# The figure is rendered with plotly.offline only, so no online credentials
# are configured here.  The former set_credentials_file(..., api_key='') call
# wrote an empty API key into the user's plotly credentials file and is not
# needed for offline plotting.
# ``plotly.plotly`` is only importable with old plotly releases; the name
# ``py`` is still bound so that code referring to it does not hit a NameError.
try:
    import plotly.plotly as py
except ImportError:
    py = None


def _hover_text(narrative):
    """Return *narrative* as one string (token lists are joined by spaces)."""
    return narrative if isinstance(narrative, str) else ' '.join(narrative)


# One 3D scatter trace per cluster of the 3-cluster solution, in ascending
# cluster order, positioned by the stored LSA coordinates x1/x2/x3.
traces = []
for i in sorted(complaint_train['Cluster_of_3'].unique()):
    df_by_cluster = complaint_train[complaint_train['Cluster_of_3'] == i]
    traces.append(go.Scatter3d(
        x=df_by_cluster['x1'],
        y=df_by_cluster['x2'],
        z=df_by_cluster['x3'],
        # Hover text must be a string per point; the narrative column holds
        # token lists after cleaning.
        text=df_by_cluster['Consumer complaint narrative'].map(_hover_text),
        mode='markers',
        opacity=0.9,
        marker={
            'size': 10,
            'line': {'width': 0.5, 'color': 'white'}
        },
        name='Cluster #'+str(i)
    ))
data = traces
layout = go.Layout(
    scene={
        'xaxis': {'type': 'linear', 'title': 'X axis'},
        'yaxis': {'title': 'Y axis'},
        'zaxis': {'type': 'linear', 'title': 'Z axis'}
    },
    margin={'l': 40, 'b': 40, 't': 10, 'r': 10},
    legend={'x': 1, 'y': 1},
    hovermode='closest',
    title=go.layout.Title(
        text='M&T Bank Consumer Complaints Natural Language Processing and Machine Learning Case',
        xref='paper',
        x=.5, y=1)
)
fig = go.Figure(data=data, layout=layout)
# Write the figure to a local HTML file via plotly's offline mode.
plotly.offline.plot(fig, filename='test')