# MovieLens Exploratory Dataset
**Purpose:** 

This notebook explores the distribution of the input MovieLens dataset and preprocesses it for use in a recommender system (RecSys).

**Methodology:**

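The notebook assumes input from the [MovieLens Dataset](https://grouplens.org/datasets/movielens/). It explores the dataset using basic statistics and examines the properties of hidden user clusters. It outputs processed datasets in the following formats: a user-item rating matrix, a matrix of average genre ratings per user, and a weighted version of the genre rating matrix.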

**Author:**

Prajna Soni (@prajnasoni)

In [1]:
# Import the libraries we will be using

#import os
#import math
import logging
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
#from IPython.display import Image
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

%matplotlib inline

In [2]:
# Load the MovieLens (small) tables from the local CSV copies.
#
# Alternative loader through lenskit, kept for reference:
# from lenskit.datasets import ML100K
# movielens = ML100K('ml-100k')
# ratings = movielens.ratings
# movies = movielens.movies

ratings = pd.read_csv(filepath_or_buffer="../datasets/movielens-small/ratings.csv")
movies = pd.read_csv(filepath_or_buffer="../datasets/movielens-small/movies.csv")

In [None]:
# Timestamps are given in seconds since January 1, 1970.
# Shift them so that the earliest rating in the dataset sits at 0 seconds.
min_t = ratings['timestamp'].min()
ratings['timestamp'] = ratings['timestamp'].sub(min_t)

In [3]:
# MOVIELENS DATASET CLASS - easy access functions to process movielens dataset
class movielens:
    """Helper around the MovieLens ``movies`` and ``ratings`` tables.

    The methods derive per-user matrices (user-item ratings, average genre
    ratings, genre rating counts, weighted genre ratings) and SVD-reduced
    versions of them. Every method reads only the tables stored on the
    instance, never notebook-level variables.
    """

    def __init__(self, movies, ratings):
        """Store the input tables and work out the genre list.

        movies  -- DataFrame with 'movieId', 'title' and '|'-separated 'genres' columns
        ratings -- DataFrame with 'userId', 'movieId' and 'rating' columns
        """
        self.movies = movies
        self.ratings = ratings

        # Enable logging
        self._logger = logging.getLogger(__name__)

        # Identify genres in dataset (sets self.movies2 and self.genres)
        self.get_dummy_genres()

    def __str__(self):
        return 'MovieLens Dataset'

    def get_genres(self):
        """Return the list of genre names found in the movies table."""
        return self.genres

    def UserItem(self):
        """Return the user (rows) x movie title (columns) rating matrix.

        Movies a user has not rated are filled with 0. The result is also
        stored on ``self.UI_matrix``.
        """
        # Use the instance's own movies table rather than a notebook global.
        rated = self.ratings.merge(self.movies, on='movieId', how='left')
        self.UI_matrix = rated.pivot_table(index='userId', columns='title', values='rating').fillna(0)
        return self.UI_matrix

    def _genre_rows(self, genre):
        """Return the rows of the ratings table whose movie is tagged with ``genre``."""
        # Exact match through the one-hot genre table; avoids treating genre
        # names such as "(no genres listed)" as regular expressions.
        genre_movie_ids = self.movies2.loc[self.movies2[genre] == 1, 'movieId']
        return self.ratings[self.ratings['movieId'].isin(genre_movie_ids)]

    def _per_user_genre_stat(self, stat):
        """Return a user x genre table of ``stat`` ('mean' or 'count') of ratings, 0 where absent."""
        # One row per user present in the ratings table, in ascending userId
        # order, so the result lines up with TotalUserRatings().
        user_ids = pd.Index(self.ratings['userId'].sort_values().unique(), name='userId')
        genres = self.get_genres()
        per_genre = {
            genre: self._genre_rows(genre).groupby('userId')['rating'].agg(stat)
            for genre in genres
        }
        return pd.DataFrame(per_genre, index=user_ids, columns=genres).fillna(0)

    def UserGenreRatings(self):
        """Return each user's mean rating per genre (rounded to 2 decimals, 0 if none).

        The result is also stored on ``self.genre_ratings``.
        """
        self.genre_ratings = self._per_user_genre_stat('mean').round(2)
        return self.genre_ratings

    def w_UserGenreRatings(self):
        """Return the genre ratings weighted by how much of a user's activity is in each genre.

        The weight for (user, genre) is the number of ratings the user gave in
        that genre divided by the user's total number of ratings. The result
        is also stored on ``self.wGR_matrix``.
        """
        # Counts and totals are computed once instead of once per genre.
        weights = self.UserGenreCounts().div(self.TotalUserRatings()['total_ratings'], axis=0)
        # Use this instance's genre ratings instead of a notebook global.
        self.wGR_matrix = self.UserGenreRatings().mul(weights)
        return self.wGR_matrix

    def UserGenreCounts(self):
        """Return the number of ratings per genre per user (0 if none).

        The result is also stored on ``self.genre_counts``.
        """
        self.genre_counts = self._per_user_genre_stat('count')
        return self.genre_counts

    def TotalUserRatings(self):
        """Return a one-column DataFrame ('total_ratings') counting each user's ratings."""
        return self.ratings.groupby('userId')['rating'].count().to_frame('total_ratings')

    def get_dummy_genres(self):
        """Split the '|'-separated genres into one-hot columns.

        Stores the movies table (without 'genres' and 'title') joined with the
        one-hot columns on ``self.movies2``, the genre names on
        ``self.genres``, and returns ``self.movies2``.
        """
        genre_dummies = self.movies['genres'].str.get_dummies(sep='|')
        self.movies2 = pd.concat([self.movies.drop(['genres', 'title'], axis=1), genre_dummies], axis=1)
        # Take the genre names from the dummy columns themselves instead of
        # assuming they are "everything after the first column".
        self.genres = genre_dummies.columns.tolist()
        return self.movies2

    def SVDmatrix(self, n, dataset='UI'):
        """Return the first ``n`` truncated-SVD components of one of the user matrices.

        dataset -- 'UI' (user-item), 'GR' (genre ratings) or 'wGR' (weighted
                   genre ratings). Any other value logs an error and returns None.

        The fitted model is stored on ``self.<dataset>_SVD`` and the reduced
        DataFrame on ``self.<dataset>``. Rows keep the user ids of the source
        matrix as index.
        """
        builders = {
            'UI': self.UserItem,
            'GR': self.UserGenreRatings,
            'wGR': self.w_UserGenreRatings,
        }
        if dataset not in builders:
            self._logger.error("Invalid dataset. Enter 'UI', 'GR' or 'wGR'.")
            return None

        matrix = builders[dataset]()
        # Fixed seed: the randomized solver is otherwise not reproducible.
        svd = TruncatedSVD(n_components=n, random_state=0)
        # Reuse the matrix's own index (user ids) rather than assuming 1..N.
        reduced = pd.DataFrame(svd.fit_transform(matrix), index=matrix.index)
        setattr(self, dataset + '_SVD', svd)
        setattr(self, dataset, reduced)
        return reduced
    

In [None]:
# MOVIELENS DATASET CLASS - easy access functions to process movielens dataset
# NOTE(review): this cell redefines the `movielens` class from the previous
# cell; whichever cell runs last wins. Consider deleting one of the two cells.
class movielens:
    """Helper around the MovieLens ``movies`` and ``ratings`` tables.

    The methods derive per-user matrices (user-item ratings, average genre
    ratings, genre rating counts, weighted genre ratings) and SVD-reduced
    versions of them. Every method reads only the tables stored on the
    instance, never notebook-level variables.
    """

    def __init__(self, movies, ratings):
        """Store the input tables and work out the genre list.

        movies  -- DataFrame with 'movieId', 'title' and '|'-separated 'genres' columns
        ratings -- DataFrame with 'userId', 'movieId' and 'rating' columns
        """
        self.movies = movies
        self.ratings = ratings

        # Enable logging
        self._logger = logging.getLogger(__name__)

        # Identify genres in dataset (sets self.movies2 and self.genres)
        self.get_dummy_genres()

    def __str__(self):
        return 'MovieLens Dataset'

    def get_genres(self):
        """Return the list of genre names found in the movies table."""
        return self.genres

    def UserItem(self):
        """Return the user (rows) x movie title (columns) rating matrix.

        Movies a user has not rated are filled with 0. The result is also
        stored on ``self.UI_matrix``.
        """
        # Use the instance's own movies table rather than a notebook global.
        rated = self.ratings.merge(self.movies, on='movieId', how='left')
        self.UI_matrix = rated.pivot_table(index='userId', columns='title', values='rating').fillna(0)
        return self.UI_matrix

    def _genre_rows(self, genre):
        """Return the rows of the ratings table whose movie is tagged with ``genre``."""
        # Exact match through the one-hot genre table; avoids treating genre
        # names such as "(no genres listed)" as regular expressions.
        genre_movie_ids = self.movies2.loc[self.movies2[genre] == 1, 'movieId']
        return self.ratings[self.ratings['movieId'].isin(genre_movie_ids)]

    def _per_user_genre_stat(self, stat):
        """Return a user x genre table of ``stat`` ('mean' or 'count') of ratings, 0 where absent."""
        # One row per user present in the ratings table, in ascending userId
        # order, so the result lines up with TotalUserRatings().
        user_ids = pd.Index(self.ratings['userId'].sort_values().unique(), name='userId')
        genres = self.get_genres()
        per_genre = {
            genre: self._genre_rows(genre).groupby('userId')['rating'].agg(stat)
            for genre in genres
        }
        return pd.DataFrame(per_genre, index=user_ids, columns=genres).fillna(0)

    def UserGenreRatings(self):
        """Return each user's mean rating per genre (rounded to 2 decimals, 0 if none).

        The result is also stored on ``self.genre_ratings``.
        """
        self.genre_ratings = self._per_user_genre_stat('mean').round(2)
        return self.genre_ratings

    def w_UserGenreRatings(self):
        """Return the genre ratings weighted by how much of a user's activity is in each genre.

        The weight for (user, genre) is the number of ratings the user gave in
        that genre divided by the user's total number of ratings. The result
        is also stored on ``self.wGR_matrix``.
        """
        # Counts and totals are computed once instead of once per genre.
        weights = self.UserGenreCounts().div(self.TotalUserRatings()['total_ratings'], axis=0)
        # Use this instance's genre ratings instead of a notebook global.
        self.wGR_matrix = self.UserGenreRatings().mul(weights)
        return self.wGR_matrix

    def UserGenreCounts(self):
        """Return the number of ratings per genre per user (0 if none).

        The result is also stored on ``self.genre_counts``.
        """
        self.genre_counts = self._per_user_genre_stat('count')
        return self.genre_counts

    def TotalUserRatings(self):
        """Return a one-column DataFrame ('total_ratings') counting each user's ratings."""
        return self.ratings.groupby('userId')['rating'].count().to_frame('total_ratings')

    def get_dummy_genres(self):
        """Split the '|'-separated genres into one-hot columns.

        Stores the movies table (without 'genres' and 'title') joined with the
        one-hot columns on ``self.movies2``, the genre names on
        ``self.genres``, and returns ``self.movies2``.
        """
        genre_dummies = self.movies['genres'].str.get_dummies(sep='|')
        # Drop from the instance's movies table, not the notebook global.
        self.movies2 = pd.concat([self.movies.drop(['genres', 'title'], axis=1), genre_dummies], axis=1)
        # Take the genre names from the dummy columns themselves instead of
        # assuming they are "everything after the first column".
        self.genres = genre_dummies.columns.tolist()
        return self.movies2

    def SVDmatrix(self, n, dataset='UI'):
        """Return the first ``n`` truncated-SVD components of one of the user matrices.

        dataset -- 'UI' (user-item), 'GR' (genre ratings) or 'wGR' (weighted
                   genre ratings). Any other value logs an error and returns None.

        The fitted model is stored on ``self.<dataset>_SVD`` and the reduced
        DataFrame on ``self.<dataset>``. Rows keep the user ids of the source
        matrix as index.
        """
        builders = {
            'UI': self.UserItem,
            'GR': self.UserGenreRatings,
            'wGR': self.w_UserGenreRatings,
        }
        if dataset not in builders:
            self._logger.error("Invalid dataset. Enter 'UI', 'GR' or 'wGR'.")
            return None

        matrix = builders[dataset]()
        # Fixed seed: the randomized solver is otherwise not reproducible.
        svd = TruncatedSVD(n_components=n, random_state=0)
        # Reuse the matrix's own index (user ids) rather than assuming 1..N.
        reduced = pd.DataFrame(svd.fit_transform(matrix), index=matrix.index)
        setattr(self, dataset + '_SVD', svd)
        setattr(self, dataset, reduced)
        return reduced

In [4]:
# Wrap the two DataFrames loaded from the CSV files in a movielens helper object
data = movielens(movies=movies, ratings=ratings)

In [None]:
# First three SVD components of the user-item matrix
data.SVDmatrix(n=3, dataset='UI')

# Distribution of Dataset

In [None]:
# Count how often each rating value (in 0.5 increments) occurs
rating_count = ratings['rating'].value_counts().sort_values(ascending=False)
# Each user's mean rating across all movies rated by that user
avg_user = ratings.groupby('userId')[['rating']].mean()
# Each movie's mean rating, binned into one-star intervals and counted per bin
rating_range = [0,1,2,3,4,5]
avg_movie = ratings.groupby('movieId')[['rating']].mean()
avg_movie = pd.cut(avg_movie['rating'], bins=rating_range, include_lowest=True).value_counts(sort=False)

fig, plts = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Distribution of Dataset')

plts[0].set_title('Distribution of Total Ratings (Count)', fontsize=12)
plts[0].bar(x=rating_count.index, height=rating_count.values, width=0.3)
plts[0].set_xlabel('Rating')
plts[0].set_ylabel('Number of ratings')

# Format the overall mean as a number; str() of the numpy array printed "[3.65...]"
overall_mean = avg_user['rating'].mean()
plts[1].set_title('Mean Ratings per User = {:.2f}/5.00'.format(overall_mean), fontsize=12)
plts[1].plot(avg_user.index, avg_user['rating'])
plts[1].set_ylim(0, 5)
plts[1].set_xlabel('userId')
plts[1].set_ylabel('Mean rating')

plt.show()


In [None]:
# Bar chart of how many movies fall into each mean-rating bin
avg_movie.plot(kind='bar', title='Distribution of Mean Movie Ratings (Count)')

In [None]:
# Number of movies tagged with each genre, read off the one-hot genre table
dummy_genres = data.get_dummy_genres()
genre_counts = [dummy_genres[genre].sum() for genre in data.get_genres()]

plt.figure(figsize=(21, 5))
plt.title('Distribution of movie items across genres', fontsize=15)
plt.bar(data.get_genres(), genre_counts)


# Dataset Creation

In [None]:
# Creation of the different per-user datasets fed to SVD (absolute and weighted).
# Keep the statements in this order: the weighted matrix is derived from the
# genre ratings computed just before it.

# User-Item Interaction Matrix (users x movie titles, 0 where unrated)
dataUI = data.UserItem()

# Average rating for each genre for each user
dataGR = data.UserGenreRatings()

# Weighted genre ratings for each user
# (genre ratings scaled by the share of the user's ratings that fall in each genre)
dataGR_w1 = data.w_UserGenreRatings()

In [None]:
# Preview the first rows only instead of dumping the whole table
dataGR_w1.head()

The sum of weights across a user can be greater than 1, as each movie can have more than one genre.
The weight w1[i,j] is the ratio of the number of movies rated by user[i] in genre[j] to the total number of movies rated by user[i]. With $m$ genres:

$$ \sum_{j=1}^{m} w1[i,j] \geq 1 \quad \forall i $$

$$ w1[i,j] = \frac{UserGenreCounts[i,j]}{TotalUserRatings[i]}$$

In [None]:
# w1 only exists inside w_UserGenreRatings(), so rebuild it here:
# w1[i, j] = (ratings by user i in genre j) / (total ratings by user i)
w1 = data.UserGenreCounts().div(data.TotalUserRatings()['total_ratings'], axis=0)
print("sum of weights for each user:")
w1.sum(axis=1).head()

In [None]:
# Genre pairs compared side by side in each figure
GENRE_PAIRS = [('Action', 'Adventure'), ('Sci-Fi', 'Crime'), ('Thriller', 'Horror')]


def plot_genre_pairs(genre_matrix, label_prefix):
    """Scatter one genre column against another for every pair in GENRE_PAIRS.

    genre_matrix -- user x genre DataFrame (one point per user)
    label_prefix -- text put in front of the genre name on each axis label
    """
    fig, axes = plt.subplots(1, len(GENRE_PAIRS), figsize=(21, 7))
    for ax, (x_genre, y_genre) in zip(axes, GENRE_PAIRS):
        ax.scatter(x=genre_matrix[x_genre].to_numpy(), y=genre_matrix[y_genre].to_numpy())
        ax.set_xlabel('{} {} rating'.format(label_prefix, x_genre.lower()))
        ax.set_ylabel('{} {} rating'.format(label_prefix, y_genre.lower()))
    plt.show()


print('Correlation of unweighted ratings for sample genres for each user')
plot_genre_pairs(dataGR, 'average')

print('Correlation of weighted ratings for sample genres for each user')
# These panels show the weighted matrix, so label them as such
plot_genre_pairs(dataGR_w1, 'weighted average')

# Dimensionality Reduction using SVD

In [None]:
# User-Movie Rating matrix (complete unfiltered rating matrix from MovieLens dataset)
# Preview the first rows only; the full matrix has one column per movie title
dataUI.head()

In [None]:
%%time
# Decomposition of Singular Values from SVD for Dimensionality Reduction of UserItem matrix
#UI_SVD_full =  TruncatedSVD(n_components = min((len(dataUI)-1),(len(dataUI.columns)-1)))
UI_SVD_full =  TruncatedSVD(n_components = 10)
UI = pd.DataFrame(UI_SVD_full.fit_transform(dataUI))
UI.index += 1

In [None]:
# Report the leading singular values of the user-item SVD
print("First 10 singular values from SVD of UserItem Interaction Matrix")
print(UI_SVD_full.singular_values_[:10])

# Line plot of all singular values kept by the decomposition
plt.figure(figsize=(21, 5)).gca().plot(UI_SVD_full.singular_values_)
plt.show()

In [None]:
# Truncated SVD of the unweighted User Genre Rating matrix
# (one component fewer than the number of genres).
# Fixed seed: the randomized solver is otherwise not reproducible between runs.
GR_SVD_full = TruncatedSVD(n_components=dataGR.shape[1] - 1, random_state=0)
# Keep the user ids of dataGR as index instead of assuming they run 1..N
GR = pd.DataFrame(GR_SVD_full.fit_transform(dataGR), index=dataGR.index)

print("First 10 singular values from SVD of Unweighted User Genre Rating Matrix")
print(GR_SVD_full.singular_values_[:10])

# Plot the singular values
plt.figure(figsize=(21,5))
plt.plot(GR_SVD_full.singular_values_)
plt.show()

In [None]:
# Truncated SVD of the weighted User Genre Rating matrix
# (one component fewer than the number of genres).
# Fixed seed: the randomized solver is otherwise not reproducible between runs.
GRw1_SVD_full = TruncatedSVD(n_components=dataGR_w1.shape[1] - 1, random_state=0)
# Keep the user ids of dataGR_w1 as index instead of assuming they run 1..N
GRw1 = pd.DataFrame(GRw1_SVD_full.fit_transform(dataGR_w1), index=dataGR_w1.index)

print("First 10 singular values from SVD of Weighted User Genre Rating Matrix")
print(GRw1_SVD_full.singular_values_[:10])

# Plot the singular values
plt.figure(figsize=(21,5))
plt.plot(GRw1_SVD_full.singular_values_)
plt.show()

In [None]:
# First three latent features of the user-item SVD.
# Later cells refer to UI_temp, so it has to be defined on a fresh kernel.
UI_temp = UI.iloc[:,:3]

In [None]:
# First two latent features of each SVD-reduced matrix, one panel per matrix:
# user-item, user genre ratings, weighted user genre ratings
plt.figure(figsize=(14,5))
latent_panels = [
    ('User Interaction Matrix', UI),
    ('User Genre Rating Matrix', GR),
    ('Weighted User Genre Rating Matrix', GRw1),
]
for panel_no, (panel_title, latent_frame) in enumerate(latent_panels, start=1):
    plt.subplot(1, 3, panel_no)
    plt.title(panel_title)
    plt.scatter(latent_frame[0], latent_frame[1])
    plt.xlabel('Latent Feature 0')
    plt.ylabel('Latent Feature 1')
plt.show()

# Clustering

In [None]:
# CLASS TO CLUSTER AND EVALUATE DATA
class data:
    """Cluster a feature DataFrame with K-Means / Gaussian mixtures and evaluate the result.

    ``self.data`` is the input DataFrame (one row per user). Cluster
    assignments are stored on ``self.km_pred`` / ``self.gmm_pred`` as
    one-column ('cluster') DataFrames sharing the index of ``self.data``;
    both stay None until the corresponding model has been fitted.
    """

    def __init__(self, data):
        # assign input matrix
        self.data = data
        # Cluster assignments; None means "model not trained yet"
        self.km_pred = None
        self.gmm_pred = None
        # Enable logging
        self._logger = logging.getLogger(__name__)

    def __str__(self):
        return 'Data Object'

    def kmeans(self, n):
        """Fit K-Means with ``n`` clusters and return the per-row cluster DataFrame.

        Returns None (after logging a warning) when ``n`` is None.
        """
        if n is None:
            self._logger.warning('Number of clusters not provided')
            return None

        km = KMeans(n_clusters=n, init='k-means++', max_iter=300, n_init=10, random_state=0)
        # Reuse the input index so cluster rows line up with the user rows
        self.km_pred = pd.DataFrame(km.fit_predict(self.data), columns=['cluster'], index=self.data.index)
        return self.km_pred

    def _plot_scores(self, position, cluster_range, scores, ylabel, title=None, ylim=None):
        """Scatter one evaluation metric against the cluster count in row ``position`` of a 3x1 grid."""
        plt.subplot(3, 1, position)
        if title is not None:
            plt.title(title, fontsize=16)
        plt.scatter(x=cluster_range, y=scores, s=150, edgecolor='k')
        plt.grid(True)
        plt.xlabel("Number of clusters", fontsize=14)
        plt.ylabel(ylabel, fontsize=15)
        if ylim is not None:
            plt.ylim(**ylim)
        plt.xticks(cluster_range, fontsize=14)
        plt.yticks(fontsize=15)

    def kmeans_eval(self, n):
        """Plot K-Means score, silhouette score and Davies-Bouldin score for 2..n clusters.

        Returns None; logs a warning and does nothing when ``n`` is None or below 2.
        """
        if n is None:
            self._logger.warning('Number of maximum clusters not provided')
            return None
        if n < 2:
            self._logger.warning('Need at least 2 clusters to evaluate')
            return None

        cluster_range = list(range(2, n + 1))
        km_scores = []
        km_silhouette = []
        db_score = []

        # calculate scores
        for k in cluster_range:
            km = KMeans(n_clusters=k, n_init=10, random_state=0, max_iter=300).fit(self.data)
            km_pred = km.predict(self.data)

            # KM score of the model fitted in this iteration
            # (the original referenced an undefined name `kmeans` here)
            km_scores.append(-km.score(self.data))

            # Silhouette Score
            km_silhouette.append(metrics.silhouette_score(self.data, km_pred))

            # Davies Bouldin Score
            # the average similarity measure of each cluster with its most similar cluster,
            # where similarity is the ratio of within-cluster distances to between-cluster distances.
            # Thus, clusters which are farther apart and less dispersed will result in a better score.
            db_score.append(metrics.davies_bouldin_score(self.data, km_pred))

        plt.figure(figsize=(14,21))
        # ELBOW METHOD (optimal cluster at elbow in curve)
        self._plot_scores(1, cluster_range, km_scores, "K-means Score",
                          title="The elbow method for determining number of clusters")
        # SILHOUETTE SCORE (varies from [-1,1] with 1 meaning clearly defined clusters)
        self._plot_scores(2, cluster_range, km_silhouette, "Average Silhouette Score",
                          title="The silhouette coefficient method for determining number of clusters (1 is ideal)",
                          ylim={'bottom': -1, 'top': 1})
        # DAVIES-BOULDIN SCORE (lower score is better and means more distinct clusters)
        self._plot_scores(3, cluster_range, db_score, "Davies-Bouldin Score",
                          title="The davies-bouldin coefficient method for determining number of clusters (0 is ideal)",
                          ylim={'bottom': 0})
        plt.show()

    def gmm(self, n, covariance_type='full', df='None'):
        """Fit a Gaussian mixture with ``n`` components and store the assignments.

        n               -- number of clusters; None logs a warning and returns None
        covariance_type -- 'full', 'diag', 'tied' or 'spherical'; None falls back to 'full'
        df              -- what to return:
                           'pred'  cluster predictions,
                           'proba' predictions plus per-cluster probabilities,
                           'all'   input data joined with predictions and probabilities,
                           'None'  (the string, default) fit only and return None;
                           a real None falls back to 'pred'.
                           Any other value logs an error and returns None without fitting.
        """
        if n is None:
            self._logger.warning('Number of maximum clusters not provided')
            return None

        if covariance_type is None:
            # Apply the default the message announces instead of giving up
            self._logger.warning('Covariance Type for Gaussian Mixture Model not provided. Default is "full".')
            covariance_type = 'full'

        if df is None:
            self._logger.warning('Return df format not provided. Default is "pred".')
            df = 'pred'

        # Reject a bad format before paying for the fit
        if df not in ('pred', 'proba', 'all', 'None'):
            self._logger.error("Invalid input. Enter 'all', 'pred' or 'proba'.")
            return None

        # Fixed seed so repeated runs give the same clusters
        gmm = GaussianMixture(n_components=n, n_init=10, covariance_type=covariance_type,
                              tol=1e-3, max_iter=500, random_state=0)
        # Reuse the input index so cluster rows line up with the user rows
        self.gmm_pred = pd.DataFrame(gmm.fit_predict(self.data), columns=['cluster'], index=self.data.index)

        if df == 'None':
            return None
        if df == 'pred':
            return self.gmm_pred

        # Probability of each row belonging to each cluster, computed on the
        # data that was fitted (the original used the notebook global UI_temp)
        cols = ['proba_C' + str(k) for k in range(n)]
        proba = self.gmm_pred.join(
            pd.DataFrame(gmm.predict_proba(self.data), columns=cols, index=self.data.index))
        if df == 'proba':
            return proba
        return self.data.join(proba, how='left')

    def gmm_eval(self, n, covariance_type="full"):
        """Plot log(AIC), log(BIC) and the mixture score for Gaussian mixtures with 2..n components.

        Returns None; logs and does nothing when ``n`` is None or below 2.
        ``covariance_type`` None falls back to 'full'.
        """
        if n is None:
            self._logger.error('Number of maximum clusters not provided')
            return None
        if n < 2:
            self._logger.warning('Need at least 2 clusters to evaluate')
            return None

        if covariance_type is None:
            # Apply the default the message announces instead of giving up
            self._logger.warning('Covariance Type for Gaussian Mixture Model not provided. Default is "full"')
            covariance_type = 'full'

        cluster_range = list(range(2, n + 1))
        gmm_aic = []
        gmm_bic = []
        gmm_scores = []

        # calculate scores
        for k in cluster_range:
            gmm = GaussianMixture(n_components=k, n_init=10, covariance_type=covariance_type,
                                  tol=1e-3, max_iter=500, random_state=0).fit(self.data)
            # Akaike Information Criterion
            gmm_aic.append(gmm.aic(self.data))
            # Bayesian Information Criterion
            gmm_bic.append(gmm.bic(self.data))
            gmm_scores.append(gmm.score(self.data))

        # Plot the scores
        # NOTE(review): np.log yields NaN for non-positive AIC/BIC values - confirm
        # the criteria stay positive for the datasets used here.
        plt.figure(figsize=(14,21))
        self._plot_scores(1, cluster_range, np.log(gmm_aic), "Log of Gaussian mixture AIC score")
        self._plot_scores(2, cluster_range, np.log(gmm_bic), "Log of Gaussian mixture BIC score")
        self._plot_scores(3, cluster_range, gmm_scores, "Gaussian mixture score")
        plt.show()

    def plotScatter(self, show_cluster, model):
        """Show a plotly scatter plot of the first 2 or 3 columns of the data.

        show_cluster -- colour the points by cluster when True
        model        -- 'gmm' or 'kmeans': which stored assignments to colour by

        Returns None. Logs an error and draws nothing when the chosen model has
        not been trained, when ``model`` is unknown, or when the data has fewer
        than 2 columns.
        """
        marker = {'size': 3, 'opacity': 0.8, 'colorscale': 'Viridis'}

        if show_cluster:
            if model == 'gmm':
                clusters = self.gmm_pred
                if clusters is None:
                    self._logger.error("Gaussian Mixture Model not trained. Use data.gmm(n, covariance_type, df) to train before plotting")
                    return None
            elif model == 'kmeans':
                clusters = self.km_pred
                if clusters is None:
                    self._logger.error("K-Means Model not trained. Use data.kmeans(n) to train before plotting")
                    return None
            else:
                self._logger.error("Invalid model. Enter 'gmm' or 'kmeans'.")
                return None
            marker['color'] = clusters['cluster']

        # check input dataset to plot
        n_features = len(self.data.columns)
        if n_features < 2:
            self._logger.error("Input dataset contains less than 2 features. Insufficient data to plot.")
            return None

        # Select columns by position so that any column labels work
        if n_features == 2:
            self._logger.warning("Input dataset contains only 2 features. 2D scatter plot will be created.")
            plot_figure = go.Figure(data=go.Scatter(
                x=self.data.iloc[:, 0],
                y=self.data.iloc[:, 1],
                mode='markers',
                marker=marker))
        else:
            if n_features > 3:
                self._logger.warning("Input dataset contains more than 3 features. 3D scatter plot will only plot first 3 features.")
            trace = go.Scatter3d(
                x=self.data.iloc[:, 0],
                y=self.data.iloc[:, 1],
                z=self.data.iloc[:, 2],
                mode='markers',
                marker=marker
            )
            layout = go.Layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
            plot_figure = go.Figure(data=[trace], layout=layout)

        # Render with Figure.show() in both cases; no plotly.offline import needed
        plot_figure.show()
        return None
        
       

In [None]:
# Cluster users on the first two latent features of the user-item SVD
UI_data = data(data=UI.iloc[:, 0:2])
# K-Means assignments (stored on the object)
UI_data.kmeans(n=3)
# Gaussian mixture assignments (stored on the object)
UI_data.gmm(covariance_type="full", n=3)
# Scatter plot of the data, colour-coded by K-Means cluster
UI_data.plotScatter(show_cluster=True, model='kmeans')

In [None]:
# Cluster users on the first two latent features of the genre-rating SVD
# and colour the scatter plot by Gaussian mixture cluster
GR_data = data(data=GR.iloc[:, 0:2])
GR_data.kmeans(n=3)
GR_data.gmm(covariance_type="full", n=2)
GR_data.plotScatter(show_cluster=True, model='gmm')

In [None]:
# Cluster users on the first three latent features of the weighted genre-rating SVD
# and colour the 3D scatter plot by K-Means cluster
GRw1_data = data(data=GRw1.iloc[:, 0:3])
GRw1_data.kmeans(n=3)
GRw1_data.gmm(covariance_type="full", n=3)
GRw1_data.plotScatter(show_cluster=True, model='kmeans')

In [None]:
%%time
GR_data.gmm(3)

In [None]:
# Evaluate Gaussian mixtures on the full (unreduced) genre-rating matrix.
# Use a new name: rebinding GR would replace the SVD DataFrame that earlier
# cells index with GR[0] / GR.iloc and break them on re-run.
GR_eval_data = data(dataGR)
GR_eval_data.gmm_eval(n=20,covariance_type="full")

In [None]:
# Evaluate Gaussian mixtures on the full (unreduced) weighted genre-rating matrix.
# Use a new name: rebinding GRw1 would replace the SVD DataFrame that earlier
# cells index with GRw1[0] / GRw1.iloc and break them on re-run.
GRw1_eval_data = data(dataGR_w1)
GRw1_eval_data.gmm_eval(n=20,covariance_type="full")

# Examine distribution of clusters
# NOTE(review): `predictions` was never defined in this notebook; the K-Means
# assignments computed earlier on GRw1_data are used here - confirm this is
# the intended clustering.
predictions = GRw1_data.km_pred
cluster_count = predictions['cluster'].value_counts()
cluster_count.plot.bar(title='Distribution of Users in Clusters (Count)')

# Gaussian Mixture Modelling Clustering

In [None]:
# Assign features and target variable
# Features: first three latent features of the user-item SVD
X = UI.iloc[:, :3]
# Target: K-Means cluster of each user on those features.
# NOTE(review): `clustered` was never defined in this notebook; 3 clusters
# matches the K-Means calls in the clustering cells - confirm the intended number.
clustered = data(X).kmeans(3)
Y = clustered['cluster']

In [None]:
# Train a decision tree to understand the defining features separating the clusters
# and understand how the entropy changes

# We split the data into a training (80%) and testing (20%) dataset
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.20, random_state=0, stratify=Y)
# declare decision tree classifier splitting on entropy
# (fixed seed so the fitted tree and the accuracy are reproducible)
KMeans_Tree = DecisionTreeClassifier(max_depth=3, criterion='entropy', random_state=0)
# train decision tree classifier on training data
KMeans_Tree.fit(train_X, train_Y)
# get predicted clusters for the held-out users
KMeans_TreePred = KMeans_Tree.predict(test_X)
acc_score = metrics.accuracy_score(test_Y, KMeans_TreePred)
print(acc_score)
#metrics.confusion_matrix(test_Y, KMeans_TreePred)
#metrics.confusion_matrix(test_Y, KMeans_TreePred)

In [None]:
# Let's visualise the tree
# Class names come from the fitted tree instead of a hard-coded range(26)
class_names = [str(label) for label in KMeans_Tree.classes_]
export_graphviz(
    KMeans_Tree,
    out_file='KMeans_Tree.dot',
    feature_names=[str(column) for column in X.columns],
    class_names=class_names,
    filled=True,
)
# Convert to png (needs the Graphviz `dot` executable on PATH);
# check=True raises if the conversion fails instead of continuing silently
subprocess.run(['dot', '-Tpng', 'KMeans_Tree.dot', '-o', 'KMeans_Tree.png', '-Gdpi=600'], check=True)

# Display in python
plt.figure(figsize = (14, 18))
plt.imshow(plt.imread('KMeans_Tree.png'))
plt.axis('off')
plt.show()