In [85]:
#Import Statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import Counter
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth

In [86]:
#Import and Cleaning of the Dataset
df = pd.read_excel(r'Take home exam dataset.xlsx.xlsx', sheet_name='Sheet1', header=[0,1])

# Removing the Null Column and Grouping Column
del df['Unnamed: 0_level_0']
del df['Unnamed: 1_level_0']

# Change from Percentages to Number of People
numberDf = df.apply(lambda x: round(16*(x/100)))

# Conditions for the setup
peoplePerGroup = 16
numGroups = 4
numQuestions = 15

# Generating the array
peopleSamples = np.zeros((peoplePerGroup * numGroups, numQuestions))

# Building the Dataset
for row in range(numGroups):
    # numberDf.iloc[row:row+1, :] => Gets Row
    #print(numberDf.iloc[row:row+1, :])
    questionIndex = 0
    for question in range(0, len(numberDf.columns)-3+1, 3):
        personIndex = 0 + (peoplePerGroup * row)
        for response in range(0, 3):
            numResponses = int(numberDf.iloc[row, question+response])
            for person in range(0, numResponses):
                peopleSamples[personIndex][questionIndex] = int(response)
                personIndex += 1
        questionIndex += 1

for row in range(len(peopleSamples)):
    for column in range(0, 15):
        np.random.shuffle(peopleSamples[(row*16):((row+1)*16),column]) 

# Conversion from np.Array to Dataframe
peopleDataset = pd.DataFrame({'Q1':peopleSamples[:, 0],'Q2':peopleSamples[:, 1],'Q3':peopleSamples[:, 2],
                        'Q4':peopleSamples[:, 3],'Q5':peopleSamples[:, 4],'Q6':peopleSamples[:, 5],
                        'Q7':peopleSamples[:, 6],'Q8':peopleSamples[:, 7],'Q9':peopleSamples[:, 8],
                        'Q10':peopleSamples[:, 9],'Q11':peopleSamples[:, 10],'Q12':peopleSamples[:, 11],
                        'Q13':peopleSamples[:, 12],'Q14':peopleSamples[:, 13],'Q15':peopleSamples[:, 14]})

# Assigning to variable for Clustering Algorithms to use
X = peopleDataset

X

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15
0,0.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0
2,0.0,0.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0
60,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
61,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
62,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0


In [87]:
# KMeans Clustering Algorithm
clusters = 4
kmeans = KMeans(n_clusters = clusters)
kmeans.fit(X)

kMeansLabels = [x+1 for x in kmeans.labels_]

# Count the Results per Group
kMeansCounter = Counter(kMeansLabels)

# Analyze the Number of People that Maintained their Group
fromOriginal = [0,0,0,0]
for x in range(0, len(kMeansLabels)):
    currentGroup = int(x/16) + 1
    if(kMeansLabels[x] == currentGroup):
        fromOriginal[currentGroup-1] += 1       

# Print the Raw Output Labels from KMeans Algorithm
print(kMeansLabels)

# Pretty Print the Results
for x in range(1, clusters+1):
    print("Group %d: %d people -- From Original: %d people" % (x, kMeansCounter[x], fromOriginal[x-1]))

[1, 1, 1, 1, 4, 1, 3, 1, 1, 1, 1, 3, 1, 1, 1, 3, 2, 4, 1, 4, 4, 1, 1, 4, 3, 3, 4, 4, 4, 4, 2, 4, 4, 1, 3, 3, 3, 3, 2, 1, 4, 3, 3, 1, 2, 4, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Group 1: 19 people -- From Original: 12 people
Group 2: 20 people -- From Original: 2 people
Group 3: 12 people -- From Original: 7 people
Group 4: 13 people -- From Original: 0 people


In [88]:
# MeanShift Clustering Algorithm
currentQuantile = 0.1
n_clusters = 100
while(n_clusters > 4):
    bandwidth = estimate_bandwidth(X, quantile=currentQuantile)

    ms = MeanShift(bandwidth=bandwidth)
    ms.fit(X)

    meanShiftLabels = ms.labels_
    cluster_centers = ms.cluster_centers_

    labels_unique = np.unique(meanShiftLabels)
    n_clusters = len(labels_unique)
    currentQuantile += 0.005

# Count the Results per Group
meanShiftCounter = Counter(meanShiftLabels)

# Print the Raw Output Labels from MeanShift Algorithm
print("Number of estimated clusters: %d" % n_clusters)
print(meanShiftLabels)

# Pretty Print the Results
for x in range(n_clusters):
    print("Group %d: %d people" % (x+1, meanShiftCounter[x]))

Number of estimated clusters: 1
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Group 1: 64 people


## Section 2: Clustering Analysis

**KMeans vs Mean Shift**

B) 
The results from the algorithms are run above in their corresponding cells and shown underneath. Given that the dataset is generated randomly each time the notebook is run, speaking to the specifics of the results is difficult. Instead, there was a trend that was noticed when analyzing the results over the course of a number of runs. More specifically that the KMeans algorithm for clustering tended to keep the number of individuals per group around the original number specified by the original groupings. The number of people per group typically ranged from 11-21 people giving a smaller spread, (+5/-5), than its MeanShift counterpart.

The MeanShift algorithm, due to the need for the use of a dynamic quantile to get to 4 clusters specifically, it was run in a while loop with a break condition on it. Given that this was the case, a step of 0.005 was used to increment the quantile every time with the hopes of achieving a successful clustering into specifically 4 groups. Over numerous runs, however, achieving a specific clustering of 4 groups proved not always possible with MeanShift and instead often opted for less. Further, when the data was clustered into 4 groups it often saw one group taking the majority share of people while the others were left with very few.

*As such, between KMeans and MeanShift for clustering of this particular dataset, it would be recommended to use KMeans as the more trustworthy approach to clustering.*

C)
The KMeans comes closest to the original groupings in size and shape of the individual groupings, the specifics of which are denoted under the execution cell above, while the MeanShift clustering comes nowhere close to the original in either of these attributes. MeanShift rather, clusters most of the people into the first group, leaving the other three groups with very few members.

D)

