In [534]:
#Import Statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth

In [535]:
#Import and Cleaning of the Dataset
df = pd.read_excel(r'dataset.xlsx', sheet_name='Sheet1', header=[0,1])

# Removing the Null Column and Grouping Column
del df['Unnamed: 0_level_0']
del df['Unnamed: 1_level_0']

# Change from Percentages to Number of People
numberDf = df.apply(lambda x: round(16*(x/100)))

# Conditions for the setup
peoplePerGroup = 16
numGroups = 4
numQuestions = 15

# Generating the array
peopleSamples = np.zeros((peoplePerGroup * numGroups, numQuestions))

# Building the Dataset
for row in range(numGroups):
    # numberDf.iloc[row:row+1, :] => Gets Row
    #print(numberDf.iloc[row:row+1, :])
    questionIndex = 0
    for question in range(0, len(numberDf.columns)-3+1, 3):
        personIndex = 0 + (peoplePerGroup * row)
        for response in range(0, 3):
            numResponses = int(numberDf.iloc[row, question+response])
            for person in range(0, numResponses):
                peopleSamples[personIndex][questionIndex] = int(response)
                personIndex += 1
        questionIndex += 1

for row in range(len(peopleSamples)):
    for column in range(0, 15):
        np.random.shuffle(peopleSamples[(row*16):((row+1)*16),column]) 

# Conversion from np.Array to Dataframe
peopleDataset = pd.DataFrame({'Q1':peopleSamples[:, 0],'Q2':peopleSamples[:, 1],'Q3':peopleSamples[:, 2],
                        'Q4':peopleSamples[:, 3],'Q5':peopleSamples[:, 4],'Q6':peopleSamples[:, 5],
                        'Q7':peopleSamples[:, 6],'Q8':peopleSamples[:, 7],'Q9':peopleSamples[:, 8],
                        'Q10':peopleSamples[:, 9],'Q11':peopleSamples[:, 10],'Q12':peopleSamples[:, 11],
                        'Q13':peopleSamples[:, 12],'Q14':peopleSamples[:, 13],'Q15':peopleSamples[:, 14]})

# Assigning to variable for Clustering Algorithms to use
X = peopleDataset

X

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15
0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0
1,0.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,1.0,2.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0
60,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
61,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
62,1.0,1.0,0.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [536]:
# KMeans Clustering Algorithm
clusters = 4
kmeans = KMeans(n_clusters = clusters)
kmeans.fit(X)

print(kmeans.labels_)

[1 2 1 3 2 1 2 2 3 2 1 3 1 3 2 2 3 0 3 2 2 3 3 3 3 3 2 3 1 1 3 3 0 3 1 3 1
 2 0 0 3 1 3 1 3 1 1 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [537]:
# MeanShift Clustering Algorithm
currentQuantile = 0.08
n_clusters = 100
while(n_clusters > 4):
    bandwidth = estimate_bandwidth(X, quantile=currentQuantile)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)

    labels = ms.labels_
    cluster_centers = ms.cluster_centers_

    labels_unique = np.unique(labels)
    n_clusters = len(labels_unique)
    currentQuantile += 0.00002

print("number of estimated clusters : %d" % n_clusters)
print(labels)

number of estimated clusters : 2
[0 1 0 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 0 1 1 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0 0
 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
