In [22]:
# Import required package

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns
from sklearn.cluster import KMeans
from sklearn import decomposition
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE
from sklearn import preprocessing

import numpy as np

In [2]:
# Load the dataset from disk
data = pd.read_csv('Data_16nov.csv')

# Drop the first column, which only holds a sequence number.
# The result is re-assigned rather than dropped with inplace=True.
data = data.drop(data.columns[0], axis=1)

# Report the basic dimensions of the dataset.
# A single pre-formatted string passed to print(...) produces the same
# output on Python 2 and Python 3 (the bare `print x, y` statement is a
# syntax error on Python 3).
print('Number of records: %d' % data.shape[0])
print('Number of attributes: %d' % data.shape[1])


# Show the column names
print(list(data.columns))

Number of records: 77
Number of attributes: 12
['timestamp', 'group', 'disengaged', 'looking', 'talking', 'intTech', 'intRes', 'intExt', 'Accessed', 'Create', 'Open', 'Update']


In [3]:
# Feature matrix: every column except 'timestamp' and 'group'
raw_data = data.drop(['timestamp','group'],axis=1)

# Preview the first rows of the dataset. This has to be the last expression
# of the cell, otherwise the notebook does not render it.
data.head()

In [4]:
# Standardize the features with scikit-learn's StandardScaler
standardizer = preprocessing.StandardScaler()
std_rawdata = standardizer.fit_transform(raw_data)

# Min-max scale the same features with MinMaxScaler
minmax_scaler = preprocessing.MinMaxScaler()
min_rawdata = minmax_scaler.fit_transform(raw_data)

In [5]:
# Two-component dimensionality reducers used in the cells that follow.
pca = decomposition.PCA(n_components=2)
iso = Isomap(n_components=2)
fa = decomposition.FactorAnalysis(n_components=2)
# t-SNE is stochastic: fixing random_state makes the embedding, and every
# result derived from it, reproducible across kernel restarts.
tsne = TSNE(n_components=2, random_state=0)

In [9]:
%matplotlib notebook
# Reduce the standardized features to two dimensions with each technique.
# fit_transform() fits the estimator as part of the call, so PCA is fitted
# exactly once here instead of once via fit() and again via fit_transform().
pca_data2d = pca.fit_transform(std_rawdata)
pca1 = pca  # fit() returns the estimator itself, so pca1 has always been this object
iso_data2d = iso.fit_transform(std_rawdata)
fa_data2d = fa.fit_transform(std_rawdata)
tsne_data2d = tsne.fit_transform(std_rawdata)

# To repeat the analysis on the min-max scaled data, pass min_rawdata
# instead of std_rawdata to the four fit_transform() calls above.

# MCA results (computed without converting the variables into categorical).
# A read of 'mcaresult2.csv' was previously left commented out at this point.
mca_df = pd.read_csv('mcaresult.csv')

# MFA results
mfa_df = pd.read_csv('mfaresult2.csv')

# (column prefix, first dimension, second dimension) for every technique.
# A list of tuples (not a dict) keeps the insertion order explicit, which
# determines the column order of the heatmap below.
# Columns 1 and 2 of the MCA/MFA files are used as the two dimensions --
# presumably column 0 is a row label; TODO confirm against the CSV files.
embeddings = [
    ('pc', pca_data2d[:, 0], pca_data2d[:, 1]),
    ('is', iso_data2d[:, 0], iso_data2d[:, 1]),
    ('fa', fa_data2d[:, 0], fa_data2d[:, 1]),
    ('tsn', tsne_data2d[:, 0], tsne_data2d[:, 1]),
    ('mca', mca_df.iloc[:, 1], mca_df.iloc[:, 2]),
    ('mfa', mfa_df.iloc[:, 1], mfa_df.iloc[:, 2]),
]

# NOTE: raw_data is extended in place with the embedding columns, so the
# scaling cell should not be re-run on raw_data after this cell has executed.
embedding_cols = []
for prefix, first_dim, second_dim in embeddings:
    raw_data[prefix + '-1'] = first_dim
    raw_data[prefix + '-2'] = second_dim
    embedding_cols += [prefix + '-1', prefix + '-2']

# Original features = every column that is not one of the embedding columns.
# Selecting by name replaces the positional slice iloc[0:10, -12:].
feature_cols = [col for col in raw_data.columns if col not in embedding_cols]

# Correlation of each original feature (rows) with each reduced dimension (columns)
corr_fig, corr_ax = plt.subplots(figsize=(10, 9))
sns.heatmap(raw_data.corr().loc[feature_cols, embedding_cols], annot=True, ax=corr_ax)

# Loadings of the original features on the two principal components
print(pca1.components_)

<IPython.core.display.Javascript object>

[[ 0.00222117 -0.00161973  0.03866683  0.13093958  0.03791666  0.2377034
   0.36379403  0.42635986  0.59709448  0.50275335]
 [ 0.07474151 -0.49686195 -0.51259076 -0.15348951 -0.54499795 -0.33491527
   0.17915377  0.1399878   0.01624903  0.00926868]]


## Clustering the PCA and MFA dimensions

In this section, we explore the clustering results on the dimensions obtained by applying PCA and MFA. The rationale for choosing these two dimensionality-reduction techniques is the correlation between their resulting dimensions and the original dimensions.

In [45]:
# K-means (3 clusters) on the two PCA dimensions.
# random_state pins the centroid initialisation: without it the cluster
# indices can change from run to run, and the names below are attached to
# the centroids purely by cluster index.
kmeans1 = KMeans(n_clusters=3, random_state=0)
kmeans1.fit(raw_data.loc[:, 'pc-1':'pc-2'])

fig1 = plt.figure()
ax = fig1.add_subplot(111)
# One point per record, coloured by its assigned cluster
ax.scatter(raw_data['pc-1'], raw_data['pc-2'], c=kmeans1.labels_)

# Optional: label every point with its group and row number
#for i in range(len(data)):
#    ax.annotate((data.iloc[i,1],i),xy=(raw_data.iloc[i,10],raw_data.iloc[i,11]),xytext=(raw_data.iloc[i,10],raw_data.iloc[i,11]))

# Draw the centroids on the same axes (explicit ax instead of the pyplot state machine)
centers = kmeans1.cluster_centers_
ax.scatter(centers[:, 0], centers[:, 1], color='red')

# NOTE(review): these names are matched to clusters by index only. Verify
# against the plotted centroids that each name sits on the intended cluster.
cluster_names = ['physical','neutral','online']
for name, (center_x, center_y) in zip(cluster_names, centers):
    ax.annotate(name, xy=(center_x, center_y), xytext=(center_x, center_y))

ax.set_xlabel('on-system engagement')
ax.set_ylabel('off-system engagement')
ax.set_title('Clustering on PCA data')
plt.show()

<IPython.core.display.Javascript object>

In [7]:
# Work on an independent copy of the dataset so that the categorical
# conversion in the following cells leaves `data` untouched
cat_data = data.copy(deep=True)

In [8]:
# Preview the first five rows of the copy
cat_data.head(5)

Unnamed: 0,timestamp,group,disengaged,looking,talking,intTech,intRes,intExt,Accessed,Create,Open,Update
0,2017-10-18 10:15:16,1AB,0.0,0.0,1.0,1.0,0.0,0.0,2,0,0,0
1,2017-10-18 10:20:11,1AB,0.0,0.0,1.0,1.0,0.0,0.0,6,8,11,0
2,2017-10-18 10:25:04,1AB,0.0,0.0,1.0,0.5,1.0,1.0,0,0,0,0
3,2017-10-18 10:30:19,1AB,0.0,0.5,1.0,0.5,1.0,0.5,3,2,0,0
4,2017-10-18 10:35:21,1AB,0.0,0.5,1.0,1.0,0.0,0.0,0,0,0,0


In [9]:
# Discretize each of the four columns of `data` into three bins with pd.cut
# and store the result in cat_data as '<lower-cased name>_cat'.
# labels=False makes pd.cut return integer bin indices.
for source_col in ('Accessed', 'Create', 'Open', 'Update'):
    cat_data[source_col.lower() + '_cat'] = pd.cut(data[source_col], bins=3, labels=False)

In [10]:
# Remove the original 'Accessed', 'Create', 'Open' and 'Update' columns.
# The result is re-assigned instead of using inplace=True, and
# errors='ignore' lets the cell be executed again after the columns are
# already gone (the in-place version raised KeyError on a second run).
cat_data = cat_data.drop(['Accessed','Create','Open','Update'], axis=1, errors='ignore')
cat_data.head()

Unnamed: 0,timestamp,group,disengaged,looking,talking,intTech,intRes,intExt,accessed_cat,create_cat,open_cat,update_cat
0,2017-10-18 10:15:16,1AB,0.0,0.0,1.0,1.0,0.0,0.0,0,0,0,0
1,2017-10-18 10:20:11,1AB,0.0,0.0,1.0,1.0,0.0,0.0,0,1,0,0
2,2017-10-18 10:25:04,1AB,0.0,0.0,1.0,0.5,1.0,1.0,0,0,0,0
3,2017-10-18 10:30:19,1AB,0.0,0.5,1.0,0.5,1.0,0.5,0,0,0,0
4,2017-10-18 10:35:21,1AB,0.0,0.5,1.0,1.0,0.0,0.0,0,0,0,0


In [11]:
# Write the categorical dataset to disk; the DataFrame index is included
# as the first column of the file
cat_data.to_csv('datawithcat3.csv', index=True)