# An exploration of dimensionality reduction to understand collaboration in authentic settings

This notebook explores the use of dimensionality reduction methods for mapping the collected multimodal data (from collocated collaboration settings) onto a smaller number of attributes. The study considered the PCA, Isomap, t-SNE, FA, MFA, and MCA methods. We also manually compared the resulting dimensions with the original attributes, which suggested interpreting the first dimension as Engagement and the second dimension as Physical vs. Technical behavior.

### Proposed Approach based on Dimensionality Reduction

In [1]:
import pandas as pd
import numpy as np
import prince

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.manifold import TSNE

from sklearn.preprocessing import StandardScaler

In [3]:

from sklearn.impute import SimpleImputer

# Mean-value imputer for NaN entries. It is only constructed in this cell;
# nothing here applies it to the data.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)

# Load the processed recordings, one CSV file per project day.
data_18oct = pd.read_csv('ProcessedData_ProjectDay_20171018.csv')
data_8nov = pd.read_csv('ProcessedData_ProjectDay_20171108.csv')
# The 22 Nov session is the only one filtered: every row holding at least one
# missing value is discarded (dropna defaults: axis=0, how='any').
# Note that the surviving rows keep their original index labels.
data_22nov = pd.read_csv('ProcessedData_ProjectDay_20171122.csv').dropna()
data_6dec = pd.read_csv('ProcessedData_ProjectDay_20171206.csv')

# Preview the first row of the 18 Oct session.
data_18oct.head(1)

Unnamed: 0.1,Unnamed: 0,timestamp,group,disengaged,looking,talking,intTech,intRes,intExt,Accessed,Create,Open,Update
0,1,2017-10-18 10:15:16,1AB,0.0,0.0,1.0,1.0,0.0,0.0,2,0,0,0


In [4]:
# Strip the leading bookkeeping columns (Unnamed, timestamp, group) by
# position, keeping the raw frames untouched and storing the result in *_copy.
leading_three = [0, 1, 2]
data_18oct_copy = data_18oct.drop(columns=data_18oct.columns[leading_three])
data_8nov_copy = data_8nov.drop(columns=data_8nov.columns[leading_three])
data_22nov_copy = data_22nov.drop(columns=data_22nov.columns[leading_three])
# NOTE(review): only the first TWO columns are removed for 6 Dec, unlike the
# other sessions — presumably this file has a different layout; confirm.
data_6dec_copy = data_6dec.drop(columns=data_6dec.columns[[0, 1]])

# Preview the first row of the trimmed 18 Oct frame.
data_18oct_copy.head(1)

Unnamed: 0,disengaged,looking,talking,intTech,intRes,intExt,Accessed,Create,Open,Update
0,0.0,0.0,1.0,1.0,0.0,0.0,2,0,0,0


In [6]:
# Standardize each session independently before dimensionality reduction.
scaler = StandardScaler()


def _standardize(frame):
    """Re-fit the shared ``scaler`` on ``frame`` and return the scaled values.

    The result is a new DataFrame built from the scaled array, so it carries
    default integer column labels and a fresh default index.
    """
    return pd.DataFrame(scaler.fit_transform(frame))


# The scaler is re-fitted on every call, so after this cell it holds the
# statistics of the last session only (6 Dec).
data_18oct_std = _standardize(data_18oct_copy)
data_8nov_std = _standardize(data_8nov_copy)
data_22nov_std = _standardize(data_22nov_copy)
data_6dec_std = _standardize(data_6dec_copy)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [7]:
# Two-dimensional embeddings of every standardized session.
#
# Each estimator object is re-fitted per session via fit_transform, so after
# this cell `pca`, `iso` and `tsne` only hold the fit of the last session
# (6 Dec); the per-session results live in the *_result_* arrays.

# t-SNE is a stochastic optimisation: without a fixed seed every run yields a
# different embedding (and different correlations downstream). Fixing the seed
# makes "Restart & Run All" reproducible.
TSNE_RANDOM_STATE = 42

# PCA
pca = PCA(n_components=2)

pca_result_18oct = pca.fit_transform(data_18oct_std)
pca_result_8nov = pca.fit_transform(data_8nov_std)
pca_result_22nov = pca.fit_transform(data_22nov_std)
pca_result_6dec = pca.fit_transform(data_6dec_std)

# Isomap
iso = Isomap(n_components=2)
iso_result_18oct = iso.fit_transform(data_18oct_std)
iso_result_8nov = iso.fit_transform(data_8nov_std)
iso_result_22nov = iso.fit_transform(data_22nov_std)
iso_result_6dec = iso.fit_transform(data_6dec_std)

# t-SNE (seeded, see TSNE_RANDOM_STATE above)
tsne = TSNE(n_components=2, random_state=TSNE_RANDOM_STATE)
tsne_result_18oct = tsne.fit_transform(data_18oct_std)
tsne_result_8nov = tsne.fit_transform(data_8nov_std)
tsne_result_22nov = tsne.fit_transform(data_22nov_std)
tsne_result_6dec = tsne.fit_transform(data_6dec_std)



#####  MFA method
We used the [prince](https://github.com/MaxHalford/prince) library for the MFA method. The resulting dimensions were computed in a separate script (as Anaconda has no support for the prince library).

In [11]:
# A separate script applied the MFA method and saved one result file per
# session (mfa_result_<session>.csv); here the files are only loaded.


def _load_mfa_result(path):
    """Read an MFA result CSV and return it without its first column.

    The first column is dropped by position using the columns of the frame
    that was just read. (Presumably it is the row index written by the
    separate MFA script — confirm against that script.)

    Previously the 8 Nov and 22 Nov frames were trimmed with the column label
    taken from the already-trimmed 18 Oct frame, which removes a different
    column than intended whenever that label exists in the other frame.
    """
    frame = pd.read_csv(path)
    return frame.drop(columns=frame.columns[0])


mfa_result_18oct = _load_mfa_result('mfa_result_18oct.csv')
mfa_result_8nov = _load_mfa_result('mfa_result_8nov.csv')
mfa_result_22nov = _load_mfa_result('mfa_result_22nov.csv')
mfa_result_6dec = _load_mfa_result('mfa_result_6dec.csv')

#### Correlation 

We checked the correlation between the resulting dimensions and the original attributes.

In [12]:
# Append the two components of every embedding to the 18 Oct attribute frame.
# Column order matters for the positional slices used by the heatmaps:
# pca-1, pca-2, iso-1, iso-2, tsne-1, tsne-2, mfa-1, mfa-2.
for method_name, embedding in (
    ('pca', pca_result_18oct),
    ('iso', iso_result_18oct),
    ('tsne', tsne_result_18oct),
):
    for component in (0, 1):
        data_18oct_copy['%s-%d' % (method_name, component + 1)] = embedding[:, component]

# The MFA dimensions come from a DataFrame, so they are assigned as Series
# (aligned on the index labels).
for component in (0, 1):
    data_18oct_copy['mfa-%d' % (component + 1)] = mfa_result_18oct.iloc[:, component]

In [10]:
%matplotlib notebook

corr_matrix = data_18oct_copy.corr().iloc[0:10,0:10]
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix,annot=True)
plt.savefig('Corr_18oct_attributes.png')

<IPython.core.display.Javascript object>

In [22]:
%matplotlib notebook

corr_matrix = data_18oct_copy.corr().iloc[0:10,-8:]
#sns.palplot(sns.color_palette("RdBu_r",7))
plt.figure(figsize=(11,11))
sns.set(font_scale=1.3)
sns.heatmap(corr_matrix,annot=True,cmap="RdBu_r",vmin=-1,vmax=1)
#plt.tight_layout()
plt.show()
plt.savefig('Corr_18oct_final.png')

<IPython.core.display.Javascript object>

In [23]:
# Append the two components of every embedding to the 8 Nov attribute frame.
# Column order matters for the positional slices used by the heatmaps:
# pca-1, pca-2, iso-1, iso-2, tsne-1, tsne-2, mfa-1, mfa-2.
for method_name, embedding in (
    ('pca', pca_result_8nov),
    ('iso', iso_result_8nov),
    ('tsne', tsne_result_8nov),
):
    for component in (0, 1):
        data_8nov_copy['%s-%d' % (method_name, component + 1)] = embedding[:, component]

# The MFA dimensions come from a DataFrame, so they are assigned as Series
# (aligned on the index labels).
for component in (0, 1):
    data_8nov_copy['mfa-%d' % (component + 1)] = mfa_result_8nov.iloc[:, component]

In [15]:
%matplotlib notebook

corr_matrix = data_8nov_copy.corr().iloc[0:10,0:10]
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix,annot=True,cmap='RdBu_r')
plt.savefig('Corr_8nov_attributes.png')

<IPython.core.display.Javascript object>

In [24]:
%matplotlib notebook

corr_matrix = data_8nov_copy.corr().iloc[0:10,-8:]
plt.figure(figsize=(11,11))
sns.set(font_scale=1.3)
sns.heatmap(corr_matrix,annot=True,cmap='RdBu_r',vmin=-1,vmax=1)
plt.savefig('Corr_8nov_final.png')

<IPython.core.display.Javascript object>

In [25]:
# Append the two components of every embedding to the 22 Nov attribute frame.
# PCA / Isomap / t-SNE results are NumPy arrays, so they are written row by
# row in positional order.
data_22nov_copy['pca-1'] = pca_result_22nov[:,0]
data_22nov_copy['pca-2'] = pca_result_22nov[:,1]

data_22nov_copy['iso-1'] = iso_result_22nov[:,0]
data_22nov_copy['iso-2'] = iso_result_22nov[:,1]

data_22nov_copy['tsne-1'] = tsne_result_22nov[:,0]
data_22nov_copy['tsne-2'] = tsne_result_22nov[:,1]

# The 22 Nov frame had rows removed with dropna() and its index was never
# reset, so its labels may contain gaps. Assigning a pandas Series aligns on
# index labels, which would shift the MFA values onto the wrong rows and leave
# NaN where labels do not match. When the MFA result has exactly one row per
# remaining observation, assign it positionally (like the arrays above).
# NOTE(review): this assumes the MFA script ran on the same filtered rows in
# the same order — confirm against that script.
if len(mfa_result_22nov) == len(data_22nov_copy):
    data_22nov_copy['mfa-1'] = mfa_result_22nov.iloc[:,0].values
    data_22nov_copy['mfa-2'] = mfa_result_22nov.iloc[:,1].values
else:
    # Row counts differ: fall back to label alignment (previous behaviour).
    data_22nov_copy['mfa-1'] = mfa_result_22nov.iloc[:,0]
    data_22nov_copy['mfa-2'] = mfa_result_22nov.iloc[:,1]

In [19]:
%matplotlib notebook

corr_matrix = data_22nov_copy.corr().iloc[0:10,0:10]
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix,annot=True,cmap='RdBu_r')
plt.savefig('Corr_22nov_attributes.png')

<IPython.core.display.Javascript object>

In [26]:
%matplotlib notebook

corr_matrix = data_22nov_copy.corr().iloc[0:10,-8:]
plt.figure(figsize=(11,11))
sns.set(font_scale=1.3)
sns.heatmap(corr_matrix,annot=True,cmap='RdBu_r',vmin=-1,vmax=1)
plt.savefig('Corr_22nov_final.png')

<IPython.core.display.Javascript object>

In [27]:
# Append the two components of every embedding to the 6 Dec attribute frame.
# Column order matters for the positional slices used by the heatmaps:
# pca-1, pca-2, iso-1, iso-2, tsne-1, tsne-2, mfa-1, mfa-2.
for method_name, embedding in (
    ('pca', pca_result_6dec),
    ('iso', iso_result_6dec),
    ('tsne', tsne_result_6dec),
):
    for component in (0, 1):
        data_6dec_copy['%s-%d' % (method_name, component + 1)] = embedding[:, component]

# The MFA dimensions come from a DataFrame, so they are assigned as Series
# (aligned on the index labels).
for component in (0, 1):
    data_6dec_copy['mfa-%d' % (component + 1)] = mfa_result_6dec.iloc[:, component]

In [22]:
%matplotlib notebook

corr_matrix = data_6dec_copy.corr().iloc[0:7,0:7]
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix,annot=True,cmap='RdBu_r')
plt.savefig('Corr_6dec_attributes.png')

<IPython.core.display.Javascript object>

In [35]:
%matplotlib notebook

corr_matrix = data_6dec_copy.corr().iloc[0:7,-8:]
plt.figure(figsize=(11,11))
sns.set(font_scale=1.3)
sns.heatmap(corr_matrix,annot=True,cmap='RdBu_r',vmin=-1,vmax=1)
plt.yticks(rotation=0)
plt.savefig('Corr_6dec_final_1.png')

<IPython.core.display.Javascript object>

### Engagement over time


In [29]:
%matplotlib notebook
import matplotlib as mpl
from datetime import datetime


data_18oct['mfa-1'] = mfa_result.iloc[:,0]
data_18oct['mfa-2'] = mfa_result.iloc[:,1] 


key = '2017-11-22'
d = datetime.strptime(key,'%Y-%m-%d')
data_18oct['timestamp'] = pd.to_datetime(data_18oct['timestamp'])



min_dim1 = data_18oct['mfa-1'].min()
max_dim1 = data_18oct['mfa-1'].max()

min_dim2 = data_18oct['mfa-2'].min()
max_dim2 = data_18oct['mfa-2'].max()

"""


# Using floor for min and max values of dim1 and dim2
min_dim1 = int(min_dim1//1)
max_dim1 = int(max_dim1//1)

min_dim2 = int(min_dim2//1)
max_dim2 = int(max_dim2//1)



temp_frame = data_18oct


avg_dim1 = temp_frame['mfa-1'].mean()
avg_dim2 = temp_frame['mfa-2'].mean()


print (temp_frame.group.unique())
for group in temp_frame.group.unique():
    temp_g = temp_frame.loc[temp_frame.group == group,:]
    for attr in ['mfa-1','mfa-2']:
        locator = mpl.dates.MinuteLocator(interval=5)
       
        temp_g.plot('timestamp',attr,marker='o',ms=3)
        plt.axhline(linewidth=.5, color='k')
        plt.axhline(y=avg_dim1,linewidth=.4,color='b',linestyle='dashed',label='average')
        plt.gca().xaxis.set_major_formatter(mpl.dates.DateFormatter('%H:%M'))
        plt.gca().xaxis.set_major_locator(locator)
      
        if attr=='mfa-1':
            label = 'Engagement'
            plt.yticks(np.arange(-4,4+.5,.5))
        else:
            label = 'Physical-Technology'
            plt.yticks(np.arange(-3,3+.5,.5))
        
        plt.legend([label])
        plt.ylabel(label)
        plt.xlabel('Time')
        plt.title('Date:%s-%s-%s Group:%s'%(d.day,d.month,d.year,group))
        plt.savefig('_%s_%s'%(group,label))
plt.show()
"""

"\n\n\n# Using floor for min and max values of dim1 and dim2\nmin_dim1 = int(min_dim1//1)\nmax_dim1 = int(max_dim1//1)\n\nmin_dim2 = int(min_dim2//1)\nmax_dim2 = int(max_dim2//1)\n\n\n\ntemp_frame = data_18oct\n\n\navg_dim1 = temp_frame['mfa-1'].mean()\navg_dim2 = temp_frame['mfa-2'].mean()\n\n\nprint (temp_frame.group.unique())\nfor group in temp_frame.group.unique():\n    temp_g = temp_frame.loc[temp_frame.group == group,:]\n    for attr in ['mfa-1','mfa-2']:\n        locator = mpl.dates.MinuteLocator(interval=5)\n       \n        temp_g.plot('timestamp',attr,marker='o',ms=3)\n        plt.axhline(linewidth=.5, color='k')\n        plt.axhline(y=avg_dim1,linewidth=.4,color='b',linestyle='dashed',label='average')\n        plt.gca().xaxis.set_major_formatter(mpl.dates.DateFormatter('%H:%M'))\n        plt.gca().xaxis.set_major_locator(locator)\n      \n        if attr=='mfa-1':\n            label = 'Engagement'\n            plt.yticks(np.arange(-4,4+.5,.5))\n        else:\n            lab