## Libraries

In [29]:
import os
import collections
import numpy as np
import pandas as pd

In [30]:
# additional libraries
! pip install icalendar




[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: C:\Users\louis\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


## Paths

In [31]:
# base folder path
base_path = ''

# if we are on google colab, we mount the drive
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    drive.mount('/content/drive')
    base_path = './drive/Shareddrives/2020-Makerspace-tracking'

# if we are running it locally, we use the standard gdrive path
else:
    curr_dir = os.getcwd().split('2020-Makerspace-tracking')[0]
    base_path = os.path.join(curr_dir,'2020-Makerspace-tracking')

In [32]:
# folders we'll be working with
agg_path = os.path.join(base_path, 'Data', '2022-Spr-T519', 'aggregated')
data_path = os.path.join(base_path, 'Data', '2022-Spr-T519', 'poseconnect')
#analysis_path = os.path.join(base_path, 'Analysis', '2022-Spr-Week7')

## Helpers

In [52]:
# load the script
script = os.path.join(base_path, 'Analysis', 'helpers', '[ Archives ]', 'augment_df.py')
%run "$script"

In [53]:
# load the script for generating correlation heatmaps
script_heatmap = os.path.join(base_path, 'Analysis', 'helpers', 'heatmap.py')
%run "$script_heatmap"

## Data

### Survey

In [35]:
scores_path = os.path.join(agg_path, 'outcome_participants_scores.csv')
scores_df = pd.read_csv(scores_path)
scores_df.head()

Unnamed: 0.1,Unnamed: 0,student_id,email,mid_gain_se,final_raw_se,total_gain_se,mid_gain_com,mid_gain_se_norm,mid_gain_com_norm,final_raw_com,total_gain_com,score,enjoyment,stress_level,time_on_assignment
0,0,aashna,aashnasaraf@gse.harvard.edu,-5,8,-4,0.375,0.25,0.409091,1.75,0.5,0.164773,2.769231,2.153846,5.923077
1,1,conner,ceastman@gse.harvard.edu,-7,14,-1,2.0,0.083333,1.0,2.0,2.0,0.479167,2.384615,2.615385,5.769231
2,2,chali,chalisakaewla@gse.harvard.edu,-1,10,6,0.5,0.583333,0.454545,1.5,1.5,0.50947,2.0,1.846154,4.384615
3,3,denise,denisefabella@gse.harvard.edu,-4,7,-2,-0.0625,0.333333,0.25,1.125,0.375,0.4375,3.153846,2.230769,4.384615
4,4,helen,helen_turner@gse.harvard.edu,3,16,6,-0.75,0.916667,0.0,1.5,-0.5,0.645833,3.071429,1.928571,4.285714


### Sensor data

In [37]:
# go through the poseconnect data
folder = os.path.join(data_path, 'poseconnect_cleaned')
for dir in os.listdir(folder):
    if '2022' in dir:
        subfolder = os.path.join(folder, dir)
        for subfile in os.listdir(subfolder):

            # we only care about the 3d reconstructed data
            if subfile.endswith('.csv') and '3d' in subfile:
                path = os.path.join(subfolder, subfile)
                csv = path.replace('3d_', 'summary_')

                # if the summary file already exists, we skip it
                if os.path.isfile(csv): continue

                # we read the data and add AOI columns
                data = pd.read_csv(path)
                add_aoi_to_df(data)

                # summarize the data by student, hour, aoi and save it
                summary = data.groupby(['student_id','timestamp', 'aoi']).size().unstack()
                summary.to_csv(csv)

### Combine the two together

In [38]:
import glob
import pathlib

csv_files = list(pathlib.Path(folder).rglob('*.csv'))
summary_files = [x for x in csv_files if 'summary_' in str(x)]
summary_files

[WindowsPath('c:/Users/louis/Desktop/Semester-Project/2020-Makerspace-tracking/Data/2022-Spr-T519/poseconnect/poseconnect_cleaned/2022-03-03/summary_2022-03-03.csv'),
 WindowsPath('c:/Users/louis/Desktop/Semester-Project/2020-Makerspace-tracking/Data/2022-Spr-T519/poseconnect/poseconnect_cleaned/2022-03-04/summary_2022-03-04.csv'),
 WindowsPath('c:/Users/louis/Desktop/Semester-Project/2020-Makerspace-tracking/Data/2022-Spr-T519/poseconnect/poseconnect_cleaned/2022-03-05/summary_2022-03-05.csv'),
 WindowsPath('c:/Users/louis/Desktop/Semester-Project/2020-Makerspace-tracking/Data/2022-Spr-T519/poseconnect/poseconnect_cleaned/2022-03-06/summary_2022-03-06.csv'),
 WindowsPath('c:/Users/louis/Desktop/Semester-Project/2020-Makerspace-tracking/Data/2022-Spr-T519/poseconnect/poseconnect_cleaned/2022-03-07/summary_2022-03-07.csv'),
 WindowsPath('c:/Users/louis/Desktop/Semester-Project/2020-Makerspace-tracking/Data/2022-Spr-T519/poseconnect/poseconnect_cleaned/2022-03-08/summary_2022-03-08.csv')

In [51]:
# combine all the dfs together
main_df = None

for csv in summary_files:
    df = pd.read_csv(csv)
    df.insert(0,'file', csv)
    if type(main_df) == type(None): main_df = df
    else: main_df = pd.concat(main_df, df, ignore_index=True)

#main_df = main_df.groupby(['student_id']).sum()
main_df

student_id
aashna        76841
alaa         204185
bertrand       9477
chali        773104
conner       766976
daniel       593834
denise       250627
hoa          425185
iulian        69162
ji su        502828
juan         784199
marc         399945
melissa      327055
miaoya       185338
natalie      741219
rachel       178911
rebecca      360030
rhea         687254
rui          644721
sara         193303
xiaoyi       454528
yani        1124619
dtype: int64


In [47]:
master_df = main_df.merge(scores_df, on='student_id')
master_df

Unnamed: 0.1,student_id,is_with_count,Unnamed: 0,email,mid_gain_se,final_raw_se,total_gain_se,mid_gain_com,mid_gain_se_norm,mid_gain_com_norm,final_raw_com,total_gain_com,score,enjoyment,stress_level,time_on_assignment
0,aashna,76841,0,aashnasaraf@gse.harvard.edu,-5,8,-4,0.375,0.25,0.409091,1.75,0.5,0.164773,2.769231,2.153846,5.923077
1,chali,773104,2,chalisakaewla@gse.harvard.edu,-1,10,6,0.5,0.583333,0.454545,1.5,1.5,0.50947,2.0,1.846154,4.384615
2,conner,766976,1,ceastman@gse.harvard.edu,-7,14,-1,2.0,0.083333,1.0,2.0,2.0,0.479167,2.384615,2.615385,5.769231
3,denise,250627,3,denisefabella@gse.harvard.edu,-4,7,-2,-0.0625,0.333333,0.25,1.125,0.375,0.4375,3.153846,2.230769,4.384615
4,hoa,425185,5,hoapham@gse.harvard.edu,-5,8,-5,-0.6875,0.25,0.022727,2.0,0.0,0.318182,3.153846,2.0,5.923077
5,ji su,502828,6,jlee@gse.harvard.edu,-1,15,1,0.125,0.583333,0.318182,1.6875,0.1875,0.642045,3.230769,1.769231,4.230769
6,juan,784199,7,juanpablo_garcesramirez@gse.harvard.edu,0,8,-1,0.25,0.666667,0.363636,2.0,1.125,0.590909,3.857143,2.357143,6.142857
7,melissa,327055,9,mkain@gse.harvard.edu,0,14,1,1.5,0.666667,0.818182,1.125,1.125,0.454545,2.538462,1.538462,4.076923
8,miaoya,185338,8,miaoyazhong@gse.harvard.edu,3,15,10,0.875,0.916667,0.590909,2.0,0.875,0.793561,3.230769,2.923077,3.461538
9,natalie,741219,10,nvarkey@gse.harvard.edu,-1,15,7,-0.0625,0.583333,0.25,2.0,1.0,0.583333,3.076923,2.692308,5.0


## Correlations

In [48]:
master_df.columns

Index(['student_id', 'is_with_count', 'Unnamed: 0', 'email', 'mid_gain_se',
       'final_raw_se', 'total_gain_se', 'mid_gain_com', 'mid_gain_se_norm',
       'mid_gain_com_norm', 'final_raw_com', 'total_gain_com', 'score',
       'enjoyment', 'stress_level', 'time_on_assignment'],
      dtype='object')

In [55]:
# define our predictors (rows) and outcomes (columns)
predictors = ['student_id']
outcomes = ['mid_gain_se', 'mid_gain_com', 'enjoyment', 'stress_level','mid_gain_se_norm', 'mid_gain_com_norm', 'enjoyment','stress_level', 'score']

compute_correlation(master_df, predictors, outcomes)

ValueError: could not convert string to float: 'aashna'