In [78]:
import pandas as pd
import re
import numpy as np

# Data Creation and Cleaning

##### Create tables

In [79]:
# Read the three per-trait statistic tables; each one is transposed right
# after loading (the preview below shows characters as rows, traits as columns).
mean_df, n_df, std_df = (
    pd.read_csv(csv_path, index_col=0).T
    for csv_path in ('data/mean.csv', 'data/n.csv', 'data/std.csv')
)
# The image-label table is used as stored (no transpose).
labels_df = pd.read_csv('data/image_labels.csv', index_col=0)

# Standard error = std / sqrt(n), computed element-wise.
stderr_df = std_df.div(np.sqrt(n_df))
stderr_df.head(3)

Trait,"psychopath,empath","punchable,loveable","arrogant,humble","😈,😇","selfish,altruistic","cruel,kind","vengeful,forgiving","quarrelsome,warm","impatient,patient","juvenile,mature",...,"feminine,masculine","'right-brained','left-brained'","avant-garde,classical","🎩,🧢","outsider,insider","industrial,domestic","complicated,simple","literary,mathematical","slow,fast","careful,brave"
A/1/,5.578319,10.573631,3.308922,6.026067,2.543995,2.568643,2.515576,2.695635,3.168104,2.152981,...,1.597625,3.889602,2.394826,7.2225,3.219938,2.182402,3.511505,3.335183,3.092329,3.23648
A/2/,3.226171,7.65,2.585991,5.06487,1.727468,2.204796,2.609496,2.551628,3.430799,1.360332,...,2.151796,4.018724,2.249862,6.737143,3.297506,1.66573,2.7301,2.397082,2.076212,1.954638
A/3/,7.066667,7.794229,3.528119,5.739131,3.124203,2.717465,3.571535,3.23749,2.605233,2.225619,...,3.04791,3.243723,2.344876,5.189798,3.729672,2.762733,3.301433,3.919961,3.251203,3.133916


In [80]:
# (rows, columns) of the standard-error table before any cleaning step.
stderr_df.shape

(1601, 400)

##### Remove duplicate columns

In [81]:
# Flag every column whose label repeats an earlier one, then keep only the
# unflagged columns in each table.
mean_is_repeat = mean_df.columns.duplicated()
stderr_is_repeat = stderr_df.columns.duplicated()
mean_df = mean_df.loc[:, ~mean_is_repeat]
stderr_df = stderr_df.loc[:, ~stderr_is_repeat]

##### Standardize index names

In [82]:
# Restrict the label table to the race/gender label and confidence columns.
cols = [
    'race.label',
    'race.confidence',
    'gender.label',
    'gender.confidence',
]
labels_df = labels_df.loc[:, cols]

In [83]:
# Rebuild each character ID as <letters><digits> (e.g. "A/1/" -> "A1") so the
# statistic tables and the label table share one ID format.
# NOTE(review): `[A-z]` also matches the ASCII punctuation that sits between
# "Z" and "a" ([ \ ] ^ _ `), and the unescaped `.` matches any single
# character. Presumably `[A-Za-z]` and a literal separator were intended;
# confirm against the raw IDs before tightening the patterns.
id_pattern = r'([A-z]+).([0-9]+)'

# Each table's new index is derived from its OWN labels. Previously stderr_df
# was relabeled with the IDs extracted from mean_df, which would silently
# mislabel rows if the two tables were ever in a different row order.
index = mean_df.index.str.extract(id_pattern)
mean_df.index = index[0] + index[1]
stderr_ids = stderr_df.index.str.extract(id_pattern)
stderr_df.index = stderr_ids[0] + stderr_ids[1]

# The label index keeps only its leading run of pattern characters.
labels_df.index = labels_df.index.str.extract(r'([A-z0-9]+)').iloc[:,0]

##### Remove emoji and non-word columns

In [84]:
# Keep only trait columns whose label starts with "<words>,<words>", where a
# word may contain letters, hyphens and spaces. Emoji pairs and quoted labels
# fail the pattern and are dropped.
# The pattern is a raw string: "\-" inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
# The compiled regex is unchanged.
# NOTE: `str.match` anchors only at the start of the label, so trailing
# characters after the second word group are not checked.
word_pair_pattern = r"[A-Za-z\- ]+,[A-Za-z\- ]+"
stderr_df = stderr_df.loc[:, stderr_df.columns.str.match(word_pair_pattern)]
mean_df = mean_df.loc[:, mean_df.columns.str.match(word_pair_pattern)]

##### Merge face labels

In [85]:
# Attach the image-label columns to the trait means, matching rows on the
# index of both tables (default inner join: unmatched rows are dropped).
mean_df = pd.merge(
    mean_df,
    labels_df,
    left_index=True,
    right_index=True,
)
mean_df.head(3)

Unnamed: 0,"psychopath,empath","punchable,loveable","arrogant,humble","selfish,altruistic","cruel,kind","vengeful,forgiving","quarrelsome,warm","impatient,patient","juvenile,mature","entitled,grateful",...,"outsider,insider","industrial,domestic","complicated,simple","literary,mathematical","slow,fast","careful,brave",race.label,race.confidence,gender.label,gender.confidence
A1,55.5,31.0,51.7,59.7,63.1,60.1,45.0,48.6,73.3,34.7,...,60.5,29.6,52.7,65.8,53.0,57.8,asian,1.0,male,1.0
A2,86.7,71.0,59.2,81.6,74.3,29.4,35.3,53.3,85.0,71.7,...,38.9,22.9,28.8,75.9,83.8,85.9,white,1.0,female,0.05
A3,66.2,57.2,46.3,45.1,62.9,47.2,42.6,34.0,43.2,47.9,...,49.3,42.6,59.4,58.9,44.4,35.7,white,1.0,male,0.21


In [86]:
# (rows, columns) of the standard-error table after the column filtering above.
stderr_df.shape

(1601, 362)

##### Drop nonsense columns

In [87]:
# Incorrect columns
drop_cols = ['genocidal,genocidal', 'introspective,introspective']
mean_df = mean_df.drop(drop_cols, axis=1)
stderr_df = stderr_df.drop(drop_cols, axis=1)

##### Drop characters whose image-label confidence is below `confidence_cutoff`

In [88]:
# Keep a character only if BOTH its race and gender label confidences reach
# the cutoff.
confidence_cutoff = .80
race_ok = mean_df['race.confidence'].ge(confidence_cutoff)
gender_ok = mean_df['gender.confidence'].ge(confidence_cutoff)
mean_df = mean_df[race_ok & gender_ok]

# Restrict the standard-error table to the characters that survived.
stderr_df = stderr_df.loc[mean_df.index]
mean_df.head(3)

Unnamed: 0,"psychopath,empath","punchable,loveable","arrogant,humble","selfish,altruistic","cruel,kind","vengeful,forgiving","quarrelsome,warm","impatient,patient","juvenile,mature","entitled,grateful",...,"outsider,insider","industrial,domestic","complicated,simple","literary,mathematical","slow,fast","careful,brave",race.label,race.confidence,gender.label,gender.confidence
A1,55.5,31.0,51.7,59.7,63.1,60.1,45.0,48.6,73.3,34.7,...,60.5,29.6,52.7,65.8,53.0,57.8,asian,1.0,male,1.0
A4,16.9,18.9,20.9,29.4,26.2,27.5,25.1,61.8,74.4,7.7,...,41.7,15.4,30.3,87.7,63.3,26.2,white,1.0,male,0.94
AD1,75.3,51.9,45.9,61.5,68.2,64.9,40.4,55.0,72.8,38.0,...,44.1,61.8,44.0,76.0,65.2,23.8,white,1.0,male,0.89


In [89]:
# (rows, columns) of the means table after the confidence filter; the column
# count includes the four merged label columns.
mean_df.shape

(858, 364)

##### Drop rows and columns with high standard error

In [90]:
# TODO: Delete this. Snapshot of stderr_df so the percentile cut-offs in the
# next cell can be tweaked without rerunning the whole notebook.
stderr_copy = stderr_df.copy(deep=True)

In [91]:
# TODO: Delete this. Just useful for playing with the
# percentiles below without having to rerun whole notebook
stderr_df = stderr_copy.copy()

# Blank out (set to NaN) every cell whose standard error is greater than the
# given percentile of all stacked cells. `mask` is the vectorized equivalent
# of the former element-wise lambda and avoids `DataFrame.applymap`, which is
# deprecated in recent pandas releases.
stderr_cutoff_percentile = 95
cutoff = np.percentile(stderr_df.stack(), q=stderr_cutoff_percentile)
stderr_df = stderr_df.mask(stderr_df > cutoff)

# Number of blanked/missing cells per column.
na_counts = stderr_df.isna().sum()

# Now drop the columns with the most NAs. This is a way of ensuring
# that we do not drop too many rows
# e.g. if there is a column with ALL NAs, we would drop every row
col_cutoff_percentile = 20
col_na_cutoff = np.percentile(na_counts, q=col_cutoff_percentile)
# Keep only the columns whose NA count does not exceed the column cutoff
# (one boolean selection instead of dropping columns in place in a loop).
stderr_df = stderr_df.loc[:, na_counts <= col_na_cutoff]

# Finally discard every row that still contains a NaN.
stderr_df = stderr_df.dropna()
'(Characters, traits) = ' + str(stderr_df.shape)

'(Characters, traits) = (947, 72)'

In [92]:
# Persist the merged means table (trait means plus image-label columns).
# NOTE(review): in the cells visible here, the high-standard-error filtering
# only modifies stderr_df; mean_df is written out without those rows/columns
# removed. Confirm whether mean_df should be restricted to stderr_df's
# index/columns before saving.
mean_df.to_pickle('data/merged_mean_data.pkl')