
# Comparison of Original and Expanded Lexicons on the Cohort Analysis Task

Subreddit: CPTSD

In [1]:
from pathlib import Path
import pandas as pd

from config import global_config
from lexicons2 import Values, ValuesExpanded, Liwc2015, Liwc2015Expanded
from spaces import WordEmbeddings, LabelEmbeddings

In [2]:
# Subreddit under study. Used as the key into `global_config.reddit.submissions`
# and as the name component of the cohort's model and rank-file paths.
COHORT_NAME = 'CPTSD'

In [3]:
# Shared lookups: the per-subreddit submissions mapping and the models root directory.
submissions = global_config.reddit.submissions
models_root = global_config.paths.models

# Cohort: the subreddit being analysed.
cohort_corpus_path = submissions[COHORT_NAME]
cohort_models_path = f'{models_root}/{COHORT_NAME}'

# Control: IAMA submissions serve as the comparison corpus.
control_corpus_path = submissions['IAMA']
control_models_path = f'{models_root}/IAMA'

In [4]:
# Every rank CSV lives under the configured ranks directory.
ranks_dir = global_config.paths.ranks

# Label-pair distance files for the cohort (original and expanded lexicons).
cohort_ranks_csv = f'{ranks_dir}/{COHORT_NAME}_ranks.csv'
expanded_cohort_ranks_csv = f'{ranks_dir}/{COHORT_NAME}_ranks_expanded.csv'

# Label-pair distance files for the IAMA control.
control_ranks_csv = f'{ranks_dir}/IAMA_ranks.csv'
expanded_control_ranks_csv = f'{ranks_dir}/IAMA_ranks_expanded.csv'

# Cohort-vs-control rank-delta files.
relative_ranks_csv = f'{ranks_dir}/{COHORT_NAME}_relative-ranks.csv'
expanded_relative_ranks_csv = f'{ranks_dir}/{COHORT_NAME}_relative-ranks_expanded.csv'

Build Control Spaces

In [5]:
# Word-embedding space for the control corpus; both label spaces below are built on it.
control_word_space = WordEmbeddings(
	corpus_path=control_corpus_path,
	model_path=control_models_path,
).build()

In [6]:
# Control label space using the original (non-expanded) LIWC-2015 and Values lexicons.
org_control_label_space = LabelEmbeddings(
	lexicons=[Liwc2015(), Values()],
	word_embeddings=control_word_space,
).build()

In [7]:
# Control label space using the expanded LIWC-2015 and Values lexicons.
expanded_control_label_space = LabelEmbeddings(
	lexicons=[Liwc2015Expanded(), ValuesExpanded()],
	word_embeddings=control_word_space,
).build()

Build Cohort Spaces

In [8]:
# Word-embedding space for the cohort corpus; both label spaces below are built on it.
cohort_word_space = WordEmbeddings(
	corpus_path=cohort_corpus_path,
	model_path=cohort_models_path,
).build()

In [9]:
# Cohort label space using the original (non-expanded) LIWC-2015 and Values lexicons.
org_cohort_label_space = LabelEmbeddings(
	lexicons=[Liwc2015(), Values()],
	word_embeddings=cohort_word_space,
).build()

In [10]:
# Cohort label space using the expanded LIWC-2015 and Values lexicons.
expanded_cohort_label_space = LabelEmbeddings(
	lexicons=[Liwc2015Expanded(), ValuesExpanded()],
	word_embeddings=cohort_word_space,
).build()

Save and view results

In [11]:
def save_if_not_save_and_return(space: LabelEmbeddings, path: str):
	"""Return the distance table stored at ``path``, writing it first if it is missing.

	When nothing exists at ``path``, ``space.save_distances_to_csv(path)`` is
	called to create the file. A file that is already present is reused as-is
	and ``space`` is not consulted, so delete the CSV to force regeneration.

	Args:
		space: Label space whose distances are written when the file is absent.
		path: Location of the distances CSV.

	Returns:
		A DataFrame read from ``path`` with columns ``label_one``,
		``label_two`` and ``distance``. The names are supplied explicitly, so
		the first row of the file is read as data rather than as a header.
	"""
	csv_file = Path(path)
	already_saved = csv_file.exists()
	if not already_saved:
		space.save_distances_to_csv(path)

	column_names = ['label_one', 'label_two', 'distance']
	return pd.read_csv(path, names=column_names)

# Control corpus: distance tables for the original and the expanded lexicons.
control_ranks = save_if_not_save_and_return(
	space=org_control_label_space,
	path=control_ranks_csv,
)
expanded_control_ranks = save_if_not_save_and_return(
	space=expanded_control_label_space,
	path=expanded_control_ranks_csv,
)

# Cohort corpus: the same two tables.
cohort_ranks = save_if_not_save_and_return(
	space=org_cohort_label_space,
	path=cohort_ranks_csv,
)
expanded_cohort_ranks = save_if_not_save_and_return(
	space=expanded_cohort_label_space,
	path=expanded_cohort_ranks_csv,
)

In [12]:
# Column names applied to both relative-rank CSVs when they are read back.
# Kept in one place so the original and expanded tables cannot drift apart.
# NOTE(review): assumed to match the column order `compute_rank_deltas` writes — confirm in `spaces`.
RELATIVE_RANK_COLUMNS = [
	'label_one', 'label_two',
	'current_rank', 'control_rank',
	'current_distance', 'control_distance',
	'rank_delta', 'distance_delta',
]

# Original lexicons: cohort label space compared against the control label space.
org_cohort_label_space.compute_rank_deltas(org_control_label_space, relative_ranks_csv)
relative_ranks = pd.read_csv(relative_ranks_csv, names=RELATIVE_RANK_COLUMNS)

# Expanded lexicons: same comparison, read back with the same column layout.
expanded_cohort_label_space.compute_rank_deltas(expanded_control_label_space, expanded_relative_ranks_csv)
expanded_relative_ranks = pd.read_csv(expanded_relative_ranks_csv, names=RELATIVE_RANK_COLUMNS)

View Relative Ranks

In [13]:
# Cohort-vs-control rank deltas for the original lexicons (truncated DataFrame display).
relative_ranks

Unnamed: 0,label_one,label_two,current_rank,control_rank,current_distance,control_distance,rank_delta,distance_delta
0,liwc2015:function,values:feeling-good,73,43,0.438239,0.499940,30,-0.061701
1,liwc2015:function,liwc2015:money,72,51,0.403190,0.681783,21,-0.278593
2,liwc2015:function,values:learning,48,28,0.060748,0.178845,20,-0.118097
3,liwc2015:function,values:truth,75,55,0.475559,0.713501,20,-0.237941
4,liwc2015:function,liwc2015:netspeak,80,62,0.747444,0.781215,18,-0.033771
...,...,...,...,...,...,...,...,...
6967,values:animals,liwc2015:ingest,9,70,1.051697,0.981867,-61,0.069829
6968,values:animals,values:truth,7,71,1.043354,0.982374,-64,0.060980
6969,values:animals,liwc2015:filler,0,69,0.911402,0.980616,-69,-0.069214
6970,values:animals,liwc2015:sexual,3,75,1.015476,1.021016,-72,-0.005540


In [14]:
# Cohort-vs-control rank deltas for the expanded lexicons (truncated DataFrame display).
expanded_relative_ranks

Unnamed: 0,label_one,label_two,current_rank,control_rank,current_distance,control_distance,rank_delta,distance_delta
0,liwc2015:function,liwc2015:money,77,58,0.403190,0.681783,19,-0.278593
1,liwc2015:function,liwc2015:posemo,52,34,0.057296,0.259349,18,-0.202053
2,liwc2015:function,liwc2015:work,56,39,0.069775,0.336770,17,-0.266994
3,liwc2015:function,liwc2015:netspeak,83,68,0.747444,0.781215,15,-0.033771
4,liwc2015:function,liwc2015:pconcern,68,54,0.185817,0.624154,14,-0.438337
...,...,...,...,...,...,...,...,...
7135,liwc2015:filler,liwc2015:motion,26,72,0.657227,1.066124,-46,-0.408897
7136,liwc2015:filler,liwc2015:drives,1,48,0.635582,0.987538,-47,-0.351956
7137,liwc2015:filler,liwc2015:number,8,60,0.645844,1.005935,-52,-0.360090
7138,liwc2015:filler,liwc2015:certain,4,65,0.642007,1.022467,-61,-0.380461
