
# Analysis Comparison between Original and Expanded Lexicons on Cohort Analysis Task

Subreddit: Depression

In [1]:
from pathlib import Path
import pandas as pd

from config import global_config
from lexicons2 import Values, ValuesExpanded, Liwc2015, Liwc2015Expanded
from spaces import WordEmbeddings, LabelEmbeddings
from statistics import compute_lexicon_voc_coverage

In [2]:
COHORT_NAME = 'depression'

In [3]:
cohort_corpus_path = global_config.reddit.submissions[COHORT_NAME]
cohort_models_path = f'{global_config.paths.models}/{COHORT_NAME}'

control_corpus_path = global_config.reddit.submissions['IAMA']
control_models_path = f'{global_config.paths.models}/IAMA'

In [4]:
cohort_ranks_csv = f'{global_config.paths.ranks}/{COHORT_NAME}_ranks.csv'
expanded_cohort_ranks_csv = f'{global_config.paths.ranks}/{COHORT_NAME}_ranks_expanded.csv'

control_ranks_csv = f'{global_config.paths.ranks}/IAMA_ranks.csv'
expanded_control_ranks_csv = f'{global_config.paths.ranks}/IAMA_ranks_expanded.csv'

relative_ranks_csv = f'{global_config.paths.ranks}/{COHORT_NAME}_relative-ranks.csv'
expanded_relative_ranks_csv = f'{global_config.paths.ranks}/{COHORT_NAME}_relative-ranks_expanded.csv'

Build Control Spaces

In [5]:
control_word_space = WordEmbeddings(corpus_path=control_corpus_path, model_path=control_models_path).build()

In [6]:
org_control_label_space = LabelEmbeddings(lexicons=[Liwc2015(), Values()], word_embeddings=control_word_space).build()

In [7]:
expanded_control_label_space = LabelEmbeddings(lexicons=[Liwc2015Expanded(), ValuesExpanded()], word_embeddings=control_word_space).build()

Build Cohort Spaces

In [8]:
cohort_word_space = WordEmbeddings(corpus_path=cohort_corpus_path, model_path=cohort_models_path).build()

In [9]:
org_cohort_label_space = LabelEmbeddings(lexicons=[Liwc2015(), Values()], word_embeddings=cohort_word_space).build()

In [10]:
expanded_cohort_label_space = LabelEmbeddings(lexicons=[Liwc2015Expanded(), ValuesExpanded()], word_embeddings=cohort_word_space).build()

Save and view results

In [11]:
def save_if_not_save_and_return(space: LabelEmbeddings, path: str):
	if not Path(path).exists():
		space.save_distances_to_csv(path)
	return pd.read_csv(path, names=['label_one', 'label_two', 'distance'])

control_ranks = save_if_not_save_and_return(org_control_label_space, control_ranks_csv)
expanded_control_ranks = save_if_not_save_and_return(expanded_control_label_space, expanded_control_ranks_csv)

cohort_ranks = save_if_not_save_and_return(org_cohort_label_space, cohort_ranks_csv)
expanded_cohort_ranks = save_if_not_save_and_return(expanded_cohort_label_space, expanded_cohort_ranks_csv)

In [12]:
org_cohort_label_space.compute_rank_deltas(org_control_label_space, relative_ranks_csv)
relative_ranks = pd.read_csv(relative_ranks_csv, names=['label_one', 'label_two', 'current_rank', 'control_rank', 'current_distance', 'control_distance', 'rank_delta', 'distance_delta'])

expanded_cohort_label_space.compute_rank_deltas(expanded_control_label_space, expanded_relative_ranks_csv)
expanded_relative_ranks = pd.read_csv(expanded_relative_ranks_csv, names=['label_one', 'label_two', 'current_rank', 'control_rank', 'current_distance', 'control_distance', 'rank_delta', 'distance_delta'])

In [13]:
relative_ranks.sort_values(by='rank_delta', ascending=True, inplace=True)
expanded_relative_ranks.sort_values(by='rank_delta', ascending=True, inplace=True)

View Stuff

In [14]:
relative_ranks

Unnamed: 0,label_one,label_two,current_rank,control_rank,current_distance,control_distance,rank_delta,distance_delta
6390,liwc2015:netspeak,liwc2015:anx,2,81,0.640102,1.157257,-79,-0.517154
6141,values:religion,liwc2015:see,2,81,0.674108,1.118634,-79,-0.444526
6140,values:religion,liwc2015:death,4,82,0.700019,1.178087,-78,-0.478067
6886,values:accepting-others,liwc2015:bio,2,80,0.945920,1.168630,-78,-0.222710
6887,values:accepting-others,liwc2015:sad,3,81,0.954994,1.190451,-78,-0.235457
...,...,...,...,...,...,...,...,...
6142,liwc2015:assent,liwc2015:sexual,76,1,0.685680,0.751274,75,-0.065593
6143,liwc2015:assent,liwc2015:netspeak,77,2,0.764818,0.755847,75,0.008971
6808,values:accepting-others,liwc2015:motion,78,3,1.106400,0.890085,75,0.216315
6807,values:accepting-others,values:feeling-good,77,1,1.099986,0.811712,76,0.288275


In [15]:
expanded_relative_ranks

Unnamed: 0,label_one,label_two,current_rank,control_rank,current_distance,control_distance,rank_delta,distance_delta
6971,liwc2015:netspeak,liwc2015:anx,2,82,0.640102,1.157257,-80,-0.517154
7055,liwc2015:relig,liwc2015:leisure,5,80,0.737662,1.147455,-75,-0.409793
6887,liwc2015:ingest,liwc2015:shehe,4,78,0.442647,1.089677,-74,-0.647030
7054,liwc2015:relig,liwc2015:feel,12,83,0.757671,1.212933,-71,-0.455263
7053,liwc2015:relig,liwc2015:percept,2,72,0.732030,1.029727,-70,-0.297697
...,...,...,...,...,...,...,...,...
4620,liwc2015:friend,liwc2015:nonflu,79,2,0.702106,0.762485,77,-0.060379
6300,liwc2015:body,liwc2015:sexual,79,2,0.514425,0.788858,77,-0.274432
6720,liwc2015:assent,liwc2015:sexual,80,0,0.685680,0.751274,80,-0.065593
6721,liwc2015:assent,liwc2015:netspeak,81,1,0.764818,0.755847,80,0.008971


Debugging

In [16]:
relative_ranks[relative_ranks.label_one == 'liwc2015:i']

Unnamed: 0,label_one,label_two,current_rank,control_rank,current_distance,control_distance,rank_delta,distance_delta
331,liwc2015:i,liwc2015:feel,38,69,0.037701,0.819923,-31,-0.782221
330,liwc2015:i,liwc2015:body,50,77,0.114711,0.892732,-27,-0.778021
329,liwc2015:i,values:parents,53,70,0.132220,0.827608,-17,-0.695388
328,liwc2015:i,liwc2015:health,60,76,0.203280,0.870584,-16,-0.667304
327,liwc2015:i,liwc2015:swear,61,75,0.217005,0.865515,-14,-0.648510
...,...,...,...,...,...,...,...,...
252,liwc2015:i,liwc2015:work,51,36,0.123357,0.339661,15,-0.216304
255,liwc2015:i,values:truth,71,56,0.383042,0.698592,15,-0.315551
251,liwc2015:i,values:animals,81,65,0.918565,0.790903,16,0.127662
250,liwc2015:i,values:feeling-good,63,43,0.245943,0.517573,20,-0.271630


In [17]:
expanded_relative_ranks[expanded_relative_ranks.label_one == 'liwc2015:i']

Unnamed: 0,label_one,label_two,current_rank,control_rank,current_distance,control_distance,rank_delta,distance_delta
335,liwc2015:i,liwc2015:feel,43,73,0.037701,0.819923,-30,-0.782221
334,liwc2015:i,liwc2015:body,58,79,0.114711,0.892732,-21,-0.778021
333,liwc2015:i,liwc2015:bio,59,74,0.117285,0.826260,-15,-0.708976
331,liwc2015:i,liwc2015:health,68,78,0.203280,0.870584,-10,-0.667304
332,liwc2015:i,liwc2015:shehe,17,27,0.006368,0.155960,-10,-0.149591
...,...,...,...,...,...,...,...,...
256,liwc2015:i,liwc2015:male,65,53,0.183596,0.618264,12,-0.434668
255,liwc2015:i,liwc2015:posemo,47,34,0.047905,0.267308,13,-0.219403
254,liwc2015:i,liwc2015:netspeak,82,68,0.673767,0.780373,14,-0.106605
253,liwc2015:i,liwc2015:money,72,58,0.277020,0.680134,14,-0.403114


In [18]:
org_coverage = compute_lexicon_voc_coverage(corpus_path=cohort_corpus_path, lexicon=Liwc2015)
extended_coverage = compute_lexicon_voc_coverage(corpus_path=cohort_corpus_path, lexicon=Liwc2015Expanded)

f'Original coverage {org_coverage}, Extended coverage {extended_coverage}'

building voc for /home/mbahgat/ws/work/datasets/reddit/archives_2023/submissions_selected/txt/depression_submissions.txt: 3387391it [17:11, 3282.81it/s]


TypeError: label() missing 1 required positional argument: 'word'