
# Analysis Comparison between Original and Expanded Lexicons on Cohort Analysis Task

Subreddit: Suicide Watch

In [1]:
from pathlib import Path
import pandas as pd

from config import global_config
from lexicons2 import Values, ValuesExpanded, Liwc2015, Liwc2015Expanded
from spaces import WordEmbeddings, LabelEmbeddings
from statistics import compute_lexicon_voc_coverage

In [2]:
COHORT_NAME = 'SuicideWatch'

In [3]:
cohort_corpus_path = global_config.reddit.submissions[COHORT_NAME]
cohort_models_path = f'{global_config.paths.models}/{COHORT_NAME}'

control_corpus_path = global_config.reddit.submissions['IAMA']
control_models_path = f'{global_config.paths.models}/IAMA'

In [4]:
cohort_ranks_csv = f'{global_config.paths.ranks}/{COHORT_NAME}_ranks.csv'
expanded_cohort_ranks_csv = f'{global_config.paths.ranks}/{COHORT_NAME}_ranks_expanded.csv'

control_ranks_csv = f'{global_config.paths.ranks}/IAMA_ranks.csv'
expanded_control_ranks_csv = f'{global_config.paths.ranks}/IAMA_ranks_expanded.csv'

relative_ranks_csv = f'{global_config.paths.ranks}/{COHORT_NAME}_relative-ranks.csv'
expanded_relative_ranks_csv = f'{global_config.paths.ranks}/{COHORT_NAME}_relative-ranks_expanded.csv'

Build Control Spaces

In [5]:
control_word_space = WordEmbeddings(corpus_path=control_corpus_path, model_path=control_models_path).build()

In [6]:
org_control_label_space = LabelEmbeddings(lexicons=[Liwc2015(), Values()], word_embeddings=control_word_space).build()

In [7]:
expanded_control_label_space = LabelEmbeddings(lexicons=[Liwc2015Expanded(), ValuesExpanded()], word_embeddings=control_word_space).build()

Build Cohort Spaces

In [8]:
cohort_word_space = WordEmbeddings(corpus_path=cohort_corpus_path, model_path=cohort_models_path).build()

In [9]:
org_cohort_label_space = LabelEmbeddings(lexicons=[Liwc2015(), Values()], word_embeddings=cohort_word_space).build()

In [10]:
expanded_cohort_label_space = LabelEmbeddings(lexicons=[Liwc2015Expanded(), ValuesExpanded()], word_embeddings=cohort_word_space).build()

Save and view results

In [11]:
def save_if_not_save_and_return(space: LabelEmbeddings, path: str):
	if not Path(path).exists():
		space.save_distances_to_csv(path)
	return pd.read_csv(path, names=['label_one', 'label_two', 'distance'])

control_ranks = save_if_not_save_and_return(org_control_label_space, control_ranks_csv)
expanded_control_ranks = save_if_not_save_and_return(expanded_control_label_space, expanded_control_ranks_csv)

cohort_ranks = save_if_not_save_and_return(org_cohort_label_space, cohort_ranks_csv)
expanded_cohort_ranks = save_if_not_save_and_return(expanded_cohort_label_space, expanded_cohort_ranks_csv)

In [12]:
org_cohort_label_space.compute_rank_deltas(org_control_label_space, relative_ranks_csv)
relative_ranks = pd.read_csv(relative_ranks_csv, names=['label_one', 'label_two', 'current_rank', 'control_rank', 'current_distance', 'control_distance', 'rank_delta', 'distance_delta'])

expanded_cohort_label_space.compute_rank_deltas(expanded_control_label_space, expanded_relative_ranks_csv)
expanded_relative_ranks = pd.read_csv(expanded_relative_ranks_csv, names=['label_one', 'label_two', 'current_rank', 'control_rank', 'current_distance', 'control_distance', 'rank_delta', 'distance_delta'])

In [13]:
relative_ranks.sort_values(by='rank_delta', ascending=True, inplace=True)
expanded_relative_ranks.sort_values(by='rank_delta', ascending=True, inplace=True)

View Stuff

In [14]:
relative_ranks

Unnamed: 0,label_one,label_two,current_rank,control_rank,current_distance,control_distance,rank_delta,distance_delta
6971,values:animals,liwc2015:male,7,82,0.979822,1.117406,-75,-0.137584
6224,liwc2015:netspeak,values:feeling-good,3,74,0.550032,1.027892,-71,-0.477861
6888,values:accepting-others,values:children,6,77,1.012973,1.121012,-71,-0.108040
6970,values:animals,liwc2015:see,2,73,0.960579,1.008819,-71,-0.048241
6307,liwc2015:relig,liwc2015:risk,7,78,0.687100,1.136309,-71,-0.449209
...,...,...,...,...,...,...,...,...
1909,liwc2015:bio,liwc2015:sexual,76,4,0.409594,0.677835,72,-0.268241
5395,liwc2015:body,liwc2015:sexual,76,3,0.657257,0.788858,73,-0.131600
6806,values:accepting-others,liwc2015:male,76,2,1.114432,0.881365,74,0.233067
6059,values:religion,liwc2015:relig,75,1,0.537940,0.749945,74,-0.212005


In [15]:
expanded_relative_ranks

Unnamed: 0,label_one,label_two,current_rank,control_rank,current_distance,control_distance,rank_delta,distance_delta
3023,liwc2015:bio,liwc2015:female,1,81,0.080702,1.000952,-80,-0.920251
3863,values:parents,values:feeling-good,1,75,0.081911,0.941648,-74,-0.859737
6887,liwc2015:relig,liwc2015:risk,6,79,0.687100,1.136309,-73,-0.449209
6803,liwc2015:netspeak,liwc2015:feel,1,74,0.550640,1.003843,-73,-0.453204
5795,liwc2015:anger,liwc2015:female,4,77,0.172471,0.975643,-73,-0.803171
...,...,...,...,...,...,...,...,...
5880,liwc2015:swear,liwc2015:body,80,3,0.504107,0.763071,77,-0.258964
6132,liwc2015:body,liwc2015:sexual,80,2,0.657257,0.788858,78,-0.131600
6889,liwc2015:assent,liwc2015:sexual,78,0,0.554813,0.751274,78,-0.196461
6888,liwc2015:assent,liwc2015:netspeak,80,1,0.591694,0.755847,79,-0.164153


Debugging

In [16]:
relative_ranks[relative_ranks.label_one == 'liwc2015:i']

Unnamed: 0,label_one,label_two,current_rank,control_rank,current_distance,control_distance,rank_delta,distance_delta
330,liwc2015:i,liwc2015:health,49,76,0.048236,0.870584,-27,-0.822347
331,liwc2015:i,values:parents,43,70,0.033756,0.827608,-27,-0.793852
329,liwc2015:i,liwc2015:female,32,57,0.014581,0.715905,-25,-0.701324
328,liwc2015:i,liwc2015:feel,51,69,0.056762,0.819923,-18,-0.763161
327,liwc2015:i,liwc2015:shehe,7,24,0.001424,0.155960,-17,-0.154535
...,...,...,...,...,...,...,...,...
253,liwc2015:i,values:animals,81,65,1.023022,0.790903,16,0.232119
252,liwc2015:i,liwc2015:netspeak,79,63,0.573845,0.780373,16,-0.206528
251,liwc2015:i,liwc2015:work,52,36,0.063414,0.339661,16,-0.276247
250,liwc2015:i,values:truth,73,56,0.341823,0.698592,17,-0.356770


In [17]:
expanded_relative_ranks[expanded_relative_ranks.label_one == 'liwc2015:i']

Unnamed: 0,label_one,label_two,current_rank,control_rank,current_distance,control_distance,rank_delta,distance_delta
335,liwc2015:i,liwc2015:female,35,62,0.014581,0.715905,-27,-0.701324
334,liwc2015:i,liwc2015:health,54,78,0.048236,0.870584,-24,-0.822347
333,liwc2015:i,liwc2015:shehe,7,27,0.001424,0.155960,-20,-0.154535
332,liwc2015:i,liwc2015:feel,55,73,0.056762,0.819923,-18,-0.763161
331,liwc2015:i,liwc2015:bio,58,74,0.082031,0.826260,-16,-0.744229
...,...,...,...,...,...,...,...,...
256,liwc2015:i,liwc2015:they,32,19,0.012182,0.117329,13,-0.105146
255,liwc2015:i,liwc2015:netspeak,82,68,0.573845,0.780373,14,-0.206528
254,liwc2015:i,liwc2015:posemo,49,34,0.033887,0.267308,15,-0.233420
253,liwc2015:i,liwc2015:pconcern,71,55,0.209119,0.646136,16,-0.437018


In [18]:
org_coverage = compute_lexicon_voc_coverage(corpus_path=cohort_corpus_path, lexicon=Liwc2015)
extended_coverage = compute_lexicon_voc_coverage(corpus_path=cohort_corpus_path, lexicon=Liwc2015Expanded)

f'Original coverage {org_coverage}, Extended coverage {extended_coverage}'

building voc for /home/mbahgat/ws/work/datasets/reddit/archives_2023/submissions_selected/txt/SuicideWatch_submissions.txt: 1630163it [08:41, 3128.21it/s]


TypeError: label() missing 1 required positional argument: 'word'