# Characteristic Terms - Intent, Mental Health, and Intent x Mental Health

This notebook takes a random sample of posts from the collected Reddit intention corpus and produces scattertext plots of characteristic terms. Characteristic terms are those that are more frequent in the corpus than in a standard English-language corpus (loaded from spaCy). In addition, characteristic terms are identified for each category column: intent (Seeking advice or Venting) and group (Mental Health or General).

In [65]:
import pandas as pd
import numpy as np
import scattertext as st
import spacy
from pprint import pprint
import pickle

Load data and models

In [5]:
#read the three post tables (each indexed by post id) in one pass
MH, advice, vent = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('MH_ss.csv', 'advice_ss.csv', 'vent_ss.csv')
)

#the general corpus is the advice and vent tables stacked together
gen = pd.concat([advice, vent], sort=True)

Because generating scattertext corpora is computationally expensive, we randomly sample 5,000 vent and 5,000 advice posts.

In [38]:
#drop posts whose body is just the '[removed]' / '[deleted]' placeholder
MH_body = MH.loc[~MH['selftext'].isin(['[removed]','[deleted]'])]

#fixed random_state so the 5,000-per-intent samples (and every score and plot
#derived from them) are reproducible under Restart Kernel -> Run All
vent_samp = MH_body.loc[MH_body['intent']=='VENT'].sample(5000, random_state=42)
adv_samp = MH_body.loc[MH_body['intent']=='ADVICE'].sample(5000, random_state=42)

#balanced sample: vent rows followed by advice rows
MH_samp = pd.concat([vent_samp,adv_samp])


In [20]:
#spaCy English pipeline, passed as the `nlp` parser when building the scattertext corpora
#the 'en' shortcut link only exists in spaCy 2.x (spaCy 3 removed shortcut links),
#so load the model by package name first and fall back to the legacy shortcut
#NOTE(review): assumes 'en' was linked to en_core_web_sm (the default of
#`python -m spacy download en`) - confirm against the original environment
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    #model package not installed under its full name; try the spaCy 2.x shortcut
    nlp = spacy.load('en')

### Mental Health corpus - Characteristic Terms and Scattertext Plot

In [39]:
#build the scattertext Corpus for the mental-health sample
#(with more than 2 categories, one would instead make n one-vs-rest plots)

#documents come from 'selftext', categories from 'intent'
MH_corpus_factory = st.CorpusFromPandas(
    MH_samp,
    category_col='intent',
    text_col='selftext',
    nlp=nlp,
)
MH_corpus = MH_corpus_factory.build()



In [27]:
#ten leading terms of the scaled f-score-vs-background table for the MH corpus
MH_background_scores = MH_corpus.get_scaled_f_scores_vs_background()
print(list(MH_background_scores.index[:10]))



['bpd', 'idk', 'anxiety', 'reddit', 'anxious', 'gon', 'texted', 'instagram', 'cptsd', 'texting']


In [28]:
#term-frequency table with one scaled f-score column per intent
MH_term_freq_df = MH_corpus.get_term_freq_df()
MH_term_freq_df['Advice Score'] = MH_corpus.get_scaled_f_scores('ADVICE')
MH_term_freq_df['Vent Score'] = MH_corpus.get_scaled_f_scores('VENT')

#print the ten highest-scoring terms for advice, then for vent
for score_col in ('Advice Score', 'Vent Score'):
    ranked_terms = MH_term_freq_df.sort_values(by=score_col, ascending=False)
    pprint(list(ranked_terms.index[:10]))

['advice',
 'any advice',
 'panic',
 'depression',
 'any',
 'help',
 'do i',
 'anxiety',
 'how to',
 'have been']
['fucking',
 'fuck',
 'wish',
 'tired',
 'hate',
 'i hate',
 'shit',
 'enough',
 'everyone',
 'i wish']


In [40]:
#interactive advice-vs-vent explorer for the mental-health corpus
html = st.produce_scattertext_explorer(MH_corpus,
          category='ADVICE',
          category_name='advice',
          not_category_name='vent',
          minimum_term_frequency=20,
          width_in_pixels=1000)

#write through a context manager so the file handle is flushed and closed
#deterministically (the bare open(...).write(...) never closed it)
with open("MH-Term-Visualization_10000.html", 'wb') as out_file:
    bytes_written = out_file.write(html.encode('utf-8'))

#keep the byte count as the cell output, as before
bytes_written

13989244

### General corpus - Characteristic Terms and Scattertext Plot

Random sample of 5,000 advice posts and 5,000 vent posts.

In [41]:
#drop posts whose body is just the '[removed]' / '[deleted]' placeholder
gen_body = gen.loc[~gen['selftext'].isin(['[removed]','[deleted]'])]
#fixed random_state so the general-corpus sample is reproducible across runs
#note: vent_samp / adv_samp are rebound here and now hold the general-corpus samples
vent_samp = gen_body.loc[gen_body['intent']=='VENT'].sample(5000, random_state=42)
adv_samp = gen_body.loc[gen_body['intent']=='ADVICE'].sample(5000, random_state=42)
#balanced sample: vent rows followed by advice rows
gen_samp = pd.concat([vent_samp,adv_samp])

In [42]:
#build the scattertext Corpus for the general sample
#(with more than 2 categories, one would instead make n one-vs-rest plots)

#documents come from 'selftext', categories from 'intent'
gen_corpus_factory = st.CorpusFromPandas(
    gen_samp,
    category_col='intent',
    text_col='selftext',
    nlp=nlp,
)
gen_corpus = gen_corpus_factory.build()



In [44]:
#interactive advice-vs-vent explorer for the general corpus
html = st.produce_scattertext_explorer(gen_corpus,
          category='ADVICE',
          category_name='Advice',
          not_category_name='Vent',
          minimum_term_frequency=30,
          width_in_pixels=1000)

#write through a context manager so the file handle is flushed and closed
#deterministically (the bare open(...).write(...) never closed it)
with open("General-Intent-Term-Visualization_10000.html", 'wb') as out_file:
    bytes_written = out_file.write(html.encode('utf-8'))

#keep the byte count as the cell output, as before
bytes_written

12784539

### Use custom coordinates to plot the intent scaled f-score vs. mental health scaled f-score

In [None]:
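#build a single merged sample from the MH and general samples, adding a new 'group' column (MENTAL HEALTH vs. GENERAL)
#then use the scaled f-scores from the group and intent corpora as the x_coords and y_coords of one plot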

In [45]:
#tag each sample with its group label (the column is added to the sample frames themselves)
for sample_df, group_label in ((gen_samp, 'GENERAL'), (MH_samp, 'MENTAL HEALTH')):
    sample_df['group'] = group_label

#stack general rows first, then mental-health rows, with columns sorted
merged_samp = pd.concat([gen_samp, MH_samp], sort=True)

In [46]:
#merged corpus categorised by intent (ADVICE / VENT)
intent_corpus_factory = st.CorpusFromPandas(
    merged_samp,
    category_col='intent',
    text_col='selftext',
    nlp=nlp,
)
merged_corpus_intent = intent_corpus_factory.build()

In [47]:
#same merged documents, categorised by group (GENERAL / MENTAL HEALTH)
group_corpus_factory = st.CorpusFromPandas(
    merged_samp,
    category_col='group',
    text_col='selftext',
    nlp=nlp,
)
merged_corpus_group = group_corpus_factory.build()

In [48]:
#scaled f-scores used as plot coordinates:
#advice-vs-rest from the intent corpus, mental-health-vs-rest from the group corpus
advice_scores, health_scores = (
    merged_corpus_intent.get_scaled_f_scores('ADVICE'),
    merged_corpus_group.get_scaled_f_scores('MENTAL HEALTH'),
)

In [64]:
#per-document metadata for the explorer: "<group> (<subreddit>)"
#fetch the corpus dataframe once instead of calling get_df() twice in one expression
merged_docs_df = merged_corpus_intent.get_df()
doc_labels = merged_docs_df['group'] + ' (' + merged_docs_df['subreddit'] + ')'

#custom-coordinate plot: x = mental-health scaled f-score, y = advice scaled f-score
#NOTE(review): x_coords come from merged_corpus_group while the plotted corpus is
#merged_corpus_intent; this presumes both corpora index terms in the same order
#(they are built from the same dataframe and parser) - confirm
#`pmi_filter_thresold` is kept exactly as spelled: it is the keyword this cell
#passes to scattertext, so do not "correct" it without checking the library
html = st.produce_scattertext_explorer(merged_corpus_intent,
                                       category='ADVICE',
                                       category_name='Advice',
                                       not_category_name='Venting',
                                       minimum_term_frequency=30,
                                       pmi_filter_thresold=4,
                                       width_in_pixels=1000,
                                       scores=advice_scores,
                                       sort_by_dist=False,
                                       x_coords=health_scores,
                                       y_coords=advice_scores,
                                       show_characteristic=False,
                                       metadata=doc_labels,
                                       x_label='More Mental-Health related',
                                       y_label='More Advice-Seeking')
file_name = 'Intent_Group.html'

#write through a context manager so the file handle is flushed and closed
#deterministically (the bare open(...).write(...) never closed it)
with open(file_name, 'wb') as out_file:
    bytes_written = out_file.write(html.encode('utf-8'))

#keep the byte count as the cell output, as before
bytes_written

25624332

### Pickle corpus objects

In [66]:
#persist both merged corpora (intent-categorised first, then group-categorised)
merged_pickle_targets = (
    ('corpus_intent_10k.pickle', merged_corpus_intent),
    ('corpus_group_10k.pickle', merged_corpus_group),
)
for pickle_path, corpus_obj in merged_pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(corpus_obj, f)


In [67]:
#persist the per-group corpora (mental health first, then general)
#NOTE(review): the '5k' in the file names presumably refers to the 5,000 posts
#sampled per intent; each corpus is built from 10,000 posts in total - confirm
single_pickle_targets = (
    ('corpus_MH_5k.pickle', MH_corpus),
    ('corpus_gen_5k.pickle', gen_corpus),
)
for pickle_path, corpus_obj in single_pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(corpus_obj, f)
