In [1]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
from operator import itemgetter
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

import little_mallet_wrapper as lmw

In [2]:
# Root directories for input data and generated files.
# The defaults are the original local paths; set BIRTH_CONTROL_DATA_DIR /
# BIRTH_CONTROL_OUTPUT_DIR to run the notebook on another machine without
# editing this cell.
data_directory_path   = os.environ.get('BIRTH_CONTROL_DATA_DIR',
                                       '/Users/maria/Documents/data/birth-control')
output_directory_path = os.environ.get('BIRTH_CONTROL_OUTPUT_DIR',
                                       '/Users/maria/Documents/output/birth-control')

<br><br>

# Load datasets (Reddit, WebMD, Twitter)

In [3]:
# Read the combined CSV (all sources, with one column per side effect)
# from the data directory.
combined_csv_path = data_directory_path + '/combined_df.side_effects.csv'
combined_df = pd.read_csv(combined_csv_path)

  combined_df = pd.read_csv(data_directory_path + '/combined_df.side_effects.csv')


In [4]:
# Total number of rows in the combined dataset.
combined_df.shape[0]

1063672

In [5]:
# Row count per value of the 'source' column, most frequent first.
combined_df.loc[:, 'source'].value_counts()

twitter-posts      499796
reddit-comments    264912
twitter-replies    211896
reddit-posts        68958
webmd-reviews       18110
Name: source, dtype: int64

In [6]:
# List every column of the combined frame (metadata columns plus the side-effect columns).
combined_df.columns

Index(['Unnamed: 0.2', 'Unnamed: 0', 'id', 'created_utc', 'text', 'title',
       'year', 'month', 'url', 'link_flair_text', 'tokens_text', 'text_type',
       'source', 'parent_id', 'date', 'conversation_id', 'retweet_count',
       'reply_count', 'like_count', 'quote_count', 'num_tokens',
       'Unnamed: 0.1', 'Unnamed: 0.1.1', 'name', 'any side effect', 'libido',
       'appetite', 'mood', 'pms', 'no period', 'dryness', 'cramps', 'bloating',
       'felt strings', 'dizziness', 'breasts', 'stroke', 'weight gain', 'pain',
       'general', 'headache', 'hair loss', 'fatigue', 'nausea', 'discharge',
       'heart attack', 'infection', 'blood pressure', 'skin', 'bleeding'],
      dtype='object')

<br><br>

# **Get data for method labeling**

In [60]:
# Draw 50 random rows from every (text_type, source) group for manual labeling.
# random_state is fixed so that re-running the notebook regenerates the same
# validation sample instead of silently replacing it with a different one.
df_to_label = combined_df.groupby(['text_type', 'source']).sample(50, random_state=42)

In [61]:
# Number of rows selected for labeling.
df_to_label.shape[0]

750

In [62]:
# Restrict the sample to the id, text, title, text_type and source columns.
_label_columns = ['id', 'text', 'title', 'text_type', 'source']
df_to_label = df_to_label.loc[:, _label_columns]

In [63]:
# Display the sampled rows as a sanity check before exporting them.
df_to_label

Unnamed: 0,id,text,title,text_type,source
209783,e6414z8,Yes yes you’re not alone... I’ve been bleeding...,,implant,reddit-comments
186543,duk95hn,That article is referring to Implanon which is...,,implant,reddit-comments
115195,cyl55u2,"Yep! We usually suggest the non-dominant, but ...",,implant,reddit-comments
278174,fekp3e6,I'm 19 as well and got mine a week ago. It fel...,,implant,reddit-comments
283717,g9iz3w1,The implant is pretty much the best tolerated ...,,implant,reddit-comments
...,...,...,...,...,...
1049915,w5440,Have been taking Junel for three months. Besid...,,pill,webmd-reviews
1060714,w17450,I am on my third month and I'm considering sto...,,pill,webmd-reviews
1051378,w7007,"Before I started Loestrin 24 Fe, I was taking ...",,pill,webmd-reviews
1050742,w6354,I have been taking these pills for almost a ye...,,pill,webmd-reviews


In [64]:
# Write the sample to a CSV file in the output directory.
methods_to_label_path = output_directory_path + '/validation.methods.to_label.csv'
df_to_label.to_csv(methods_to_label_path)

<br><br><br><br>

# **Get data for side effects labeling**

In [7]:
# Names of the side-effect columns in combined_df, in the order they are
# iterated when sampling rows for labeling.
side_effects = [
    'any side effect',
    'libido',
    'appetite',
    'mood',
    'pms',
    'no period',
    'dryness',
    'cramps',
    'bloating',
    'felt strings',
    'dizziness',
    'breasts',
    'stroke',
    'weight gain',
    'pain',
    'general',
    'headache',
    'hair loss',
    'fatigue',
    'nausea',
    'discharge',
    'heart attack',
    'infection',
    'blood pressure',
    'skin',
    'bleeding',
]

In [41]:
N_PER_LABEL = 4      # rows drawn per (source, side effect, YES/NO) combination
SAMPLING_SEED = 42   # fixed so re-running the cell reproduces the same sample


def _text_with_title(row):
    """Return the text shown to annotators for one row.

    The row's text is prefixed with '[TITLE: <title>] ' when the row has a
    non-null title. A null text is treated as an empty string so that
    title-only rows do not fail on string concatenation.
    """
    body = '' if pd.isnull(row['text']) else str(row['text'])
    if pd.isnull(row['title']):
        return body
    return '[TITLE: ' + str(row['title']) + '] ' + body


def _sample_records(df, source, side_effect, flag, suffix, n, rng):
    """Sample up to `n` rows for one source / side effect / flag value.

    Parameters
    ----------
    df : DataFrame with 'id', 'source', 'title', 'text' and `side_effect` columns.
    source : value the 'source' column must equal.
    side_effect : name of the side-effect column to filter on.
    flag : value the side-effect column must equal (1 or 0).
    suffix : string appended to `side_effect` in the output label ('_YES' / '_NO').
    n : maximum number of rows to draw.
    rng : random state passed to DataFrame.sample.

    Returns
    -------
    list of dicts with keys 'id', 'source', 'text', 'side_effect'.
    """
    candidates = df[(df['source'] == source) & (df[side_effect] == flag)]
    if len(candidates) < n:
        # Previously a too-small group raised ValueError inside .sample(4) and the
        # handler retried the identical call (re-raising, or duplicating rows that
        # were already appended). Report the shortfall and take what is available.
        print(source, side_effect + suffix, len(candidates))
    if candidates.empty:
        return []
    picked = candidates.sample(min(n, len(candidates)), random_state=rng)
    return [{'id': row['id'],
             'source': row['source'],
             'text': _text_with_title(row),
             'side_effect': side_effect + suffix}
            for _, row in picked.iterrows()]


# One shared generator, re-created on every run of the cell so the cell is idempotent.
_rng = np.random.RandomState(SAMPLING_SEED)

dicts_to_label = []
for _source in combined_df['source'].unique():
    for _side_effect in side_effects:
        # Positive examples first, then negative ones, for each side effect.
        for _flag, _suffix in ((1, '_YES'), (0, '_NO')):
            dicts_to_label.extend(
                _sample_records(combined_df, _source, _side_effect,
                                _flag, _suffix, N_PER_LABEL, _rng))

len(dicts_to_label)

1040

In [42]:
# Turn the list of record dicts into a DataFrame (one row per sampled text).
df_to_label = pd.DataFrame(data=dicts_to_label)

In [43]:
# Preview three random rows of the labeling table.
df_to_label.sample(n=3)

Unnamed: 0,id,source,text,side_effect
418,1197641105010876416,twitter-posts,Gynaecologists debunk myths surrounding the co...,any side effect_YES
999,w10974,webmd-reviews,I had Mirena for 1 month. I was crabby and my...,discharge_NO
133,7plwst,reddit-posts,[TITLE: iud help???] i got a mirena iud insert...,headache_NO


In [44]:
# Number of sampled rows per '<side effect>_YES' / '<side effect>_NO' label.
df_to_label.loc[:, 'side_effect'].value_counts()

any side effect_YES    20
any side effect_NO     20
pain_YES               20
pain_NO                20
general_YES            20
general_NO             20
headache_YES           20
headache_NO            20
hair loss_YES          20
hair loss_NO           20
fatigue_YES            20
fatigue_NO             20
nausea_YES             20
nausea_NO              20
discharge_YES          20
discharge_NO           20
heart attack_YES       20
heart attack_NO        20
infection_YES          20
infection_NO           20
blood pressure_YES     20
blood pressure_NO      20
skin_YES               20
skin_NO                20
bleeding_YES           20
weight gain_NO         20
weight gain_YES        20
stroke_NO              20
dryness_YES            20
libido_YES             20
libido_NO              20
appetite_YES           20
appetite_NO            20
mood_YES               20
mood_NO                20
pms_YES                20
pms_NO                 20
no period_YES          20
no period_NO

In [45]:
# Write the side-effect labeling table to a CSV file in the output directory.
side_effects_to_label_path = output_directory_path + '/validation.side_effects.to_label.csv'
df_to_label.to_csv(side_effects_to_label_path)

In [46]:
# Number of rows written for side-effect labeling.
df_to_label.shape[0]

1040