In [1]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
from operator import itemgetter
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

import little_mallet_wrapper as lmw

In [2]:
data_directory_path   = '/Volumes/Passport-1/data/birth-control'
output_directory_path = '/Volumes/Passport-1/output/birth-control'

<br><br>

# Load datasets (Reddit, WebMD, Twitter)

## Reddit

In [3]:
reddit_posts_df = pd.read_csv(data_directory_path + '/final-data/reddit_posts.csv')
reddit_comments_df = pd.read_csv(data_directory_path + '/final-data/reddit_comments.csv')

In [4]:
len(reddit_posts_df.index), len(reddit_comments_df.index)

(68958, 264912)

In [5]:
reddit_comments_df.sample(3)

Unnamed: 0.1,Unnamed: 0,id,parent_id,created_utc,text,tokens_text,text_type,year,month,source
18107,1162,cf6az8m,t3_1wxgxa,1391470000.0,Theres no way to tell what you can expect. Eve...,theres way tell expect every woman experiences...,pill,2014,2,reddit-comments
224128,434,flvg7a7,t3_frd0p5,1585523000.0,I can see how the sexual thoughts could be cau...,see sexual thoughts could caused birth control...,pill,2020,3,reddit-comments
140902,3232,e62jg5q,t1_e621rv1,1537101000.0,You don't have a cycle or ovulation on the pil...,cycle ovulation pill could ovulate first misse...,pill,2018,9,reddit-comments


In [6]:
reddit_posts_df.sample(3)

Unnamed: 0.1,Unnamed: 0,id,created_utc,text,title,year,month,url,link_flair_text,tokens_text,text_type,source
21196,1002,9mgbun,1539015716,I've been seeing hair loss and thinning with b...,Taking biotin with birth control?,2018,10,https://www.reddit.com/r/birthcontrol/comments...,,taking biotin birth control seeing hair loss t...,pill,reddit-posts
3134,223,2ij9gn,1412663685,My gf has been on bc and had unprotected sex y...,I'm a guy who need clearing up about birth con...,2014,10,http://www.reddit.com/r/birthcontrol/comments/...,,m guy need clearing birth control gf bc unprot...,pill,reddit-posts
59613,1298,gk4l0i,1589527656,I love that my skin is incredibly easy to mana...,Has anyone had success switching from the comb...,2020,5,https://www.reddit.com/r/birthcontrol/comments...,Side effects!?,anyone success switching combination pill patc...,pill,reddit-posts


## WebMD

In [7]:
webmd_df = pd.read_csv(data_directory_path + '/final-data/webmd.csv')

In [8]:
len(webmd_df.index)

18110

In [9]:
webmd_df.sample(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,id,date,year,text,name,title,source,text_type,num_tokens,tokens_text
278,278,281,281,w286,2012-05-27,2012,I'm taking this for multiple reasons. Started ...,amethia,,webmd-reviews,pill,87,m taking multiple reasons started seasonique f...
5177,5177,6205,6205,w6351,2008-12-23,2008,I had no side effects at all. I went from hav...,levora-15-30-28,,webmd-reviews,pill,52,side effects went NUM day periods NUM days als...
5298,5298,6327,6327,w6476,2019-04-23,2019,"I have noticed vaginal inflammation, yeast inf...",lo-loestrin-fe,,webmd-reviews,pill,14,noticed vaginal inflammation yeast infection b...


## Twitter

In [10]:
twitter_posts_df = pd.read_csv(data_directory_path + '/final-data/twitter_posts.csv')
twitter_replies_df = pd.read_csv(data_directory_path + '/final-data/twitter_replies.csv')

In [11]:
len(twitter_posts_df.index), len(twitter_replies_df.index)

(499796, 211896)

In [12]:
twitter_posts_df.sample(3)

Unnamed: 0.1,Unnamed: 0,source,text,tokens_text,date,year,month,id,conversation_id,retweet_count,reply_count,like_count,quote_count,text_type,num_tokens
389503,491522,twitter-posts,“: he said i should be a birth control pill O_...,said birth control pill o o,2012-10-23T04:20:29.000Z,2012,10,260596533370699776,260596533370699776,1,0,0,0,pill,6
232696,293301,twitter-posts,10.13.14 Im Getn My Implanon Removed && Im Get...,NUM NUM NUM im getn implanon removed im getn p...,2011-11-18T18:49:52.000Z,2011,11,137603446382661632,137603446382661632,0,0,0,0,implant,14
221367,277349,twitter-posts,Blackhat Money Making Rapid Rapid New Birth Co...,blackhat money making rapid rapid new birth co...,2014-05-15T03:11:36.000Z,2014,5,466777893075288065,466777893075288065,0,0,0,0,implant,13


In [13]:
twitter_replies_df.sample(3)

Unnamed: 0.1,Unnamed: 0,source,text,tokens_text,text_type,date,year,month,id,conversation_id,retweet_count,reply_count,like_count,quote_count,num_tokens
69199,89688,twitter-replies,Fully aware since 09 no 07/08 b Jen...,fully aware since NUM NUM/NUM jennifer see don...,iud,2017-10-20T23:37:56.000Z,2017,10,921520894606041088,921367521395175426,0,1,0,0,19
104903,135666,twitter-replies,👀 RT “: LOL! Got my IUD so my year accounted f...,lol got iud year accounted highmostlyhigh use ...,iud,2014-01-03T02:38:30.000Z,2014,1,418934365964738560,418909871128707072,1,1,0,0,9
40614,52593,twitter-replies,That is an interesting fact -- in response t...,interesting fact response trump elected number...,iud,2019-02-09T02:49:34.000Z,2019,2,1094065743707160583,1094033890354839553,0,0,1,0,26


<br><br><br><br>

# Combine into one dataframe

In [14]:
combined_df = pd.concat([reddit_posts_df, reddit_comments_df, twitter_posts_df, twitter_replies_df, webmd_df])
len(combined_df)

1063672

In [15]:
combined_df['source'].value_counts()

twitter-posts      499796
reddit-comments    264912
twitter-replies    211896
reddit-posts        68958
webmd-reviews       18110
Name: source, dtype: int64

<br><br><br><br>

# Lexicon brainstorming

In [16]:
# random_sample_of_texts = []

# for i, r in combined_df[combined_df['source'] == 'twitter-replies'].sample(10000).iterrows():
#     random_sample_of_texts.append(' '.join(r['text'].split()))

# for i, r in combined_df[combined_df['source'] == 'twitter-posts'].sample(10000).iterrows():
#     random_sample_of_texts.append(' '.join(r['text'].split()))

# for i, r in combined_df[combined_df['source'] == 'reddit-comments'].sample(10000).iterrows():
#     random_sample_of_texts.append(' '.join(r['text'].split()))

# for i, r in combined_df[combined_df['source'] == 'reddit-posts'].sample(10000).iterrows():
#     random_sample_of_texts.append(' '.join(r['text'].split()))

# for i, r in combined_df[combined_df['source'] == 'webmd-reviews'].sample(10000).iterrows():
#     random_sample_of_texts.append(' '.join(r['text'].split()))

In [17]:
# target = 'bald'

# matches = []
# for _text in random_sample_of_texts:
#     if target in _text.lower():
#         matches.append(_text)
# len(matches)

# # category = 'nausea'

# # matches = []
# # for _text in random_sample_of_texts:
# #     _match = False
# #     for _pattern in category_patterns_dict[category]:
# #         if len(re.findall(_pattern, _text.lower())) > 0:
# #             _match = True
# #     if _match:
# #         matches.append(_text)
# # len(matches)

In [18]:
# for m in random.sample(matches, 5):
#     print(m.lower().replace(target, '______' + str(target.upper()) + '______'))
#     print()

# # for m in random.sample(matches, 10):
# #     print(m)
# #     print()

# Define the lexicon

In [19]:
category_patterns_dict = {'breast sensitivity': [r'(breast(?:s)?|boob(?:s)?) (hurt|(is|are) (sore|sensitive|tender))',
                                                 r'soreness of (breast(?:s)?|boob(?:s)?)',
                                                 r'(tender|sore|sensitive) (breast(?:s)?|boob(?:s)?)',
                                                 r'(breast(?:s)?|boob(?:s)?) pain'],
                        'nausea': ['nausea', 
                                   'naseuous', 
                                   'sick to my stomach', 
                                   'queasy', 
                                   'vomit', 
                                   'throw up', 
                                   'puke',
                                   'puking'],
                        'skin conditions': ['skin is clearing up', 
                                            'breaking out',
                                            'broke out',
                                            'break out',
                                            'acne', 
                                            'pimples', 
                                            'melasma'], 
                        'menstrual bleeding': ['spotting',
                                               'breakthrough bleed',
                                               'bleed nonstop',
                                               'period nonstop',
                                               'nonstop period',
                                               'bled for six months',
                                               'heavy period',
                                               r'period(?:s)? (became|is|are) (regular|heavy|light|irregular)',
                                               'missed period',
                                               'missed my period'
                                               'skipped period',
                                               'skipped my period',
                                               'no period',
                                               'bleed',
                                               'heavy period',
                                               'light period'], 
                        'weight & appetite': [r'(gain(?:ed)?|lost|lose) a (couple|few) pounds', 
                                              r'(gain(?:ed)?|lost|lose) weight',
                                              r'(gain(?:ed)?|lost|lose) [a-z0-9]+ (pounds|kilograms|lb|lbs|kg|stone)'
                                              'appetite',
                                              'craving',
                                              'hungry'],
                        'mental health': ['anxiety', 
                                          'mood', 
                                          'depression', 
                                          'depressed', 
                                          'anxious', 
                                          'anxiety',
                                          'mental health', 
                                          'panic attack', 
                                          'irritable', 
                                          'irritability',
                                          'emotional'], 
                        'sex drive': ['sex drive', 
                                      'sexual drive',
                                      'libido'],
                        'bloating': ['bloat',
                                     'water retention'],
                        'sleep': ['sleepy', 
                                  'can\'t sleep', 
                                  'tired all the time', 
                                  'always feel tired', 
                                  'exhausted',
                                  'fatigue', 
                                  'always tired'], 
                        'pms': ['pms'],
                        'hair loss': ['hair loss',
                                      'hair fell',
                                      'hair thinning',
                                      'bald',
                                      'hair fall',
                                      'hair shed'],
                        'headache': ['headache', 
                                     'head hurt',
                                     'head ache'],
                        'migraine': ['migraine'],
                        'infection': ['infection', 
                                      'uti'], # careful, need to search for token not any string containing (maybe do this for any target with fewer than X characters)
                        'severe effects': ['blood clot', 
                                           'pulmonary embolism', 
                                           'stroke',
                                           'heart attack'],
                        'vaginal discharge': ['discharge'],
                        'dryness': ['lubricat', 'dryness', 'wet'],
                        'pain': ['painful', 
                                 'pain', 
                                 'hurt',
                                 'agony',
                                 'cramp',
                                 'throb',
                                 'stabbing',
                                 'stabbed',
                                 'ache'], # will include headache but that's ok? should we also include 'tender', 'sensitive' which will include breast sensitivity?
                        'partner felt strings': [r'(partner|husband|boyfriend|bf|he|they) (could feel|felt) the strings'],
                        'general side effects': ['side effect']}

# Find and save all the matches

In [20]:
for _category, _patterns in category_patterns_dict.items():

    # if _category == 'breast sensitivity':

    print(str(datetime.now()) + ' ' + _category)

    _matched_ids = []

    for i, r in combined_df.iterrows():

        _text = str(r['text']).lower()
        if not pd.isnull(r['title']):
            _text += ' ' + str(r['title'])

        _match = False
        for _pattern in _patterns:
            if len(_pattern) >= 4:
                if len(re.findall(_pattern, _text.lower())) > 0:
                    _match = True
            else:
                if _pattern in _text.lower().replace('.', ' ').replace('!', ' ').replace('?', ' ' ).replace(',', ' ').replace(';', ' ').split():
                    _match = True
        if _match:
            _matched_ids.append(r['id'])

    print(str(datetime.now()) + ' ' + str(len(_matched_ids)))

    _output_file = open(output_directory_path + '/lexicon-matches/' + '_'.join(_category.split()) + '.txt', 'w')
    for _id in _matched_ids:
        _output_file.write(str(_id) + '\n')
    _output_file.close()

2022-01-04 16:56:41.650754 breast sensitivity
2022-01-04 16:57:14.656464 2715
2022-01-04 16:57:14.998437 nausea
2022-01-04 16:57:49.447805 12340
2022-01-04 16:57:49.452089 skin conditions
2022-01-04 16:58:22.574520 27497
2022-01-04 16:58:22.583279 menstrual bleeding
2022-01-04 16:59:04.755748 91666
2022-01-04 16:59:04.780407 weight & appetite
2022-01-04 16:59:41.868242 9827
2022-01-04 16:59:41.871880 mental health
2022-01-04 17:00:19.878812 57803
2022-01-04 17:00:19.895437 sex drive
2022-01-04 17:00:48.919749 15948
2022-01-04 17:00:48.925138 bloating
2022-01-04 17:01:16.581393 7236
2022-01-04 17:01:16.583999 sleep
2022-01-04 17:01:50.263464 3910
2022-01-04 17:01:50.265473 pms
2022-01-04 17:02:19.353392 5740
2022-01-04 17:02:19.361351 hair loss
2022-01-04 17:02:51.495278 3387
2022-01-04 17:02:51.496930 headache
2022-01-04 17:03:20.104164 7786
2022-01-04 17:03:20.107409 migraine
2022-01-04 17:03:46.538905 7971
2022-01-04 17:03:46.542356 infection
2022-01-04 17:04:17.763222 7914
2022-01-0

# Print a sample of matches for examination

In [21]:
target_side_effect = 'weight & appetite'

target_ids = [l.strip() for l in open(output_directory_path + '/lexicon-matches/' + '_'.join(target_side_effect.split()) + '.txt', 'r')]
target_df = combined_df[combined_df['id'].isin(target_ids)]

print(len(target_ids))

for i, r in target_df.sample(20).iterrows():
    _text = ''
    if not pd.isnull(r['title']):
        _text += r['title']
    _text += ' ' + ' '.join(r['text'].split())
    _text = _text.lower()
    for _pattern in category_patterns_dict[target_side_effect]:
        _text = re.sub(_pattern, '----------' + _pattern + '----------', _text)
    _text
    print(_text)
    print()

# made me dry
# get dry, got dry
# vaginal dryness
# dryness?

9827
 you lasted longer than me - i had a copper coil inserted in march and removed in may. i had 0 periods on the pill but was fed up of the nausea, bloating, inability to ----------(gain(?:ed)?|lost|lose) weight----------. with the coil i bled for 15+ days at a time and the mess, the smell, having to change a maxi ultra pad every hour - it got me down. i started taking the pill again a week before removal. i do a pregnancy test every month and i don't bleed at all. happy days! i don't recommend the coil.

 i had kylena inserted about a month and a half ago. the first three weeks i had extremely bad pain randomly through out the day and night, to the point that it made me cry and regret it. i thought it was tearing my uterus because i only felt it it in the left side. i called pp and they said it was normal. i was also spotting, until now. my skin got oily the first couple of weeks and i got a couple huge pimples in my legs. my mood is changing a lot!! either i am angry, anxious, cryi

# Create CSV for labeling

In [50]:
category_matches_dict = defaultdict(list)
for _file_name in os.listdir(output_directory_path + '/lexicon-matches'):
    if _file_name.endswith('.txt'):
        _category = ' '.join(_file_name.replace('.txt', '').split('_')).strip()
        print(_file_name)
        for _line in open(output_directory_path + '/lexicon-matches/' + _file_name, 'r'):
            if _line.strip():
                category_matches_dict[_category].append(_line.strip())
len(category_matches_dict)

bloating.txt
breast_sensitivity.txt
dryness.txt
general_side_effects.txt
hair_loss.txt
headache.txt
infection.txt
menstrual_bleeding.txt
mental_health.txt
migraine.txt
nausea.txt
pain.txt
partner_felt_strings.txt
pms.txt
severe_effects.txt
sex_drive.txt
skin_conditions.txt
sleep.txt
vaginal_discharge.txt
weight_&_appetite.txt


20

In [51]:
id_data_dict = {}
for i, r in combined_df.iterrows():
    # id_text_dict[str(r['id'])] = ' '.join(r['text'].split())
    # if not pd.isnull(r['title']):
    #     id_text_dict[str(r['id'])] += '\n' + r['title']
    id_data_dict[str(r['id'])] = r
len(id_data_dict)

1078577

In [52]:
dicts_to_label = []
for _category, _matches in category_matches_dict.items():
    for _id in random.sample(_matches, 10):
        _text = ' '.join(id_data_dict[_id]['text'].split())
        if not pd.isnull(id_data_dict[_id]['title']):
            _text = ' '.join(str(id_data_dict[_id]['title']).split()) + ' ' + _text
        dicts_to_label.append({'Side Effect': _category,
                               'Matched': 'yes',
                               'Source': id_data_dict[_id]['source'],
                               'ID': str(_id),
                               'Text': _text})
len(dicts_to_label)

200

In [53]:
all_ids = random.sample(list(id_data_dict.keys()), 10000)
for _category, _matches in category_matches_dict.items():
    print(_category)
    _unmatched_ids = [_id for _id in all_ids if _id not in _matches]
    for _id in random.sample(_unmatched_ids, 10):
        _text = ' '.join(id_data_dict[_id]['text'].split())
        if not pd.isnull(id_data_dict[_id]['title']):
            _text = ' '.join(str(id_data_dict[_id]['title']).split()) + ' ' + _text
        dicts_to_label.append({'Side Effect': _category,
                               'Matched': 'no',
                               'Source': id_data_dict[_id]['source'],
                               'ID': str(_id),
                               'Text': _text})
len(dicts_to_label)

bloating
breast sensitivity
dryness
general side effects
hair loss
headache
infection
menstrual bleeding
mental health
migraine
nausea
pain
partner felt strings
pms
severe effects
sex drive
skin conditions
sleep
vaginal discharge
weight & appetite


400

In [54]:
df_to_label = pd.DataFrame(dicts_to_label)
df_to_label.sample(5)

Unnamed: 0,Side Effect,Matched,Source,ID,Text
111,pain,yes,twitter-posts,903408502911115266,Hi I just got my old IUD removed and a new one...
39,general side effects,yes,twitter-posts,822339655693729792,"Really tho, should I get an IUD? What are the ..."
61,infection,yes,twitter-replies,127003196618047488,"""can't stds crawl up the SPINE of the IUD and ..."
207,bloating,no,twitter-posts,650464160371843076,I'm playing WordBrain and I've just reached br...
136,pms,yes,webmd-reviews,w5245,Im into my second pack 1st month period came a...


In [55]:
df_to_label.to_csv(output_directory_path + '/lexicon_matches_to_label.csv')