In [1]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
from operator import itemgetter
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

import little_mallet_wrapper as lmw

In [2]:
# Root folders for input data and generated output.
# The defaults are the original external-drive locations; set the BC_DATA_DIR /
# BC_OUTPUT_DIR environment variables to run the notebook on another machine
# without editing this cell.
data_directory_path   = os.environ.get('BC_DATA_DIR', '/Volumes/Passport-1/data/birth-control')
output_directory_path = os.environ.get('BC_OUTPUT_DIR', '/Volumes/Passport-1/output/birth-control')

## Load data

In [3]:
# Read the WebMD reviews table from the data directory.
webmd_csv_path = data_directory_path + '/final-data/webmd.csv'
webmd_df = pd.read_csv(webmd_csv_path)

In [4]:
# Number of reviews loaded.
webmd_df.shape[0]

18110

In [5]:
# Peek at three random reviews.
webmd_df.sample(n=3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,id,date,year,text,name,title,source,text_type,num_tokens,tokens_text
7166,7166,8201,8201,w8392,2009-05-14,2009,I went on Lybrel so that I could stop getting ...,lybrel-28-tablet,,webmd-reviews,pill,47,went lybrel could stop getting painful periods...
15393,15393,17265,17265,w17703,2015-09-11,2015,I just started taking this medicine 3 weeks ag...,tri-legest-fe,,webmd-reviews,pill,71,started taking medicine NUM weeks ago NUMst se...
13344,13344,15108,15108,w15477,2009-01-27,2009,I have had the strangest and most unpleasant e...,quasense,,webmd-reviews,pill,121,strangest unpleasant experience drug started u...


<br><br><br><br>

# Get list of medication names for Reddit

In [19]:
# Lift the row limit so long Series/DataFrames are displayed in full
# (global pandas display option).
pd.set_option("display.max_rows", None)

# Distinct pill product names, ordered by how many reviews each has.
pill_name_counts = webmd_df[webmd_df['text_type'] == 'pill']['name'].value_counts()

# Keep only the part of each product name before its first run of digits
# (e.g. 'lybrel-28-tablet' -> 'lybrel') and trim leftover '-' / '.' characters.
# Iterate over the index directly: the counts are not needed here, and
# Series.iteritems() no longer exists in pandas >= 2.0.
# Note: several products can reduce to the same base name, so `names`
# may contain duplicates.
names = [re.split(r'(\d+)', _name)[0].strip('-.') for _name in pill_name_counts.index]

', '.join(sorted(names))

'alesse, alesse, altavera, alyacen, amethia, amethia-lo, amethyst, apri, ashlyna, aubra, aviane, azurette, balziva, beyaz, blisovi, blisovi-fe, camila, camrese, camrese-lo, caziant, chateal, cryselle, cyclafem, cyclessa, dasetta, dasetta-triphasic, daysee, demulen, desogen-tablet, desogestrel-ethinyl-estradiol, emoquette, enpresse, enskyce, errin, estarylla, estrostep-fe, falmina, femcon-fe-tablet-chewable, generess-fe, gianvi, gildess, gildess-fe, heather, introvale, isibloom, jolessa, jolivette, junel, junel, junel-fe, junel-fe, kariva, kelnor, kurvelo, larin-fe, larissia, leena, lessina, levlen, levonorgestrel-ec, levora, levora, levora, lo-loestrin-fe, lo-ovral, loestrin, loestrin, loestrin, loestrin-fe, loestrin-fe, lomedia, loryna, loseasonique, low-ogestrel, low-ogestrel, lutera, lybrel, lyza, marlissa, microgestin, microgestin, microgestin-fe, microgestin-fe, micronor, minastrin, mircette, mono-linyah, mononessa, myzilra, natazia, necon, necon, necon, necon, necon-triphasic, ni

In [7]:
# Number of reviews per IUD product.
webmd_df.loc[webmd_df['text_type'] == 'iud', 'name'].value_counts()

mirena-device                1711
paragard-t-380-a-device       503
skyla-device                  116
liletta-device                 14
levonorgestrel-device-3yr      10
Name: name, dtype: int64

In [8]:
# Number of reviews per implant product.
webmd_df.loc[webmd_df['text_type'] == 'implant', 'name'].value_counts()

implanon-implant       459
nexplanon-implant      405
norplant-system-kit     19
Name: name, dtype: int64

## Rename columns, add source column

In [7]:
# Standardise the type column name and tag every row with its source.
webmd_df = (
    webmd_df
    .rename(columns={'bc_type': 'text_type'})
    .assign(source='webmd-reviews')
)

## Remove short reviews

In [8]:
def get_num_tokens(text):
    """Return the number of whitespace-separated tokens in `text`.

    Missing values (anything `pd.isnull` reports as null) count as 0 tokens.
    """
    return 0 if pd.isnull(text) else len(text.split())

# Count tokens per review, then keep only reviews with at least 3 tokens.
webmd_df['num_tokens'] = webmd_df['text'].apply(get_num_tokens)
has_enough_tokens = webmd_df['num_tokens'] >= 3
webmd_df = webmd_df[has_enough_tokens]
webmd_df.shape[0]

18110

## Remove duplicate reviews

In [9]:
# Keep the first occurrence of each distinct review text.
is_repeat = webmd_df.duplicated(subset='text')
webmd_df = webmd_df[~is_repeat]
len(webmd_df.index)

18110

## Drop reviews not in our three target birth control types (pill, IUD, implant)

In [10]:
# Restrict to the three contraceptive types under study.
target_types = ['pill', 'iud', 'implant']
in_target_types = webmd_df['text_type'].isin(target_types)
webmd_df = webmd_df[in_target_types]
webmd_df.shape[0]

18110

## Add tokenized text

In [11]:
def get_tokens(r):
    """Return the processed text for one review row.

    `r` is any object indexable by 'text' (a DataFrame row when used with
    `DataFrame.apply(..., axis=1)`). The text is coerced to `str` and passed
    to `lmw.process_string` with `remove_short_words=False`; the result of
    that call is returned unchanged.
    """
    review_text = str(r['text'])
    return lmw.process_string(review_text, remove_short_words=False)

In [12]:
# Store the review text as str before tokenising.
text_as_str = webmd_df['text'].astype(str)
webmd_df['text'] = text_as_str

# Tokenise row by row; get_tokens reads each row's 'text' field.
tokenized_text = webmd_df.apply(get_tokens, axis=1)
webmd_df['tokens_text'] = tokenized_text

## Final dataframe

In [13]:
# Number of reviews in the final table.
webmd_df.shape[0]

18110

In [14]:
# Spot-check three random rows of the final table.
webmd_df.sample(n=3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,date,year,text,name,title,source,text_type,num_tokens,tokens_text
10949,12017,12017,w12301,2013-05-08,2013,I have been on nexplanon for a month now. At ...,nexplanon-implant,,webmd-reviews,implant,87,nexplanon month first ok got period NUM days g...
5366,6396,6396,w6545,2015-05-20,2015,After starting this drug I have been paranoid ...,lo-loestrin-fe,,webmd-reviews,pill,82,starting drug paranoid pregnant ever since mak...
11080,12149,12149,w12434,2014-12-13,2014,I took Nora Be for 6 months. This pill is horr...,nora-be,,webmd-reviews,pill,184,took nora NUM months pill horrible doctor pres...


In [15]:
# Number of reviews per year, most frequent first.
webmd_df.loc[:, 'year'].value_counts()

2012    2475
2011    2410
2009    2263
2010    2098
2013    1885
2014    1628
2008    1549
2015    1077
2016     951
2017     495
2018     395
2019     348
2007     347
2020     189
Name: year, dtype: int64

In [16]:
# Persist the cleaned reviews.
# index=False keeps the row index out of the file: writing it adds another
# "Unnamed: 0*" column every time the CSV is read back with pd.read_csv
# (the accumulated copies are visible in the sample outputs above).
# NOTE(review): this is the same path the notebook loads from at the top, so
# the input file is overwritten in place; confirm that is intended.
webmd_df.to_csv(data_directory_path + '/final-data/webmd.csv', index=False)