In [1]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
from operator import itemgetter
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

import little_mallet_wrapper as lmw

In [5]:
# Root folders for the input data and for generated output.
# Each one can be overridden through an environment variable so the notebook
# can run on another machine without editing this cell; when the variable is
# unset the original local path is used, so existing runs are unaffected.
data_directory_path   = os.environ.get('BIRTH_CONTROL_DATA_DIR',
                                       '/Users/maria/Documents/data/birth-control')
output_directory_path = os.environ.get('BIRTH_CONTROL_OUTPUT_DIR',
                                       '/Users/maria/Documents/output/birth-control')

<br><br>

# Load datasets (Reddit, WebMD, Twitter)

## Reddit

In [6]:
# Read the Reddit posts and comments CSV files from the final-data folder.
_reddit_posts_path = data_directory_path + '/final-data/reddit_posts.csv'
_reddit_comments_path = data_directory_path + '/final-data/reddit_comments.csv'

reddit_posts_df = pd.read_csv(_reddit_posts_path)
reddit_comments_df = pd.read_csv(_reddit_comments_path)

In [7]:
# Row counts of the Reddit frames, displayed as a (posts, comments) tuple.
len(reddit_posts_df.index), len(reddit_comments_df.index)

(68958, 264912)

In [8]:
# Preview three randomly chosen comment rows. No random_state is passed, so
# the rows shown change on every run.
reddit_comments_df.sample(3)

Unnamed: 0.1,Unnamed: 0,id,parent_id,created_utc,text,tokens_text,text_type,year,month,source
160333,4707,fbbthwc,t1_fbbq5g9,1576708000.0,&gt;It was maybe a 6/10 pain for like 3 minute...,gt maybe NUM/NUM pain like NUM minutes tops ex...,iud,2019,12,reddit-comments
77000,3509,dphtk9y,t1_dphs1ci,1510092000.0,That makes me feel so much better. If been on ...,makes feel much better every morning period NU...,implant,2017,11,reddit-comments
88116,2125,dgdkbel,t3_65uv67,1492440000.0,Mine varies but it can spot for about 2 to 5 d...,mine varies spot NUM NUM days,iud,2017,4,reddit-comments


In [9]:
# Preview three randomly chosen post rows. No random_state is passed, so
# the rows shown change on every run.
reddit_posts_df.sample(3)

Unnamed: 0.1,Unnamed: 0,id,created_utc,text,title,year,month,url,link_flair_text,tokens_text,text_type,source
7791,71,3eq1x9,1437962415,My period started on 27 June and last for a w...,"Stopped taking Alesse, then early period/spott...",2015,7,http://www.reddit.com/r/birthcontrol/comments/...,,stopped taking alesse early period/spotting pe...,pill,reddit-posts
50483,1768,enc3t9,1578771691,So I get my birth control pills through Nurx. ...,Switching back and forth?,2020,1,https://www.reddit.com/r/birthcontrol/comments...,Side Effects!?,switching back forth get birth control pills n...,pill,reddit-posts
21645,279,a073sw,1543136367,"I’d been on Yaz for years with no issues, but ...",How to feel better while I wait for new pill?,2018,11,https://www.reddit.com/r/birthcontrol/comments...,Side effects!?,feel better wait new pill d yaz years issues t...,pill,reddit-posts


## WebMD

In [10]:
# Read the WebMD CSV file from the final-data folder.
_webmd_path = data_directory_path + '/final-data/webmd.csv'
webmd_df = pd.read_csv(_webmd_path)

In [11]:
# Number of rows in the WebMD frame.
len(webmd_df.index)

18110

In [12]:
# Preview three randomly chosen WebMD rows. No random_state is passed, so
# the rows shown change on every run.
webmd_df.sample(3)

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,date,year,text,name,title,source,text_type,num_tokens,tokens_text
11018,11018,12086,12086,w12371,2017-10-26,2017,I was taking Gianvi for over a year when they ...,nikki,,webmd-reviews,pill,76,taking gianvi year switched nikki since starti...
7367,7367,8415,8415,w8610,2015-09-21,2015,This medicine has gave me a pulmonary embolism...,microgestin-fe-1-20,,webmd-reviews,pill,11,medicine gave pulmonary embolism please use
863,863,909,909,w931,2012-08-23,2012,i have been one this pill for a month and have...,azurette,,webmd-reviews,pill,15,one pill month bad break bleeding


## Twitter

In [13]:
# Read the Twitter posts and replies CSV files from the final-data folder.
_twitter_posts_path = data_directory_path + '/final-data/twitter_posts.csv'
_twitter_replies_path = data_directory_path + '/final-data/twitter_replies.csv'

twitter_posts_df = pd.read_csv(_twitter_posts_path)
twitter_replies_df = pd.read_csv(_twitter_replies_path)

In [14]:
# Row counts of the Twitter frames, displayed as a (posts, replies) tuple.
len(twitter_posts_df.index), len(twitter_replies_df.index)

(499796, 211896)

In [15]:
# Preview three randomly chosen Twitter post rows. No random_state is passed,
# so the rows shown change on every run.
twitter_posts_df.sample(3)

Unnamed: 0.1,Unnamed: 0,source,text,tokens_text,date,year,month,id,conversation_id,retweet_count,reply_count,like_count,quote_count,text_type,num_tokens
19883,22924,twitter-posts,i pray to GOD my iud doesn’t turn on me 😫 http...,pray god iud doesn turn https //t co/NUMryrwloxoi,2019-04-17T20:51:03.000Z,2019,4,1118617892868820993,1118617892868820993,0,0,0,0,iud,8
365406,459945,twitter-posts,Body party doing something to me right about n...,body party something right pops birth control ...,2013-08-26T00:42:36.000Z,2013,8,371794775361331201,371794775361331201,0,1,0,0,pill,8
440021,554181,twitter-posts,contraceptive use :) How to Choose the Right C...,contraceptive use choose right contraceptive p...,2011-08-22T07:02:18.000Z,2011,8,105535248732925952,105535248732925952,0,0,0,0,pill,9


In [16]:
# Preview three randomly chosen Twitter reply rows. No random_state is passed,
# so the rows shown change on every run.
twitter_replies_df.sample(3)

Unnamed: 0.1,Unnamed: 0,source,text,tokens_text,text_type,date,year,month,id,conversation_id,retweet_count,reply_count,like_count,quote_count,num_tokens
169206,221807,twitter-replies,get the implanon bar and you'll never have a ...,get implanon bar never period,implant,2014-06-25T03:57:52.000Z,2014,6,481647438646611968,481643759415738368,0,0,2,0,5
122572,159145,twitter-replies,shoot I've got a paraguard. I got 9 yrs on th...,shoot got paraguard got NUM yrs shit,iud,2011-04-09T20:06:29.000Z,2011,4,56810233091198976,56807853620609024,0,0,0,0,7
165047,215400,twitter-replies,Thanks! I am breaking out really bad ever si...,thanks breaking really bad ever since switchin...,implant,2017-05-10T03:28:42.000Z,2017,5,862147357621841922,862144598721806336,0,0,1,0,8


In [17]:
# Share of Twitter replies per `year` value; normalize=True returns fractions
# of the total instead of raw counts.
twitter_replies_df['year'].value_counts(normalize=True)

2020    0.243402
2019    0.224950
2018    0.128775
2017    0.089921
2014    0.055131
2012    0.053545
2013    0.052455
2016    0.051322
2015    0.041737
2011    0.034503
2010    0.016895
2009    0.006791
2008    0.000543
2007    0.000028
Name: year, dtype: float64

In [18]:
# Share of Twitter posts per `year` value; normalize=True returns fractions
# of the total instead of raw counts.
twitter_posts_df['year'].value_counts(normalize=True)

2012    0.116656
2019    0.109375
2020    0.097354
2015    0.096898
2016    0.093218
2017    0.091539
2013    0.090409
2018    0.089238
2011    0.082324
2014    0.069548
2010    0.049764
2009    0.012667
2008    0.000866
2007    0.000144
Name: year, dtype: float64

## Combine

In [19]:
# Stack the five source frames row-wise into a single corpus frame.
# ignore_index is not set, so every frame keeps its own index labels and
# labels therefore repeat across sources in the combined frame.
_source_frames = [
    reddit_posts_df,
    reddit_comments_df,
    twitter_posts_df,
    twitter_replies_df,
    webmd_df,
]
combined_df = pd.concat(_source_frames)

# Total number of rows across all sources (displayed as the cell output).
len(combined_df)

1063672

In [20]:
# Number of rows carrying each `source` label in the combined frame.
combined_df['source'].value_counts()

twitter-posts      499796
reddit-comments    264912
twitter-replies    211896
reddit-posts        68958
webmd-reviews       18110
Name: source, dtype: int64

<br><br>

# Get basic stats for each dataset

In [18]:
# Values of the `source` column to report on, in reporting order.
datasets = [
    'reddit-posts',
    'reddit-comments',
    'twitter-posts',
    'twitter-replies',
    'webmd-reviews',
]

In [19]:
# Per-source corpus statistics: number of documents, vocabulary size, and
# mean number of tokens per document. Tokens are the whitespace-separated
# pieces of the `tokens_text` column.
for _dataset in datasets:

    _df = combined_df[combined_df['source'] == _dataset]

    # Tokenize every document exactly once by iterating the column directly.
    # This avoids two DataFrame.iterrows() passes, each of which builds a
    # Series per row, and avoids splitting every document twice.
    # NOTE(review): str() maps a missing value (NaN) to the literal 'nan', so
    # if the column contains missing values each such row is counted as a
    # one-token document and 'nan' enters the vocabulary -- confirm intended.
    _token_lists = [str(_text).split() for _text in _df['tokens_text']]

    # Token frequency table for this source.
    _word_count_dict = defaultdict(int)
    for _tokens in _token_lists:
        for _word in _tokens:
            _word_count_dict[_word] += 1

    _num_documents = len(_df.index)
    _vocab_size = len(_word_count_dict)
    _mean_tokens_per_doc = np.mean([len(_tokens) for _tokens in _token_lists])

    print(_dataset)
    print('Number of Documents:', _num_documents)
    print('Vocabulary Size:', _vocab_size)
    print('Mean Number of Tokens per Document:', _mean_tokens_per_doc)
    print('==================================')

reddit-posts
Number of Documents: 68958
Vocabulary Size: 49088
Mean Number of Tokens per Document: 79.41001769192842
reddit-comments
Number of Documents: 264912
Vocabulary Size: 67837
Mean Number of Tokens per Document: 31.762543788125868
twitter-posts
Number of Documents: 499796
Vocabulary Size: 398910
Mean Number of Tokens per Document: 12.342963929283147
twitter-replies
Number of Documents: 211896
Vocabulary Size: 73896
Mean Number of Tokens per Document: 12.25912711896402
webmd-reviews
Number of Documents: 18110
Vocabulary Size: 17487
Mean Number of Tokens per Document: 44.84914411927112


<br><br>

# Get sparklines over years

In [20]:
# Horizontal centres of 15 equal-width slots that tile the interval [0, 1]:
# slot i starts at i * width and its centre sits half a width further on.
_width = 1/15
positions = [(_width*i) + (_width/2) for i in range(0, 15)]

In [21]:
# Calendar years covered by the sparklines; one spike is emitted per year.
years = [2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

# For every source, print one `\sparkspike <x> <height>` line per year
# (presumably for pasting into a LaTeX sparklines figure -- confirm).
for _dataset in datasets:

    _df = combined_df[combined_df['source'] == _dataset]
    _total = len(_df.index)

    # Fraction of this source's documents that fall in each year. Summing the
    # boolean mask counts the matching rows without building a filtered frame.
    _year_column = _df['year']
    _heights = []
    for _year in years:
        _heights.append(int((_year_column == _year).sum()) / float(_total))

    # Rescale so the busiest year reaches a height of 0.9.
    _max_height = 0.9
    _adjustor = _max_height / max(_heights)
    _heights = [h*_adjustor for h in _heights]

    print(_dataset)
    # Quick visual check that the three sequences zipped below line up.
    print(len(years), len(positions), len(_heights))
    for _position, _height in zip(positions, _heights):
        # Raw string: '\s' is not a valid escape sequence, and a plain literal
        # triggers a DeprecationWarning/SyntaxWarning on newer Python versions.
        # The printed text is unchanged.
        print(r'\sparkspike ' + str(_position) + ' ' + str(_height))
    print('=========================================')

reddit-posts
15 15 15
\sparkspike 0.03333333333333333 0.0
\sparkspike 0.1 0.0
\sparkspike 0.16666666666666666 0.0
\sparkspike 0.23333333333333334 0.0
\sparkspike 0.3 0.0
\sparkspike 0.36666666666666664 0.005445744251714401
\sparkspike 0.43333333333333335 0.042839854780153286
\sparkspike 0.5 0.07678499394917306
\sparkspike 0.5666666666666667 0.11308995562726905
\sparkspike 0.6333333333333333 0.15615671641791046
\sparkspike 0.7 0.19763513513513514
\sparkspike 0.7666666666666666 0.3000605082694635
\sparkspike 0.8333333333333334 0.4996470350947963
\sparkspike 0.9 0.8377369907220653
\sparkspike 0.9666666666666667 0.9
reddit-comments
15 15 15
\sparkspike 0.03333333333333333 0.0
\sparkspike 0.1 0.0
\sparkspike 0.16666666666666666 0.0
\sparkspike 0.23333333333333334 0.0
\sparkspike 0.3 0.0
\sparkspike 0.36666666666666664 0.00546811055845873
\sparkspike 0.43333333333333335 0.061820814707548945
\sparkspike 0.5 0.11436284077315369
\sparkspike 0.5666666666666667 0.184824970093811
\sparkspike 0.633