In [1]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
from operator import itemgetter
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

import little_mallet_wrapper as lmw

In [2]:
# Root folders for input data and generated output.
# The defaults are the original external-drive locations; set the
# BIRTH_CONTROL_DATA_DIR / BIRTH_CONTROL_OUTPUT_DIR environment variables to
# run the notebook on another machine without editing this cell.
data_directory_path = os.environ.get(
    'BIRTH_CONTROL_DATA_DIR', '/Volumes/Passport-1/data/birth-control')
output_directory_path = os.environ.get(
    'BIRTH_CONTROL_OUTPUT_DIR', '/Volumes/Passport-1/output/birth-control')

<br><br>

# Load datasets (Reddit, WebMD, Twitter)

## Reddit

In [3]:
# Read the Reddit posts and comments CSVs from the final-data folder.
reddit_posts_df = pd.read_csv(f'{data_directory_path}/final-data/reddit_posts.csv')
reddit_comments_df = pd.read_csv(f'{data_directory_path}/final-data/reddit_comments.csv')

In [4]:
# Row counts: (posts, comments)
reddit_posts_df.shape[0], reddit_comments_df.shape[0]

(68958, 264912)

In [5]:
# Peek at three randomly chosen Reddit comments (unseeded, so rows differ per run).
reddit_comments_df.sample(n=3)

Unnamed: 0.1,Unnamed: 0,id,parent_id,created_utc,text,tokens_text,text_type,year,month,source
214409,3361,g9ov871,t3_jg79ta,1603401000.0,"It's progesterone only so no, it shouldn't. It...",progesterone shouldn estrogen main culprit thi...,pill,2020,10,reddit-comments
38669,1727,crz41jy,t3_38zj5w,1433736000.0,If you are taking the pill correctly you are p...,taking pill correctly protected time even pill...,pill,2015,6,reddit-comments
24836,69,ckv7eok,t1_ckv63rd,1411955000.0,You can take a pregnancy test as soon as the p...,take pregnancy test soon period missed since o...,pill,2014,9,reddit-comments


In [6]:
# Peek at three randomly chosen Reddit posts (unseeded, so rows differ per run).
reddit_posts_df.sample(n=3)

Unnamed: 0.1,Unnamed: 0,id,created_utc,text,title,year,month,url,link_flair_text,tokens_text,text_type,source
60639,337,hg8n4w,1593181868,"Hi guys, so I’ve been on the combination pill ...",No withdrawal bleeding for months??,2020,6,https://www.reddit.com/r/birthcontrol/comments...,Mistake or Risk?,withdrawal bleeding months hi guys combination...,pill,reddit-posts
43559,1437,bz3a40,1560200575,"I got my period on May 22, and then exactly 2 ...",Normal for birth control or pregnant?,2019,6,https://www.reddit.com/r/birthcontrol/comments...,,normal birth control pregnant got period may N...,pill,reddit-posts
59523,1176,gl4k2r,1589671160,I’ve been on junel for almost a year. Me and m...,Dark red blood and skin bits but still have 2 ...,2020,5,https://www.reddit.com/r/birthcontrol/comments...,Mistake or Risk?,dark red blood skin bits still NUM days pills ...,pill,reddit-posts


## WebMD

In [7]:
# Read the WebMD reviews CSV from the final-data folder.
webmd_df = pd.read_csv(f'{data_directory_path}/final-data/webmd.csv')

In [8]:
# Number of WebMD rows
webmd_df.shape[0]

18110

In [9]:
# Peek at three randomly chosen WebMD rows (unseeded, so rows differ per run).
webmd_df.sample(n=3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,id,date,year,text,name,title,source,text_type,num_tokens,tokens_text
11720,11720,13373,13373,w13690,2009-04-22,2009,My pharmacy switched my prescription to the ge...,ocella,,webmd-reviews,pill,143,pharmacy switched prescription generic form ya...
14246,14246,16103,16103,w16501,2013-11-13,2013,I had been taking Sprintec for several months ...,sprintec,,webmd-reviews,pill,73,taking sprintec several months everything seem...
3713,3713,4679,4679,w4788,2010-03-02,2010,I have had the Implanon in since April 2007 an...,implanon-implant,,webmd-reviews,implant,141,implanon since april NUM get removed worked NU...


## Twitter

In [10]:
# Read the Twitter posts and replies CSVs from the final-data folder.
twitter_posts_df = pd.read_csv(f'{data_directory_path}/final-data/twitter_posts.csv')
twitter_replies_df = pd.read_csv(f'{data_directory_path}/final-data/twitter_replies.csv')

In [11]:
# Row counts: (posts, replies)
twitter_posts_df.shape[0], twitter_replies_df.shape[0]

(499796, 211896)

In [12]:
# Peek at three randomly chosen Twitter posts (unseeded, so rows differ per run).
twitter_posts_df.sample(n=3)

Unnamed: 0.1,Unnamed: 0,source,text,tokens_text,date,year,month,id,conversation_id,retweet_count,reply_count,like_count,quote_count,text_type,num_tokens
245973,308920,twitter-posts,who has the nexplanon???? does it make y’all c...,nexplanon make y cry reason bc m sitting balli...,2019-03-02T08:51:34.000Z,2019,3,1101766989733863425,1101766989733863425,0,0,0,0,implant,12
176483,222263,twitter-posts,The worst thing about Hulu? EASY. I have the P...,worst thing hulu easy paragard song stuck head...,2020-07-07T23:11:25.000Z,2020,7,1280640595644551168,1280640595644551168,0,1,0,0,iud,9
177025,222839,twitter-posts,IUD Use Tied to Modest Weight Loss - U.S. News...,iud use tied modest weight loss u news world r...,2012-05-08T20:12:38.000Z,2012,5,199954986996727808,199954986996727808,0,0,0,0,iud,13


In [13]:
# Peek at three randomly chosen Twitter replies (unseeded, so rows differ per run).
twitter_replies_df.sample(n=3)

Unnamed: 0.1,Unnamed: 0,source,text,tokens_text,text_type,date,year,month,id,conversation_id,retweet_count,reply_count,like_count,quote_count,num_tokens
39406,51008,twitter-replies,Ugh this is frustrating I have had an IUD it ...,ugh frustrating iud hell absolute hell got pai...,iud,2019-02-28T03:55:20.000Z,2019,2,1100967665064165376,1100949389156085761,0,1,3,0,26
53513,69490,twitter-replies,I had a really traumatic experience trying to...,really traumatic experience trying get iud pla...,iud,2018-08-15T22:33:49.000Z,2018,8,1029858730605666304,1029851812264902657,0,1,6,0,27
109289,141983,twitter-replies,I guess its possible lol I spotted like to da...,guess possible lol spotted like days ago rando...,iud,2013-04-26T05:16:08.000Z,2013,4,327652291656159232,327651921970221056,0,0,0,0,15


In [14]:
# Fraction of Twitter replies falling in each year, most frequent first.
(
    twitter_replies_df
    .loc[:, 'year']
    .value_counts(normalize=True)
)

2020    0.243402
2019    0.224950
2018    0.128775
2017    0.089921
2014    0.055131
2012    0.053545
2013    0.052455
2016    0.051322
2015    0.041737
2011    0.034503
2010    0.016895
2009    0.006791
2008    0.000543
2007    0.000028
Name: year, dtype: float64

In [15]:
# Fraction of Twitter posts falling in each year, most frequent first.
(
    twitter_posts_df
    .loc[:, 'year']
    .value_counts(normalize=True)
)

2012    0.116656
2019    0.109375
2020    0.097354
2015    0.096898
2016    0.093218
2017    0.091539
2013    0.090409
2018    0.089238
2011    0.082324
2014    0.069548
2010    0.049764
2009    0.012667
2008    0.000866
2007    0.000144
Name: year, dtype: float64

## Combine

In [16]:
# Stack the five per-source frames into one frame. Each frame keeps its own
# row labels, so index values are not unique in the result.
combined_df = pd.concat(
    [
        reddit_posts_df,
        reddit_comments_df,
        twitter_posts_df,
        twitter_replies_df,
        webmd_df,
    ]
)
combined_df.shape[0]

1063672

In [17]:
# Number of rows contributed by each source, largest first.
(
    combined_df
    .loc[:, 'source']
    .value_counts()
)

twitter-posts      499796
reddit-comments    264912
twitter-replies    211896
reddit-posts        68958
webmd-reviews       18110
Name: source, dtype: int64

<br><br>

# Get basic stats for each dataset

In [18]:
# Values of the 'source' column, in the order the per-dataset reports are printed.
datasets = [
    'reddit-posts',
    'reddit-comments',
    'twitter-posts',
    'twitter-replies',
    'webmd-reviews',
]

In [19]:
# Print basic corpus statistics for every source: document count, vocabulary
# size, and mean number of whitespace-separated tokens per document.
for _dataset in datasets:

    _df = combined_df[combined_df['source'] == _dataset]

    # Walk the 'tokens_text' column once and tokenise each document once.
    # (Previously the frame was traversed twice with DataFrame.iterrows(),
    # which builds a Series per row and is very slow on large frames.)
    # str() is kept so non-string cells are handled exactly as before:
    # e.g. a NaN cell becomes the single token 'nan'.
    _word_count_dict = defaultdict(int)
    _doc_lengths = []
    for _text in _df['tokens_text']:
        _tokens = str(_text).split()
        _doc_lengths.append(len(_tokens))
        for _word in _tokens:
            _word_count_dict[_word] += 1

    _num_documents = len(_df.index)
    _vocab_size = len(_word_count_dict)
    _mean_tokens_per_doc = np.mean(_doc_lengths)

    print(_dataset)
    print('Number of Documents:', _num_documents)
    print('Vocabulary Size:', _vocab_size)
    print('Mean Number of Tokens per Document:', _mean_tokens_per_doc)
    print('==================================')

reddit-posts
Number of Documents: 68958
Vocabulary Size: 49088
Mean Number of Tokens per Document: 79.41001769192842
reddit-comments
Number of Documents: 264912
Vocabulary Size: 67837
Mean Number of Tokens per Document: 31.762543788125868
twitter-posts
Number of Documents: 499796
Vocabulary Size: 398910
Mean Number of Tokens per Document: 12.342963929283147
twitter-replies
Number of Documents: 211896
Vocabulary Size: 73896
Mean Number of Tokens per Document: 12.25912711896402
webmd-reviews
Number of Documents: 18110
Vocabulary Size: 17487
Mean Number of Tokens per Document: 44.84914411927112


<br><br>

# Get sparklines over years

In [20]:
# Horizontal centres of 15 equal-width slots spanning the unit interval
# (one slot per sparkline spike).
_width = 1/15
positions = [(_width*i) + (_width/2) for i in range(0, 15)]

In [21]:
# Emit LaTeX `\sparkspike <x> <height>` lines (one per year) for each source.
# Heights are the per-year share of the source's documents, rescaled so the
# busiest year has height `_max_height`.
years = list(range(2006, 2021))  # 2006 .. 2020 inclusive

for _dataset in datasets:

    _df = combined_df[combined_df['source'] == _dataset]
    _total = len(_df.index)

    # Share of this source's documents in each year. An empty source yields
    # all-zero heights instead of raising ZeroDivisionError.
    _heights = []
    for _year in years:
        _count = int((_df['year'] == _year).sum())
        _heights.append(_count / float(_total) if _total else 0.0)

    # Rescale so the tallest spike is `_max_height`; if no document falls in
    # any listed year, leave every height at zero rather than dividing by zero.
    _max_height = 0.9
    _peak = max(_heights)
    _adjustor = _max_height / _peak if _peak else 0.0
    _heights = [h*_adjustor for h in _heights]

    print(_dataset)
    # Sanity check: the three lengths should match, since zip() below would
    # silently truncate to the shortest sequence.
    print(len(years), len(positions), len(_heights))
    for _position, _height in zip(positions, _heights):
        # Raw string: '\s' is not a valid escape sequence in a normal literal.
        print(r'\sparkspike ' + str(_position) + ' ' + str(_height))
    print('=========================================')

reddit-posts
15 15 15
\sparkspike 0.03333333333333333 0.0
\sparkspike 0.1 0.0
\sparkspike 0.16666666666666666 0.0
\sparkspike 0.23333333333333334 0.0
\sparkspike 0.3 0.0
\sparkspike 0.36666666666666664 0.005445744251714401
\sparkspike 0.43333333333333335 0.042839854780153286
\sparkspike 0.5 0.07678499394917306
\sparkspike 0.5666666666666667 0.11308995562726905
\sparkspike 0.6333333333333333 0.15615671641791046
\sparkspike 0.7 0.19763513513513514
\sparkspike 0.7666666666666666 0.3000605082694635
\sparkspike 0.8333333333333334 0.4996470350947963
\sparkspike 0.9 0.8377369907220653
\sparkspike 0.9666666666666667 0.9
reddit-comments
15 15 15
\sparkspike 0.03333333333333333 0.0
\sparkspike 0.1 0.0
\sparkspike 0.16666666666666666 0.0
\sparkspike 0.23333333333333334 0.0
\sparkspike 0.3 0.0
\sparkspike 0.36666666666666664 0.00546811055845873
\sparkspike 0.43333333333333335 0.061820814707548945
\sparkspike 0.5 0.11436284077315369
\sparkspike 0.5666666666666667 0.184824970093811
\sparkspike 0.633