In [1]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
from operator import itemgetter
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)

import little_mallet_wrapper as lmw

In [3]:
data_directory_path   = '/Volumes/Passport-1/data/birth-control'
output_directory_path = '/Volumes/Passport-1/output/birth-control'

# topics_directory_path       = output_directory_path + '/topics-by-leann/undersampled-posts'
# webmd_topics_directory_path = output_directory_path + '/topics-by-leann/undersampled-to-webmd'
# full_topics_directory_path  = output_directory_path + '/topics-by-leann/undersampled-to-full'
# # topics_directory_path = output_directory_path + '/topics-by-me/under-sampled'

# undersampled_path = data_directory_path + '/reddit/pill-iud-implant-sampled-posts.csv'
# # full_path         = data_directory_path + '/reddit/un-sampled-posts.csv'
# full_path         = data_directory_path + '/reddit/un-sampled-posts-w-condom.csv'
# webmd_path        = data_directory_path + '/webmd/webmd_2020_iud_implant_pill.csv'

# num_topics = 30

In [2]:
keyword_type_dict = {'iud': 'iud',
                     'nexplanon': 'implant',
                     'implanon': 'implant',
                     'birth control implant': 'implant',
                     'bc pill': 'pill',
                     'birth control pill': 'pill',
                     'contraceptive pill': 'pill'}

<br><br>

# Load data

In [None]:
data_dicts = []

error_count = 0

for _file_name in os.listdir(data_directory_path + '/twitter/without-retweets'):
    if _file_name.endswith('.csv'):

        _keyword = ' '.join(_file_name.split('_')[0].split('-')).lower()

        for _line in open(data_directory_path + '/twitter/without-retweets/' + _file_name, 'r'):

            if len(_line.split()) >= 6:

                _text = ' '.join(_line.split()[5:])
                _date = _line.split()[1] + ' ' + _line.split()[2]
                _year = _date.split('-')[0]

                if _year in ['2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009']:
                    data_dicts.append({'date': _date,
                                       'year': _year,
                                       'text': _text,
                                       'type': 'tweet',
                                       'text_type': keyword_type_dict[_keyword]})

twitter_df = pd.DataFrame(data_dicts)

In [None]:
len(twitter_df)

In [None]:
twitter_df['text_type'].value_counts()

In [None]:
twitter_df['year'].value_counts()

In [None]:
twitter_df['token_text'] = twitter_df['text'].apply(lmw.process_string)

In [None]:
twitter_df = twitter_df[twitter_df['token_text'].str.len() >= 50]

In [153]:
twitter_df = twitter_df[twitter_df['token_text'].str.contains('|'.join(list(keyword_type_dict.keys())))]

In [163]:
len(twitter_df)

765857

In [155]:
twitter_df.sample(5)

Unnamed: 0,date,year,text,type,text_type,token_text
455371,2018-10-01 19:10:11,2018,@_amelight @estoesflores Q cagada. Y si probas...,tweet,implant,amelight estoesflores cagada probas con implan...
969954,2018-06-21 19:23:24,2018,Sometimes people get IUDs after having kids an...,tweet,iud,sometimes people get iuds kids stop getting pe...
501665,2011-05-25 04:39:17,2011,"Survey Question:“The use of condoms, IUD and p...",tweet,iud,survey question use condoms iud pills also con...
1262947,2011-11-09 03:24:04,2011,"Cet aprem, je suis seule pour tout le CHS. Et ...",tweet,implant,cet aprem suis seule pour tout chs dois enleve...
1184570,2020-08-19 19:46:14,2020,made an appointment to finally get my goddamn ...,tweet,iud,made appointment finally get goddamn iud check...


In [None]:
twitter_df['id'] = twitter_df['Unnamed: 0']

In [156]:
twitter_df.to_csv(data_directory_path + '/twitter/without-retweets/twitter.csv')

<br><br>

# Get dataset stats

In [108]:
word_count_dict = defaultdict(int)

for i, r in twitter_df.iterrows():
    for _word in r['token_text'].lower().split():
        word_count_dict[_word] += 1

In [109]:
len(word_count_dict)

718579

In [111]:
lengths = []
for i, r in twitter_df.iterrows():
    lengths.append(len(r['token_text'].split()))
np.mean(lengths)

14.661276191247191

In [115]:
twitter_df['year'].value_counts().mean()

63821.416666666664

In [116]:
twitter_df['text_type'].value_counts()

iud        464867
pill       211695
implant     89295
Name: text_type, dtype: int64

In [117]:
464867/765857, 211695/765857, 89295/765857

(0.6069892943460724, 0.2764158321984391, 0.11659487345548843)

<br><br>

# Undersample tweets by birth control type

In [157]:
sampled_df = twitter_df.groupby('text_type').apply(lambda x: x.sample(50000)).reset_index(drop=True)

In [158]:
len(sampled_df)

150000

In [159]:
sampled_df['text_type'].value_counts()

iud        50000
pill       50000
implant    50000
Name: text_type, dtype: int64

In [160]:
sampled_df['year'].value_counts()

2020    23828
2019    21247
2018    15434
2012    14420
2015    12858
2013    11736
2017    11700
2014    11400
2016    11102
2011     9376
2010     5346
2009     1553
Name: year, dtype: int64

In [161]:
sampled_df.to_csv(data_directory_path + '/twitter/without-retweets/twitter.undersampled.csv')

<br><br>

# Get sparklines

In [162]:
twitter_df['year'].value_counts()

2020    123231
2019    108414
2018     77503
2012     70376
2015     66370
2017     64758
2014     62392
2013     58169
2016     56873
2011     45093
2010     25213
2009      7465
Name: year, dtype: int64

In [218]:
year_count_dict = twitter_df['year'].value_counts()

x = []

i = 3
for _year in ['2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']:
    print('\\sparkspike ' + str((i/14)-0.03571428571428571) + ' ' + str(round(year_count_dict[_year] / float(765857) * 5, 4)))
    i += 1
    x.append(year_count_dict[_year] / float(765857))

\sparkspike 0.17857142857142855 0.0487
\sparkspike 0.25 0.1646
\sparkspike 0.32142857142857145 0.2944
\sparkspike 0.39285714285714285 0.4595
\sparkspike 0.4642857142857143 0.3798
\sparkspike 0.5357142857142857 0.4073
\sparkspike 0.6071428571428572 0.4333
\sparkspike 0.6785714285714286 0.3713
\sparkspike 0.75 0.4228
\sparkspike 0.8214285714285714 0.506
\sparkspike 0.8928571428571429 0.7078
\sparkspike 0.9642857142857143 0.8045


In [180]:
1 / 12

0.08333333333333333

In [181]:
0.08333333333333333/2

0.041666666666666664

In [182]:
data_directory_path = '/Volumes/Passport-1/data/birth-control'
reddit_path = data_directory_path + '/reddit/un-sampled-posts.csv'

reddit_df = pd.read_csv(reddit_path)

In [185]:
len(reddit_df)

89035

In [183]:
reddit_df['year'].value_counts()

2020.0    25818
2019.0    23837
2018.0    14073
2017.0     8361
2016.0     5585
2015.0     4417
2014.0     3240
2013.0     2278
2012.0     1259
2011.0      167
Name: year, dtype: int64

In [219]:
year_count_dict = reddit_df['year'].value_counts()

len(year_count_dict)

10

In [187]:
1/10, 0.1/2

(0.1, 0.05)

In [222]:
i = 5
for _year in ['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']:
    print('\\sparkspike ' + str((i/14)-0.03571428571428571) + ' ' + str(round(year_count_dict[float(_year)] / float(89035) * 3, 4)))
    i += 1

\sparkspike 0.32142857142857145 0.0056
\sparkspike 0.39285714285714285 0.0424
\sparkspike 0.4642857142857143 0.0768
\sparkspike 0.5357142857142857 0.1092
\sparkspike 0.6071428571428572 0.1488
\sparkspike 0.6785714285714286 0.1882
\sparkspike 0.75 0.2817
\sparkspike 0.8214285714285714 0.4742
\sparkspike 0.8928571428571429 0.8032
\sparkspike 0.9642857142857143 0.8699


In [200]:
webmd_path = data_directory_path + '/webmd/webmd_2020.csv'

webmd_df = pd.read_csv(webmd_path)

In [201]:
len(webmd_df)

20223

In [204]:
webmd_df['year'].value_counts()

2012    2673
2011    2607
2009    2582
2010    2318
2013    2025
2008    1893
2014    1808
2015    1168
2016    1033
2017     551
2007     445
2018     435
2019     399
2020     286
Name: year, dtype: int64

In [208]:
year_count_dict = webmd_df['year'].value_counts()

len(year_count_dict)

14

In [210]:
1/14, 0.07142857142857142/2

(0.07142857142857142, 0.03571428571428571)

In [217]:
i = 1
for _year in ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']:
    print('\\sparkspike ' + str((i/14)-0.0357142857) + ' ' + str(round(year_count_dict[float(_year)] / float(20223) * 7, 4)))
    i += 1

\sparkspike 0.035714285728571424 0.154
\sparkspike 0.10714285715714285 0.6552
\sparkspike 0.1785714285857143 0.8937
\sparkspike 0.2500000000142857 0.8024
\sparkspike 0.32142857144285714 0.9024
\sparkspike 0.39285714287142853 0.9252
\sparkspike 0.4642857143 0.7009
\sparkspike 0.5357142857285714 0.6258
\sparkspike 0.6071428571571429 0.4043
\sparkspike 0.6785714285857143 0.3576
\sparkspike 0.7500000000142857 0.1907
\sparkspike 0.8214285714428571 0.1506
\sparkspike 0.8928571428714286 0.1381
\sparkspike 0.9642857143 0.099


There were no suggested changes from our reviewers, so all of our changes were for clarity, to include more related work, or to update the review quotations after receiving permission from the authors (as explained previously in Section 9).

- We made many small wording and aesthetic changes to the paper. For example, we changed the colors of some of the figures, fixed typos, and added small clarifications wherever necessary.

- We added summary labels to our contributions in Section 1.

- We slightly extended our discussion of one of the references in Section 2.2.

- We extended our explanation in Section 3.2 of why we restrict our analysis to 20 genres.

- We added an example review to the end of Section 3.4.

- We extend our explanation in Section 5 about why we rely on this classifier for surprisal scores.

- We removed Figure 8 and the paragraph describing it at the end of Section 7. After re-reading the paper, this figure and its explanation seem inessential and added little to the understanding generated in the rest of the section. If we should put this figure back into the paper, we're happy to do so.

- We added a paragraph of references and discussion to Section 8: Audience and Reception to better include work from the digital humanities.

- We replaced some of the quotations and added usernames based on the feedback we received after reaching out to the review authors. All quotations are now used with explicit permission.

In [4]:
twitter_df = pd.read_csv(data_directory_path + '/twitter/without-retweets/twitter.csv')

In [5]:
tweets = twitter_df.sample(1000)['text'].tolist()

In [6]:
for t in random.sample(tweets, 1000):
    print(t)

Colorado's investment in IUDs and other fire-and-forget birth control produced a "miracle." Teen births and... https://t.co/5GahvTZipS
When Your IUD Goes MIA http://t.co/muBliZSJcH #parenting #sometimesyoujusthavetolaugh
@jenevie073097 It’s actually not bad, well I got an IUD and I’ve had maybe 1 issue but I don’t wanna say it here lol but yeah that’s it and I’ve been having it for a year
Despite Democrat insistence, #HobbyLobby pays for ALL contraceptives with the exception of Plan B, Ella, and IUDs.
IUD or carton of cigarettes? That's a whole lotta packaging. #BirthControl http://t.co/kB9rJTGDnW
@NicCageMatch Even more awkward: discovering she had her name engraved on her IUD.
@ladylapresta girl i just got an IUD simply because i don’t know if i’ll be able to get BC at all after the election 😅
Go to cakeshop tonight ! Chorizo straight outta Mexico is back! Plus Escapeism + IUD + Shells… https://t.co/ZhFmY81K6i
i HIGHLY RECOMMEND skyla iud ( birth control ) to any girl that’s seriousl

In [11]:
for _line in open('labeled_tweets.txt', 'r'):
    _types = []
    for _keyword, _type in keyword_type_dict.items():
        if _keyword in _line.lower():
            _types.append(_type)
    if len(_types) > 0:
        print(_types[0])
    else:
        print()

iud
iud
iud
iud
iud
iud
iud
iud
iud
iud
iud
pill
iud
iud
iud
pill
pill
iud
pill
pill
iud
pill
pill
pill
iud
iud
iud
iud
iud
iud
iud
pill
pill
pill
iud
iud
iud
iud
iud
pill
pill
implant
implant
iud
pill
iud
iud
iud
pill
iud
pill
iud
iud
iud
iud
pill
iud
iud
iud
iud
iud
iud
pill

pill
iud
iud
pill
iud
pill
implant
iud
iud
iud
pill
iud
iud
iud
iud
implant
iud
pill
iud
iud
iud
iud
iud
iud
iud
iud
iud
pill
iud
pill
pill
iud
pill
iud
pill
pill
iud
iud
iud
pill
pill
pill
iud
pill
iud
pill
iud
iud
implant
iud
pill
pill
iud
implant
pill
iud
implant
iud
pill
pill
pill
implant

iud
iud
iud
pill
pill
iud
pill
iud
iud
iud
iud
iud
iud
iud
iud
iud
iud
iud
iud
iud
implant
iud
iud
iud
iud
iud
iud
iud
iud
iud
iud
iud
iud
implant
iud
implant
iud
iud
pill
iud
pill
iud
iud
pill
iud
pill
pill
iud
iud
iud
iud
iud

pill
iud
iud
iud
iud
pill
pill
implant
iud
pill
iud
iud
pill
pill
iud
iud
iud
implant

iud
iud
iud
pill
iud
pill
iud
implant
iud
pill
implant
iud
iud

implant
iud
iud
iud
iud
iud
iud
pill
iud
iud
i