In [1]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
from operator import itemgetter
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)

import little_mallet_wrapper as lmw

In [2]:
scraped_directory_path = '/Volumes/Passport-1/data/birth-control/reddit/scraped'
data_directory_path   = '/Volumes/Passport-1/data/birth-control'

<br><br><br><br>

# Extract pill keywords from WebMD

In [3]:
webmd_pill_keywords = ['sprintec', 'yaz', 'tri-sprintec', 'loestrin', 'lo-loestrin-fe', 'trinessa', 'junel-fe', 'generess-fe', 'mononessa', 'seasonique', 'azurette', 
                       'minastrin', 'beyaz', 'ocella', 'lutera', 'ortho-tri-cyclen-lo', 'microgestin-fe', 'loryna', 'errin', 'gildess-fe', 'sronyx', 'necon', 
                       'tri-previfem', 'cryselle', 'yasmin', 'viorele', 'jolivette', 'aviane', 'natazia', 'reclipsen', 'apri', 'ortho-tri-cyclen', 
                       'femcon-fe-tablet-chewable', 'camila', 'jolessa', 'amethia', 'kariva', 'nora-be', 'quasense', 'ortho-cyclen', 'junel-fe', 'gianvi', 'low-ogestrel', 
                       'microgestin-fe', 'lybrel', 'enpresse', 'ortho-micronor', 'tri-estarylla', 'loseasonique', 'trivora', 'balziva', 'alesse', 'aubra', 'vestura', 
                       'lo-ovral', 'nortrel', 'camrese', 'levora', 'portia', 'chateal', 'zovia', 'orsythia', 'tri-lo-sprintec', 'nortrel', 'levora', 'mono-linyah', 
                       'microgestin', 'junel', 'norethindrone', 'previfem', 'nikki', 'syeda', 'lessina', 'zenchent', 'tarina-fe', 'loestrin-fe', 'falmina', 'tri-linyah', 
                       'tilia-fe', 'vienva', 'introvale', 'lomedia', 'desogen-tablet', 'norgestimate-ethinyl-estradiol', 'emoquette', 'alyacen', 'quartette', 'altavera', 
                       'daysee', 'kelnor', 'necon-triphasic', 'amethyst', 'nor-q-d-tablet', 'junel', 'mircette', 'safyral', 'caziant', 'micronor', 'enskyce', 'estarylla', 
                       'heather', 'tri-legest-fe', 'zarah', 'nortrel-triphasic', 'levlen', 'amethia-lo', 'ashlyna', 'ortho-tri-cyclen', 'blisovi-fe', 'microgestin', 'lyza', 
                       'camrese-lo', 'loestrin', 'solia-tablet', 'kurvelo', 'levonorgestrel-ec', 'ortho-novum', 'ortho-novum-triphasic', 'cyclafem', 'myzilra', 'necon', 
                       'norinyl', 'estrostep-fe', 'velivet', 'ovcon', 'cyclessa', 'ovcon', 'marlissa', 'sharobel', 'ortho-cept', 'ortho-cyclen', 'loestrin-fe', 
                       'zeosa-tablet-chewable', 'pimtrea', 'gildess', 'dasetta', 'necon', 'norgestrel-ethiny-estra', 'leena', 'larin-fe', 'triphasil', 'levora', 'larissia', 
                       'loestrin', 'nordette', 'desogestrel-ethinyl-estradiol', 'vyfemla', 'zenchent-fe', 'ogestrel', 'low-ogestrel', 'necon', 'norethindron-ethinyl-estradiol-tablet-contraceptives', 
                       'nortrel', 'demulen', 'dasetta-triphasic', 'tri-norinyl', 'tri-levlen', 'blisovi', 'zovia', 'isibloom', 'alesse']
webmd_pill_keywords += [n.replace('-', ' ') for n in webmd_pill_keywords if '-' in n]

', '.join(["'" + n + "'" for n in webmd_pill_keywords])

"'sprintec', 'yaz', 'tri-sprintec', 'loestrin', 'lo-loestrin-fe', 'trinessa', 'junel-fe', 'generess-fe', 'mononessa', 'seasonique', 'azurette', 'minastrin', 'beyaz', 'ocella', 'lutera', 'ortho-tri-cyclen-lo', 'microgestin-fe', 'loryna', 'errin', 'gildess-fe', 'sronyx', 'necon', 'tri-previfem', 'cryselle', 'yasmin', 'viorele', 'jolivette', 'aviane', 'natazia', 'reclipsen', 'apri', 'ortho-tri-cyclen', 'femcon-fe-tablet-chewable', 'camila', 'jolessa', 'amethia', 'kariva', 'nora-be', 'quasense', 'ortho-cyclen', 'junel-fe', 'gianvi', 'low-ogestrel', 'microgestin-fe', 'lybrel', 'enpresse', 'ortho-micronor', 'tri-estarylla', 'loseasonique', 'trivora', 'balziva', 'alesse', 'aubra', 'vestura', 'lo-ovral', 'nortrel', 'camrese', 'levora', 'portia', 'chateal', 'zovia', 'orsythia', 'tri-lo-sprintec', 'nortrel', 'levora', 'mono-linyah', 'microgestin', 'junel', 'norethindrone', 'previfem', 'nikki', 'syeda', 'lessina', 'zenchent', 'tarina-fe', 'loestrin-fe', 'falmina', 'tri-linyah', 'tilia-fe', 'vienv

In [7]:
remaining_reddit_pill_keywords = []
for _keyword in type_keywords_dict['pill']:
    if _keyword not in webmd_pill_keywords:
        remaining_reddit_pill_keywords.append(_keyword)
', '.join(["'" + n + "'" for n in remaining_reddit_pill_keywords])

"'oral contracept', 'oral birth control', 'pill', 'pills', 'minipill', 'levonorgestrel', 'femcon', 'desogestrel', 'larin', 'norgestimate', 'zeosa', 'tilia', 'desogen', 'nor', 'generess', 'norgestrel', 'estrostep', 'tarina', 'solia', 'loestren', 'loloestrin', 'gedarel', 'pack', 'placebo', 'tri cyclen', 'tricyclen', 'linessa', 'taytulla', 'lolo', 'tri jordyna', 'trijordyna'"

<br><br><br><br>

# Final keywords

In [6]:
type_keywords_dict = {'iud': ['iud', 'mirena', 'skyla', 'liletta', 'paragard', 'paraguard', 'kyleena', 'copper', 'coil'], 
                      'implant': ['implanon', 'nexplanon', 'implant', 'norplant'],
                      'ring': ['nuvaring', 'etonogestrel', 'ring', 'nuva ring'], 
                      'shot': ['shot',  'inject', 'injection', 'depo', 'provera', 'depoprovera'], 
                      'patch': ['ortho evra', 'xulane', 'patch'],
                      'pill': ['oral contracept', 'oral birth control', 'pill', 'pills', 'minipill', 'levonorgestrel', 'femcon', 'desogestrel', 'larin', 'norgestimate', 
                               'zeosa', 'tilia', 'desogen', 'nor', 'generess', 'norgestrel', 'estrostep', 'tarina', 'solia', 'loestren', 'loloestrin', 'gedarel', 'pack', 
                               'placebo', 'tri cyclen', 'tricyclen', 'linessa', 'taytulla', 'lolo', 'tri jordyna', 'trijordyna',
                               'sprintec', 'yaz', 'tri-sprintec', 'loestrin', 'lo-loestrin-fe', 'trinessa', 'junel-fe', 'generess-fe', 'mononessa', 'seasonique', 'azurette', 
                               'minastrin', 'beyaz', 'ocella', 'lutera', 'ortho-tri-cyclen-lo', 'microgestin-fe', 'loryna', 'errin', 'gildess-fe', 'sronyx', 'necon', 
                               'tri-previfem', 'cryselle', 'yasmin', 'viorele', 'jolivette', 'aviane', 'natazia', 'reclipsen', 'apri', 'ortho-tri-cyclen', 'femcon-fe-tablet-chewable', 
                               'camila', 'jolessa', 'amethia', 'kariva', 'nora-be', 'quasense', 'ortho-cyclen', 'junel-fe', 'gianvi', 'low-ogestrel', 'microgestin-fe', 
                               'lybrel', 'enpresse', 'ortho-micronor', 'tri-estarylla', 'loseasonique', 'trivora', 'balziva', 'alesse', 'aubra', 'vestura', 'lo-ovral', 'nortrel', 
                               'camrese', 'levora', 'portia', 'chateal', 'zovia', 'orsythia', 'tri-lo-sprintec', 'nortrel', 'levora', 'mono-linyah', 'microgestin', 'junel', 
                               'norethindrone', 'previfem', 'nikki', 'syeda', 'lessina', 'zenchent', 'tarina-fe', 'loestrin-fe', 'falmina', 'tri-linyah', 'tilia-fe', 'vienva', 
                               'introvale', 'lomedia', 'desogen-tablet', 'norgestimate-ethinyl-estradiol', 'emoquette', 'alyacen', 'quartette', 'altavera', 'daysee', 'kelnor', 
                               'necon-triphasic', 'amethyst', 'nor-q-d-tablet', 'junel', 'mircette', 'safyral', 'caziant', 'micronor', 'enskyce', 'estarylla', 'heather', 
                               'tri-legest-fe', 'zarah', 'nortrel-triphasic', 'levlen', 'amethia-lo', 'ashlyna', 'ortho-tri-cyclen', 'blisovi-fe', 'microgestin', 'lyza', 'camrese-lo', 
                               'loestrin', 'solia-tablet', 'kurvelo', 'levonorgestrel-ec', 'ortho-novum', 'ortho-novum-triphasic', 'cyclafem', 'myzilra', 'necon', 'norinyl', 
                               'estrostep-fe', 'velivet', 'ovcon', 'cyclessa', 'ovcon', 'marlissa', 'sharobel', 'ortho-cept', 'ortho-cyclen', 'loestrin-fe', 'zeosa-tablet-chewable', 
                               'pimtrea', 'gildess', 'dasetta', 'necon', 'norgestrel-ethiny-estra', 'leena', 'larin-fe', 'triphasil', 'levora', 'larissia', 'loestrin', 'nordette', 
                               'desogestrel-ethinyl-estradiol', 'vyfemla', 'zenchent-fe', 'ogestrel', 'low-ogestrel', 'necon', 'norethindron-ethinyl-estradiol-tablet-contraceptives', 
                               'nortrel', 'demulen', 'dasetta-triphasic', 'tri-norinyl', 'tri-levlen', 'blisovi', 'zovia', 'isibloom', 'alesse', 'tri sprintec', 'lo loestrin fe', 
                               'junel fe', 'generess fe', 'ortho tri cyclen lo', 'microgestin fe', 'gildess fe', 'tri previfem', 'ortho tri cyclen', 'femcon fe tablet chewable', 
                               'nora be', 'ortho cyclen', 'junel fe', 'low ogestrel', 'microgestin fe', 'ortho micronor', 'tri estarylla', 'lo ovral', 'tri lo sprintec', 'mono linyah', 
                               'tarina fe', 'loestrin fe', 'tri linyah', 'tilia fe', 'desogen tablet', 'norgestimate ethinyl estradiol', 'necon triphasic', 'nor q d tablet', 
                               'tri legest fe', 'nortrel triphasic', 'amethia lo', 'ortho tri cyclen', 'blisovi fe', 'camrese lo', 'solia tablet', 'levonorgestrel ec', 'ortho novum', 
                               'ortho novum triphasic', 'estrostep fe', 'ortho cept', 'ortho cyclen', 'loestrin fe', 'zeosa tablet chewable', 'norgestrel ethiny estra', 'larin fe', 
                               'desogestrel ethinyl estradiol', 'zenchent fe', 'low ogestrel', 'norethindron ethinyl estradiol tablet contraceptives', 'dasetta triphasic', 'tri norinyl', 'tri levlen'],
                     'emergency': ['plan b', 'emergency contraception', 'morning after', 'morningafter', 'norlevo'],
                     'barrier': ['condom', 'condoms', 'diaphragm', 'diaphram', 'barrier', 'spermicide', 'cap', 'sponge', 'vcf', 'encare', 'conceptrol', 'foam', 'film'],
                     'sterilization': ['sterilize', 'sterilization', 'sterilise', 'sterilisation', 'tubes tied', 'tie my tubes', 'vasectomy'],
                     'withdrawal': ['withdrawal', 'withdraw', 'pull out'],
                     'ring': ['ring', 'nuvaring'],
                     'periodic abstinence': ['rhythm method', 'natural family planning', 'nfp', 'fam', 'fertility awareness', 
                                             'symptothermal', 'sympto-thermal', 'sympto thermal', 'basal body', 'natural cycles',
                                             'calendar method', 'bbt', 'cervical mucus method', 'kindara', 'fertility tracking']}

  

In [8]:
for _type in type_keywords_dict.keys():
    type_keywords_dict[_type] = list(set(type_keywords_dict[_type]))

for _type, _words in type_keywords_dict.items():
    assert len(_words) == len(list(set(_words)))

In [9]:
for _type, _keywords in type_keywords_dict.items():
    print(_type)
    print(', '.join(sorted(_keywords)))
    print()

iud
coil, copper, iud, kyleena, liletta, mirena, paragard, paraguard, skyla

implant
implanon, implant, nexplanon, norplant

ring
nuvaring, ring

shot
depo, depoprovera, inject, injection, provera, shot

patch
ortho evra, patch, xulane

pill
alesse, altavera, alyacen, amethia, amethia lo, amethia-lo, amethyst, apri, ashlyna, aubra, aviane, azurette, balziva, beyaz, blisovi, blisovi fe, blisovi-fe, camila, camrese, camrese lo, camrese-lo, caziant, chateal, cryselle, cyclafem, cyclessa, dasetta, dasetta triphasic, dasetta-triphasic, daysee, demulen, desogen, desogen tablet, desogen-tablet, desogestrel, desogestrel ethinyl estradiol, desogestrel-ethinyl-estradiol, emoquette, enpresse, enskyce, errin, estarylla, estrostep, estrostep fe, estrostep-fe, falmina, femcon, femcon fe tablet chewable, femcon-fe-tablet-chewable, gedarel, generess, generess fe, generess-fe, gianvi, gildess, gildess fe, gildess-fe, heather, introvale, isibloom, jolessa, jolivette, junel, junel fe, junel-fe, kariva, k

<br><br>

# Functions

In [10]:
def get_all_types_from_post(r):
    
    test_tokens = r['tokens_text'].split()

    # Count how many times each type appears in the text
    type_count_dict = defaultdict(int)
    for _type, _keywords in type_keywords_dict.items():
        for _word in _keywords:
            if len(_word.split()) == 1:
                type_count_dict[_type] += len([t for t in test_tokens if t == _word])
            elif len(_word.split()) > 1:
                type_count_dict[_type] += len(re.findall(_word, ' '.join(test_tokens)))

    if len(type_count_dict) > 0:
        return list(set([_type for _type, _count in type_count_dict.items() if _count > 0]))

    # If there were no type mentions at all, return this
    return 'unknown'

In [11]:
def get_parent_type(r):

    if str(r['id']) in comment_parent_dict and comment_parent_dict[str(r['id'])] in post_type_dict:
        return post_type_dict[comment_parent_dict[str(r['id'])]]

    return 'unknown'


def get_all_types_from_comment(r):

    test_tokens = str(r['tokens_text']).split()

    # Count how many times each type appears in the text
    type_count_dict = defaultdict(int)
    for _type, _keywords in type_keywords_dict.items():
        for _word in _keywords:
            if len(_word.split()) == 1:
                type_count_dict[_type] += len([t for t in test_tokens if t == _word])
            elif len(_word.split()) > 1:
                type_count_dict[_type] += len(re.findall(_word, ' '.join(test_tokens)))

    # If found any types, return them
    if len(type_count_dict) > 0:
        return list(set([_type for _type, _count in type_count_dict.items() if _count > 0]))

    # If there were no type mentions at all, and the parent post couldn't be found, return this 
    return 'unknown'

In [12]:
def get_year(x):
    try:
        return datetime.utcfromtimestamp(int(x)).year
    except:
        return 'Unknown'

In [13]:
def get_month(x):
    try:
        return datetime.utcfromtimestamp(int(x)).month
    except:
        return 'Unknown'

In [14]:
def get_tokens(r):
    if 'title' in r:
        return lmw.process_string(str(r['title']), remove_short_words=False) + ' ' + lmw.process_string(str(r['selftext']), remove_short_words=False)
    return lmw.process_string(str(r['selftext']), remove_short_words=False)

def get_tokens_from_comment(r):
    return lmw.process_string(str(r['body']), remove_short_words=False)

<br><br>

# **Process posts**

### Load scraped posts

In [46]:
posts_df_list = []
for _subdir, _dirs, _files in os.walk(scraped_directory_path + '/posts'):
    for _file_name in _files:
        if _file_name.endswith('.csv'):
            posts_df_list.append(pd.read_csv(_subdir + '/' + _file_name))

posts_df = pd.concat(posts_df_list)
len(posts_df.index)

94153

### Get tokenized text

In [47]:
posts_df['selftext'] = posts_df['selftext'].astype(str)
posts_df['tokens_text'] = posts_df.apply(get_tokens, axis=1)

In [48]:
word_count_dict = defaultdict(int)

for i, r in posts_df.iterrows():

    test_tokens = r['tokens_text'].split()

    # Count how many times each type appears in the text
    for _word in type_keywords_dict['pill']:
        if len(_word.split()) == 1:
            word_count_dict[_word] += len([t for t in test_tokens if t == _word])
        elif len(_word.split()) > 1:
            word_count_dict[_word] += len(re.findall(_word, ' '.join(test_tokens)))

found_words = []
for _word, _count in sorted(word_count_dict.items(), key=lambda x: x[1], reverse=True):
    if _count > 0:
        found_words.append(_word)
', '.join(sorted(found_words))

'alesse, altavera, alyacen, amethia, amethia lo, amethyst, apri, ashlyna, aubra, aviane, azurette, balziva, beyaz, blisovi, blisovi fe, camila, camrese, camrese lo, caziant, chateal, cryselle, cyclafem, cyclessa, dasetta, daysee, demulen, desogen, desogestrel, desogestrel ethinyl estradiol, emoquette, enpresse, enskyce, errin, estarylla, estrostep, estrostep fe, falmina, femcon, gedarel, generess, generess fe, gianvi, gildess, gildess fe, heather, introvale, isibloom, jolessa, jolivette, junel, junel fe, kariva, kelnor, kurvelo, larin, larin fe, larissia, leena, lessina, levlen, levonorgestrel, levonorgestrel ec, levora, linessa, lo loestrin fe, lo ovral, loestren, loestrin, loestrin fe, lolo, loloestrin, lomedia, loryna, loseasonique, low ogestrel, lutera, lybrel, lyza, marlissa, microgestin, microgestin fe, micronor, minastrin, minipill, mircette, mono linyah, mononessa, myzilra, natazia, necon, necon triphasic, nikki, nora be, nordette, norethindrone, norgestimate, norgestimate ethi

### Get birth control type from post text, dropping posts that don't have a type or that aren't in our three target types

In [49]:
posts_df['text_type'] = posts_df.apply(get_all_types_from_post, axis=1)

In [50]:
type_count_dict = defaultdict(int)
for i, r in posts_df.iterrows():
    for _type in r['text_type']:
        type_count_dict[_type] += 1

for _type, _count in sorted(type_count_dict.items(), key=lambda x: x[1], reverse=True):
    print(_count, '\t', round(_count/float(len(posts_df.index)), 3), '\t', _type)

54507 	 0.579 	 pill
33827 	 0.359 	 iud
13543 	 0.144 	 implant
12267 	 0.13 	 barrier
6485 	 0.069 	 shot
5226 	 0.056 	 emergency
4884 	 0.052 	 ring
3045 	 0.032 	 withdrawal
2436 	 0.026 	 patch
878 	 0.009 	 sterilization
517 	 0.005 	 periodic abstinence


In [51]:
len(posts_df.index)

94153

In [52]:
4367/94153

0.04638195277898739

In [53]:
post_type_dict = {str(r['id']): r['text_type'] for i, r in posts_df.iterrows()}

In [54]:
def convert_to_target_types(types_list):
    types_list = [_ for _ in types_list if _ in ['pill', 'iud', 'implant']]
    if types_list:
        return sorted(types_list)
    return 'unknown'

posts_df['text_type'] = posts_df['text_type'].apply(convert_to_target_types)
posts_df = posts_df[posts_df['text_type'] != 'unknown']
len(posts_df.index)

82041

In [55]:
posts_df['text_type'].value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[pill]                  38797
[iud]                   19708
[iud, pill]              9993
[implant]                6250
[implant, pill]          3167
[implant, iud, pill]     2550
[implant, iud]           1576
Name: text_type, dtype: int64

In [None]:
76365/94153

0.8110734655295104

### Get year and month and remove comments whose years can't be found

In [None]:
posts_df['year'] = posts_df['created_utc'].apply(get_year)
posts_df['month'] = posts_df['created_utc'].apply(get_month)
posts_df = posts_df[(posts_df['year'] != 'Unknown') & (posts_df['year'] != 1970)]
len(posts_df.index)

82041

In [None]:
# for i, r in posts_df[posts_df['text_type'] == 'implant'].sample(10).iterrows():
#     print(r['title'])
#     print()
#     print(r['text'])
#     print()
#     print('============================================')
#     print()

In [None]:
posts_df['year'].value_counts()

2020    26691
2019    20736
2018    12347
2017     7450
2016     4930
2015     3880
2014     2831
2013     1955
2012     1083
2011      138
Name: year, dtype: int64

### Remove short posts

In [None]:
def get_num_tokens(text):
    if pd.isnull(text):
        return 0
    return len(text.split())

posts_df['num_tokens'] = posts_df['selftext'].apply(get_num_tokens)

In [None]:
# The token length requirement also handles deleted comments, which are replaced with the single "[deleted]" token.
posts_df = posts_df[(posts_df['num_tokens'] >= 3)]
len(posts_df.index)

81972

### Remove duplicate posts

In [None]:
posts_df = posts_df.drop_duplicates(subset='selftext')
len(posts_df)

81596

### Remove unnecessary columns, rename columns

In [None]:
posts_df = posts_df[['id', 'created_utc', 'selftext', 'title', 'year', 'month', 'url', 'link_flair_text', 'tokens_text', 'text_type']]
posts_df = posts_df.rename(columns={'selftext': 'text'})
posts_df['source'] = 'reddit-posts'

### Remove November-December 2020 to match WebMD

In [None]:
posts_df['year'].value_counts()

2020    26434
2019    20633
2018    12294
2017     7442
2016     4928
2015     3876
2014     2825
2013     1948
2012     1079
2011      137
Name: year, dtype: int64

In [None]:
posts_df = posts_df[~((posts_df['year'] == 2020) & (posts_df['month'].isin([11, 12])))]
len(posts_df.index)

77231

In [None]:
posts_df['year'].value_counts()

2020    22069
2019    20633
2018    12294
2017     7442
2016     4928
2015     3876
2014     2825
2013     1948
2012     1079
2011      137
Name: year, dtype: int64

### Final dataframe

In [None]:
len(posts_df.index)

77231

In [None]:
posts_df['text_type'].value_counts()

TypeError: unhashable type: 'list'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'


[pill]                  36492
[iud]                   18482
[iud, pill]              9444
[implant]                5904
[implant, pill]          2991
[implant, iud, pill]     2431
[implant, iud]           1487
Name: text_type, dtype: int64

In [None]:
posts_df['year'].value_counts()

2020    22069
2019    20633
2018    12294
2017     7442
2016     4928
2015     3876
2014     2825
2013     1948
2012     1079
2011      137
Name: year, dtype: int64

In [None]:
posts_df.sample(3)

Unnamed: 0,id,created_utc,text,title,year,month,url,link_flair_text,tokens_text,text_type,source
1710,iqi321,1599791816,I was able to get my IUD today! Going in I kne...,Paragard IUD insertion experience,2020,9,https://www.reddit.com/r/birthcontrol/comments...,Experience,paragard iud insertion experience able get iud...,[iud],reddit-posts
612,7wf0ii,1518197201,Wondering if anyone had had a simalar experien...,Iud strings missing,2018,2,https://www.reddit.com/r/birthcontrol/comments...,Experience,iud strings missing wondering anyone simalar e...,[iud],reddit-posts
578,g6f0hi,1587610541,Hi! \n\nI’m considering going off of the pill ...,Changes after going off birth control?!,2020,4,https://www.reddit.com/r/birthcontrol/comments...,Experience,changes going birth control hi m considering g...,"[iud, pill]",reddit-posts


In [39]:
# posts_df.to_csv(data_directory_path + '/final-data/reddit_posts.csv')

<br><br>

# Process comments

### Load the scraped comments

In [40]:
comments_df_list = []
for _subdir, _dirs, _files in os.walk(scraped_directory_path + '/comments'):
    for _file_name in _files:
        if _file_name.endswith('.csv'):
            comments_df_list.append(pd.read_csv(_subdir + '/' + _file_name))

comments_df = pd.concat(comments_df_list)
len(comments_df.index)

492713

### Tokenize text

In [41]:
comments_df['body'] = comments_df['body'].astype(str)
comments_df['tokens_text'] = comments_df.apply(get_tokens_from_comment, axis=1)

### Trace parent tree

In [42]:
comment_parent_dict = {}
for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type == 't3':
            comment_parent_dict[str(r['id'])] = _parent

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

251584
365373
425784
454244
469090
476854
481292
483858
485462
486484
487195
487683


### Get the birth control type from either the comment or the parent post (drop if neither method works)

In [43]:
# # for i, r in comments_df[comments_df['text_type'] == 'unknown'].sample(10).iterrows():

# # print(' '.join(r['body'].split()))

# # test_tokens = str(r['tokens_text']).split()

# test_tokens = 'Gotcha! Well in that case I say try out the Nexplanon. You also may be able to get your option through Planned Parenthood or Nurx. They may be able to supply you with enough to last you the entire year if you want to try a pill, ring, or patch. But since hormones are not an issue and if you still have insurance that will cover it, I say for for the Nexplanon. If that doesn\'t work, try the other options I listed. I hope you find something that works for you!'
# test_tokens = lmw.process_string(test_tokens).split()
# print(test_tokens)

# # Count how many times each type appears in the text
# type_count_dict = defaultdict(int)
# for _type, _keywords in type_keywords_dict.items():
#     for _word in _keywords:
#         if len(_word.split()) == 1:
#             type_count_dict[_type] += len([t for t in test_tokens if t == _word])
#         elif len(_word.split()) > 1:
#             type_count_dict[_type] += len(re.findall(_word, ' '.join(test_tokens)))

# print(type_count_dict)

# # Get the maximum number of times any type appears in the text 
# max_count = max(type_count_dict.values())
# if max_count != 0:
#     max_keys = {k for k, v in type_count_dict.items() if v == max_count}
#     print(random.sample(max_keys, 1)[0])

# # If no assignment, then assign to parent post type
# # elif len(str(r['parent_id']).split('_')) > 1:
# #     print(r['parent_id'])
# #     print(r['id'])
# #     parent_id = str(r['parent_id']).split('_')[1] 
# #     if parent_id in post_type_dict:
# #         print(post_type_dict[parent_id])

# # If there were no type mentions at all, and the parent post couldn't be found, return this 
# print('unknown')

# print()

In [44]:
comments_df['text_type'] = comments_df.apply(get_all_types_from_comment, axis=1)

In [45]:
comments_df['parent_type'] = comments_df.apply(get_parent_type, axis=1)

NameError: name 'post_type_dict' is not defined

In [None]:
type_count_dict = defaultdict(int)
for i, r in comments_df.iterrows():
    for _type in r['text_type']:
        type_count_dict[_type] += 1

for _type, _count in sorted(type_count_dict.items(), key=lambda x: x[1], reverse=True):
    print(_count, '\t', round(_count/float(len(comments_df.index)), 3), '\t', _type)

In [None]:
comments_df['text_type'].value_counts()

unknown                230488
pill                   100649
iud                     87709
implant                 23379
barrier                 14735
shot                    10344
emergency                8321
ring                     6836
withdrawal               3343
patch                    2887
periodic abstinence      2070
sterilization            1952
Name: text_type, dtype: int64

In [None]:
comments_df['text_type'].value_counts(normalize=True)

unknown                0.467794
pill                   0.204275
iud                    0.178012
implant                0.047450
barrier                0.029906
shot                   0.020994
emergency              0.016888
ring                   0.013874
withdrawal             0.006785
patch                  0.005859
periodic abstinence    0.004201
sterilization          0.003962
Name: text_type, dtype: float64

In [414]:
comments_df['parent_type'].value_counts()

pill                   163454
iud                    152538
unknown                 66614
implant                 42055
barrier                 21780
shot                    15837
ring                    10768
emergency               10474
patch                    4542
withdrawal               1831
sterilization            1569
periodic abstinence      1251
Name: parent_type, dtype: int64

In [415]:
comments_df['parent_type'].value_counts(normalize=True)

pill                   0.331743
iud                    0.309588
unknown                0.135198
implant                0.085354
barrier                0.044204
shot                   0.032142
ring                   0.021855
emergency              0.021258
patch                  0.009218
withdrawal             0.003716
sterilization          0.003184
periodic abstinence    0.002539
Name: parent_type, dtype: float64

In [416]:
len(comments_df.index)

492713

In [417]:
# comments_df = comments_df[(comments_df['text_type'].isin(['pill', 'iud', 'implant'])) | ((comments_df['text_type'] == 'unknown') & (comments_df['parent_type'].isin(['pill', 'iud', 'implant'])))]

def add_parent_type(r):
    if r['text_type'] == 'unknown':
        return r['parent_type']
    return r['text_type']

comments_df['text_type'] = comments_df.apply(add_parent_type, axis=1)
comments_df = comments_df[comments_df['text_type'].isin(['pill', 'iud', 'implant'])]

len(comments_df.index)

373237

In [418]:
309366/492713

0.6278827633936997

In [419]:
# for i, r in comments_df[comments_df['text_type'] == 'implant'].sample(10).iterrows():
#     print(r['body'])
#     print()
#     print('============================================')
#     print()

In [420]:
id_type_dict = {r['id']: r['text_type'] for i, r in comments_df.iterrows()}
pickle.dump(id_type_dict, open(data_directory_path + '/labeling/reddit_comments.id_type_dict.pickle', 'wb'))

### Get the year and month and remove comments whose years cannot be found.

In [421]:
comments_df['year'] = comments_df['created_utc'].apply(get_year)
comments_df['month'] = comments_df['created_utc'].apply(get_month)
comments_df = comments_df[(comments_df['year'] != 'Unknown') & (comments_df['year'] != 1970)]
len(comments_df.index)

373237

In [422]:
comments_df['year'].value_counts()

2020    105482
2019     86917
2018     59785
2017     41683
2016     27269
2015     21118
2014     15477
2013      9771
2012      5254
2011       481
Name: year, dtype: int64

### Remove comments by the OP, stickied comments, and comments where the user was removed.

In [423]:
comments_df = comments_df[(comments_df['is_submitter']) != True & (comments_df['stickied'] == False) & (comments_df['user_removed'] != False)]
len(comments_df.index)

315370

### Remove comments without a date

In [424]:
comments_df = comments_df.dropna(subset=['created_utc'])
len(comments_df.index)

315370

### Remove short comments.

In [425]:
def get_num_tokens(text):
    if pd.isnull(text):
        return 0
    return len(text.split())

comments_df['num_tokens'] = comments_df['body'].apply(get_num_tokens)

In [426]:
# The token length requirement also handles deleted comments, which are replaced with the single "[deleted]" token.
comments_df = comments_df[(comments_df['num_tokens'] >= 3)]
len(comments_df.index)

302559

### Remove duplicate comments

In [427]:
comments_df = comments_df.drop_duplicates(subset='body')
len(comments_df)

300567

### Drop all the unnecessary columns, rename columns, add source column

In [428]:
comments_df = comments_df[['id', 'parent_id', 'created_utc', 'body', 'tokens_text', 'text_type', 'year', 'month']]
comments_df = comments_df.rename(columns={'body': 'text'})
comments_df['source'] = 'reddit-comments'

In [429]:
comments_df['year'].value_counts()

2020    76660
2019    68726
2018    44003
2017    37097
2016    26172
2015    19919
2014    14115
2013     8740
2012     4707
2011      428
Name: year, dtype: int64

### Remove November-December 2020 to match WebMD

In [430]:
comments_df['year'].value_counts()

2020    76660
2019    68726
2018    44003
2017    37097
2016    26172
2015    19919
2014    14115
2013     8740
2012     4707
2011      428
Name: year, dtype: int64

In [431]:
comments_df = comments_df[~((comments_df['year'] == 2020) & (comments_df['month'].isin([11, 12])))]
len(comments_df.index)

287279

In [432]:
comments_df['year'].value_counts()

2019    68726
2020    63372
2018    44003
2017    37097
2016    26172
2015    19919
2014    14115
2013     8740
2012     4707
2011      428
Name: year, dtype: int64

### Final dataframe

In [433]:
len(comments_df.index)

287279

In [434]:
comments_df.sample(3)

Unnamed: 0,id,parent_id,created_utc,text,tokens_text,text_type,year,month,source
3728,do0l1wi,t3_74qoad,1507334645,An ultra sound or x-ray will see it as other p...,ultra sound x ray see people said gynecologica...,iud,2017,10,reddit-comments
5031,f7hqusl,t3_dw66ti,1573745851,I'm on Alesse and have been for several years....,m alesse several years experienced little naus...,pill,2019,11,reddit-comments
108,coxavru,t1_coxaru7,1424911433,Your old doctor's office can give you your med...,old doctor office give medical records send ne...,pill,2015,2,reddit-comments


In [435]:
comments_df['text_type'].value_counts()

pill       127585
iud        125158
implant     34536
Name: text_type, dtype: int64

In [436]:
comments_df['year'].value_counts()

2019    68726
2020    63372
2018    44003
2017    37097
2016    26172
2015    19919
2014    14115
2013     8740
2012     4707
2011      428
Name: year, dtype: int64

In [437]:
comments_df.to_csv(data_directory_path + '/final-data/reddit_comments.csv')

In [438]:
comments_df.sample(3)

Unnamed: 0,id,parent_id,created_utc,text,tokens_text,text_type,year,month,source
5200,ewvrdye,t3_cq95wj,1565812355,I just had my first one done today. I got the ...,first one done today got mirena kids reference...,iud,2019,8,reddit-comments
330,fixgg58,t1_fixduzi,1582819946,Yep!\n\nA bleeding can be considered a period ...,yep bleeding considered period part natural cy...,implant,2020,2,reddit-comments
11800,g3mc6ak,t1_g3mbu99,1598983859,I’d say after like 3 weeks of your missed peri...,d say like NUM weeks missed period taken pregn...,iud,2020,9,reddit-comments


In [439]:
target_df = comments_df[comments_df['parent_type'] != 'unknown']
for i, r in target_df[target_df['text_type'] != target_df['parent_type']].sample(10).iterrows():
    print('Text Type:', r['text_type'])
    print('Parent Type:', r['parent_type'])
    print(' '.join(r['text'].split()))
    print()

KeyError: 'parent_type'

In [None]:
target_df = comments_df[comments_df['text_type'] == 'unknown']
for i, r in target_df.sample(10).iterrows():
    print('Text Type:', r['text_type'])
    print('Parent Type:', r['parent_type'])
    print(' '.join(r['text'].split()))
    print()

ValueError: a must be greater than 0 unless no samples are taken