In [1]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
from operator import itemgetter
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)

import little_mallet_wrapper as lmw

In [2]:
scraped_directory_path = '/Volumes/Passport-1/data/birth-control/reddit/scraped'
data_directory_path   = '/Volumes/Passport-1/data/birth-control'

<br><br><br><br>

# Extract pill keywords from WebMD

In [73]:
webmd_pill_keywords = ['sprintec', 'yaz', 'tri-sprintec', 'loestrin', 'lo-loestrin-fe', 'trinessa', 'junel-fe', 'generess-fe', 'mononessa', 'seasonique', 'azurette', 
                       'minastrin', 'beyaz', 'ocella', 'lutera', 'ortho-tri-cyclen-lo', 'microgestin-fe', 'loryna', 'errin', 'gildess-fe', 'sronyx', 'necon', 
                       'tri-previfem', 'cryselle', 'yasmin', 'viorele', 'jolivette', 'aviane', 'natazia', 'reclipsen', 'apri', 'ortho-tri-cyclen', 
                       'femcon-fe-tablet-chewable', 'camila', 'jolessa', 'amethia', 'kariva', 'nora-be', 'quasense', 'ortho-cyclen', 'junel-fe', 'gianvi', 'low-ogestrel', 
                       'microgestin-fe', 'lybrel', 'enpresse', 'ortho-micronor', 'tri-estarylla', 'loseasonique', 'trivora', 'balziva', 'alesse', 'aubra', 'vestura', 
                       'lo-ovral', 'nortrel', 'camrese', 'levora', 'portia', 'chateal', 'zovia', 'orsythia', 'tri-lo-sprintec', 'nortrel', 'levora', 'mono-linyah', 
                       'microgestin', 'junel', 'norethindrone', 'previfem', 'nikki', 'syeda', 'lessina', 'zenchent', 'tarina-fe', 'loestrin-fe', 'falmina', 'tri-linyah', 
                       'tilia-fe', 'vienva', 'introvale', 'lomedia', 'desogen-tablet', 'norgestimate-ethinyl-estradiol', 'emoquette', 'alyacen', 'quartette', 'altavera', 
                       'daysee', 'kelnor', 'necon-triphasic', 'amethyst', 'nor-q-d-tablet', 'junel', 'mircette', 'safyral', 'caziant', 'micronor', 'enskyce', 'estarylla', 
                       'heather', 'tri-legest-fe', 'zarah', 'nortrel-triphasic', 'levlen', 'amethia-lo', 'ashlyna', 'ortho-tri-cyclen', 'blisovi-fe', 'microgestin', 'lyza', 
                       'camrese-lo', 'loestrin', 'solia-tablet', 'kurvelo', 'levonorgestrel-ec', 'ortho-novum', 'ortho-novum-triphasic', 'cyclafem', 'myzilra', 'necon', 
                       'norinyl', 'estrostep-fe', 'velivet', 'ovcon', 'cyclessa', 'ovcon', 'marlissa', 'sharobel', 'ortho-cept', 'ortho-cyclen', 'loestrin-fe', 
                       'zeosa-tablet-chewable', 'pimtrea', 'gildess', 'dasetta', 'necon', 'norgestrel-ethiny-estra', 'leena', 'larin-fe', 'triphasil', 'levora', 'larissia', 
                       'loestrin', 'nordette', 'desogestrel-ethinyl-estradiol', 'vyfemla', 'zenchent-fe', 'ogestrel', 'low-ogestrel', 'necon', 'norethindron-ethinyl-estradiol-tablet-contraceptives', 
                       'nortrel', 'demulen', 'dasetta-triphasic', 'tri-norinyl', 'tri-levlen', 'blisovi', 'zovia', 'isibloom', 'alesse']
webmd_pill_keywords += [n.replace('-', ' ') for n in webmd_pill_keywords if '-' in n]

', '.join(["'" + n + "'" for n in webmd_pill_keywords])

"'sprintec', 'yaz', 'tri-sprintec', 'loestrin', 'lo-loestrin-fe', 'trinessa', 'junel-fe', 'generess-fe', 'mononessa', 'seasonique', 'azurette', 'minastrin', 'beyaz', 'ocella', 'lutera', 'ortho-tri-cyclen-lo', 'microgestin-fe', 'loryna', 'errin', 'gildess-fe', 'sronyx', 'necon', 'tri-previfem', 'cryselle', 'yasmin', 'viorele', 'jolivette', 'aviane', 'natazia', 'reclipsen', 'apri', 'ortho-tri-cyclen', 'femcon-fe-tablet-chewable', 'camila', 'jolessa', 'amethia', 'kariva', 'nora-be', 'quasense', 'ortho-cyclen', 'junel-fe', 'gianvi', 'low-ogestrel', 'microgestin-fe', 'lybrel', 'enpresse', 'ortho-micronor', 'tri-estarylla', 'loseasonique', 'trivora', 'balziva', 'alesse', 'aubra', 'vestura', 'lo-ovral', 'nortrel', 'camrese', 'levora', 'portia', 'chateal', 'zovia', 'orsythia', 'tri-lo-sprintec', 'nortrel', 'levora', 'mono-linyah', 'microgestin', 'junel', 'norethindrone', 'previfem', 'nikki', 'syeda', 'lessina', 'zenchent', 'tarina-fe', 'loestrin-fe', 'falmina', 'tri-linyah', 'tilia-fe', 'vienv

In [72]:
remaining_reddit_pill_keywords = []
for _keyword in type_keywords_dict['pill']:
    if _keyword not in webmd_pill_keywords:
        remaining_reddit_pill_keywords.append(_keyword)
', '.join(["'" + n + "'" for n in remaining_reddit_pill_keywords])

"'oral contracept', 'oral birth control', 'pill', 'pills', 'minipill', 'levonorgestrel', 'femcon', 'desogestrel', 'larin', 'norgestimate', 'zeosa', 'tilia', 'desogen', 'nor', 'generess', 'norgestrel', 'estrostep', 'tarina', 'solia', 'loestren', 'loloestrin', 'gedarel', 'pack', 'placebo', 'tri cyclen', 'tricyclen', 'linessa', 'taytulla', 'lolo', 'tri jordyna', 'trijordyna'"

<br><br><br><br>

# Final keywords

In [295]:
type_keywords_dict = {'iud': ['iud', 'mirena', 'skyla', 'liletta', 'paragard', 'paraguard', 'kyleena', 'copper', 'coil'], 
                      'implant': ['implanon', 'nexplanon', 'implant', 'norplant'],
                      'ring': ['nuvaring', 'etonogestrel', 'ring', 'nuva ring'], 
                      'shot': ['shot',  'inject', 'injection', 'depo', 'provera', 'depoprovera'], 
                      'patch': ['ortho evra', 'xulane', 'patch'],
                      'pill': ['oral contracept', 'oral birth control', 'pill', 'pills', 'minipill', 'levonorgestrel', 'femcon', 'desogestrel', 'larin', 'norgestimate', 
                               'zeosa', 'tilia', 'desogen', 'nor', 'generess', 'norgestrel', 'estrostep', 'tarina', 'solia', 'loestren', 'loloestrin', 'gedarel', 'pack', 
                               'placebo', 'tri cyclen', 'tricyclen', 'linessa', 'taytulla', 'lolo', 'tri jordyna', 'trijordyna',
                               'sprintec', 'yaz', 'tri-sprintec', 'loestrin', 'lo-loestrin-fe', 'trinessa', 'junel-fe', 'generess-fe', 'mononessa', 'seasonique', 'azurette', 
                               'minastrin', 'beyaz', 'ocella', 'lutera', 'ortho-tri-cyclen-lo', 'microgestin-fe', 'loryna', 'errin', 'gildess-fe', 'sronyx', 'necon', 
                               'tri-previfem', 'cryselle', 'yasmin', 'viorele', 'jolivette', 'aviane', 'natazia', 'reclipsen', 'apri', 'ortho-tri-cyclen', 'femcon-fe-tablet-chewable', 
                               'camila', 'jolessa', 'amethia', 'kariva', 'nora-be', 'quasense', 'ortho-cyclen', 'junel-fe', 'gianvi', 'low-ogestrel', 'microgestin-fe', 
                               'lybrel', 'enpresse', 'ortho-micronor', 'tri-estarylla', 'loseasonique', 'trivora', 'balziva', 'alesse', 'aubra', 'vestura', 'lo-ovral', 'nortrel', 
                               'camrese', 'levora', 'portia', 'chateal', 'zovia', 'orsythia', 'tri-lo-sprintec', 'nortrel', 'levora', 'mono-linyah', 'microgestin', 'junel', 
                               'norethindrone', 'previfem', 'nikki', 'syeda', 'lessina', 'zenchent', 'tarina-fe', 'loestrin-fe', 'falmina', 'tri-linyah', 'tilia-fe', 'vienva', 
                               'introvale', 'lomedia', 'desogen-tablet', 'norgestimate-ethinyl-estradiol', 'emoquette', 'alyacen', 'quartette', 'altavera', 'daysee', 'kelnor', 
                               'necon-triphasic', 'amethyst', 'nor-q-d-tablet', 'junel', 'mircette', 'safyral', 'caziant', 'micronor', 'enskyce', 'estarylla', 'heather', 
                               'tri-legest-fe', 'zarah', 'nortrel-triphasic', 'levlen', 'amethia-lo', 'ashlyna', 'ortho-tri-cyclen', 'blisovi-fe', 'microgestin', 'lyza', 'camrese-lo', 
                               'loestrin', 'solia-tablet', 'kurvelo', 'levonorgestrel-ec', 'ortho-novum', 'ortho-novum-triphasic', 'cyclafem', 'myzilra', 'necon', 'norinyl', 
                               'estrostep-fe', 'velivet', 'ovcon', 'cyclessa', 'ovcon', 'marlissa', 'sharobel', 'ortho-cept', 'ortho-cyclen', 'loestrin-fe', 'zeosa-tablet-chewable', 
                               'pimtrea', 'gildess', 'dasetta', 'necon', 'norgestrel-ethiny-estra', 'leena', 'larin-fe', 'triphasil', 'levora', 'larissia', 'loestrin', 'nordette', 
                               'desogestrel-ethinyl-estradiol', 'vyfemla', 'zenchent-fe', 'ogestrel', 'low-ogestrel', 'necon', 'norethindron-ethinyl-estradiol-tablet-contraceptives', 
                               'nortrel', 'demulen', 'dasetta-triphasic', 'tri-norinyl', 'tri-levlen', 'blisovi', 'zovia', 'isibloom', 'alesse', 'tri sprintec', 'lo loestrin fe', 
                               'junel fe', 'generess fe', 'ortho tri cyclen lo', 'microgestin fe', 'gildess fe', 'tri previfem', 'ortho tri cyclen', 'femcon fe tablet chewable', 
                               'nora be', 'ortho cyclen', 'junel fe', 'low ogestrel', 'microgestin fe', 'ortho micronor', 'tri estarylla', 'lo ovral', 'tri lo sprintec', 'mono linyah', 
                               'tarina fe', 'loestrin fe', 'tri linyah', 'tilia fe', 'desogen tablet', 'norgestimate ethinyl estradiol', 'necon triphasic', 'nor q d tablet', 
                               'tri legest fe', 'nortrel triphasic', 'amethia lo', 'ortho tri cyclen', 'blisovi fe', 'camrese lo', 'solia tablet', 'levonorgestrel ec', 'ortho novum', 
                               'ortho novum triphasic', 'estrostep fe', 'ortho cept', 'ortho cyclen', 'loestrin fe', 'zeosa tablet chewable', 'norgestrel ethiny estra', 'larin fe', 
                               'desogestrel ethinyl estradiol', 'zenchent fe', 'low ogestrel', 'norethindron ethinyl estradiol tablet contraceptives', 'dasetta triphasic', 'tri norinyl', 'tri levlen'],
                     'emergency': ['plan b', 'emergency contraception', 'morning after', 'morningafter', 'norlevo'],
                     'barrier': ['condom', 'condoms', 'diaphragm', 'diaphram', 'barrier', 'spermicide', 'cap', 'sponge', 'vcf', 'encare', 'conceptrol', 'foam', 'film'],
                     'sterilization': ['sterilize', 'sterilization', 'sterilise', 'sterilisation', 'tubes tied', 'tie my tubes', 'vasectomy'],
                     'withdrawal': ['withdrawal', 'withdraw', 'pull out'],
                     'ring': ['ring', 'nuvaring'],
                     'periodic abstinence': ['rhythm method', 'natural family planning', 'nfp', 'fam', 'fertility awareness', 
                                             'symptothermal', 'sympto-thermal', 'sympto thermal', 'basal body', 'natural cycles',
                                             'calendar method', 'bbt', 'cervical mucus method', 'kindara', 'fertility tracking']}

for _type in type_keywords_dict.keys():
    type_keywords_dict[_type] = list(set(type_keywords_dict[_type]))

for _type, _words in type_keywords_dict.items():
    assert len(_words) == len(list(set(_words)))
  

In [299]:
for _type, _keywords in type_keywords_dict.items():
    print(_type)
    print(', '.join(sorted(_keywords)))
    print()

iud
coil, copper, iud, kyleena, liletta, mirena, paragard, paraguard, skyla

implant
implanon, implant, nexplanon, norplant

ring
nuvaring, ring

shot
depo, depoprovera, inject, injection, provera, shot

patch
ortho evra, patch, xulane

pill
alesse, altavera, alyacen, amethia, amethia lo, amethia-lo, amethyst, apri, ashlyna, aubra, aviane, azurette, balziva, beyaz, blisovi, blisovi fe, blisovi-fe, camila, camrese, camrese lo, camrese-lo, caziant, chateal, cryselle, cyclafem, cyclessa, dasetta, dasetta triphasic, dasetta-triphasic, daysee, demulen, desogen, desogen tablet, desogen-tablet, desogestrel, desogestrel ethinyl estradiol, desogestrel-ethinyl-estradiol, emoquette, enpresse, enskyce, errin, estarylla, estrostep, estrostep fe, estrostep-fe, falmina, femcon, femcon fe tablet chewable, femcon-fe-tablet-chewable, gedarel, generess, generess fe, generess-fe, gianvi, gildess, gildess fe, gildess-fe, heather, introvale, isibloom, jolessa, jolivette, junel, junel fe, junel-fe, kariva, k

<br><br>

# Functions

In [507]:
def get_type_from_post(r):

    test_tokens = r['tokens_text'].split()

    # Count how many times each type appears in the text
    type_count_dict = defaultdict(int)
    for _type, _keywords in type_keywords_dict.items():
        for _word in _keywords:
            if len(_word.split()) == 1:
                type_count_dict[_type] += len([t for t in test_tokens if t == _word])
            elif len(_word.split()) > 1:
                type_count_dict[_type] += len(re.findall(_word, ' '.join(test_tokens)))

    # Get the maximum number of times any type appears in the text
    max_count = max(type_count_dict.values())

    # If there were any type mentions, get the types with the most mentions and randomly return one of them (to break ties)
    if max_count != 0:
        max_keys = {k for k, v in type_count_dict.items() if v == max_count}
        if len(max_keys) > 1:
            return 'combo' + '_' + '_'.join(sorted(list(max_keys)))
        return list(max_keys)[0]
        # return random.sample(max_keys, 1)[0]

    # If there were no type mentions at all, return this
    return 'unknown'


# def get_all_types_from_post(r):
    
#     test_tokens = r['tokens_text'].split()

#     # Count how many times each type appears in the text
#     type_count_dict = defaultdict(int)
#     for _type, _keywords in type_keywords_dict.items():
#         for _word in _keywords:
#             if len(_word.split()) == 1:
#                 type_count_dict[_type] += len([t for t in test_tokens if t == _word])
#             elif len(_word.split()) > 1:
#                 type_count_dict[_type] += len(re.findall(_word, ' '.join(test_tokens)))

#     if len(type_count_dict) > 0:
#         return list(set([_type for _type, _count in type_count_dict.items() if _count > 0]))

#     # If there were no type mentions at all, return this
#     return 'unknown'

In [540]:
def get_type_from_comment(r):

    test_tokens = str(r['tokens_text']).split()

    # Count how many times each type appears in the text
    type_count_dict = defaultdict(int)
    for _type, _keywords in type_keywords_dict.items():
        for _word in _keywords:
            if len(_word.split()) == 1:
                type_count_dict[_type] += len([t for t in test_tokens if t == _word])
            elif len(_word.split()) > 1:
                type_count_dict[_type] += len(re.findall(_word, ' '.join(test_tokens)))

    # Get the maximum number of times any type appears in the text 
    max_count = max(type_count_dict.values())
    if max_count != 0:
        max_keys = {k for k, v in type_count_dict.items() if v == max_count}
        if len(max_keys) > 1:
            return 'combo' + '_' + '_'.join(sorted(list(max_keys)))
        return list(max_keys)[0]

    # If no assignment, then assign to parent post type
    # elif len(str(r['parent_id']).split('_')) > 1:
    #     parent_id = str(r['parent_id']).split('_')[1] 
    #     if parent_id in post_type_dict:
    #         return post_type_dict[parent_id]
    # elif str(r['id']) in comment_parent_dict and comment_parent_dict[str(r['id'])] in post_type_dict:
    #     return post_type_dict[comment_parent_dict[str(r['id'])]]

    # If there were no type mentions at all, and the parent post couldn't be found, return this 
    return 'unknown'


def get_parent_type(r):

    if str(r['id']) in comment_parent_dict and comment_parent_dict[str(r['id'])] in post_type_dict:
        return post_type_dict[comment_parent_dict[str(r['id'])]]

    return 'unknown'


# def get_all_types_from_comment(r):

#     test_tokens = str(r['tokens_text']).split()

#     # Count how many times each type appears in the text
#     type_count_dict = defaultdict(int)
#     for _type, _keywords in type_keywords_dict.items():
#         for _word in _keywords:
#             if len(_word.split()) == 1:
#                 type_count_dict[_type] += len([t for t in test_tokens if t == _word])
#             elif len(_word.split()) > 1:
#                 type_count_dict[_type] += len(re.findall(_word, ' '.join(test_tokens)))

#     # If found any types, return them
#     if len(type_count_dict) > 0:
#         return list(set([_type for _type, _count in type_count_dict.items() if _count > 0]))

#     # If there were no type mentions at all, and the parent post couldn't be found, return this 
#     return 'unknown'

In [78]:
def get_year(x):
    try:
        return datetime.utcfromtimestamp(int(x)).year
    except:
        return 'Unknown'

In [79]:
def get_month(x):
    try:
        return datetime.utcfromtimestamp(int(x)).month
    except:
        return 'Unknown'

In [80]:
def get_tokens(r):
    if 'title' in r:
        return lmw.process_string(str(r['title']), remove_short_words=False) + ' ' + lmw.process_string(str(r['selftext']), remove_short_words=False)
    return lmw.process_string(str(r['selftext']), remove_short_words=False)

def get_tokens_from_comment(r):
    return lmw.process_string(str(r['body']), remove_short_words=False)

<br><br>

# **Process posts**

### Load scraped posts

In [511]:
posts_df_list = []
for _subdir, _dirs, _files in os.walk(scraped_directory_path + '/posts'):
    for _file_name in _files:
        if _file_name.endswith('.csv'):
            posts_df_list.append(pd.read_csv(_subdir + '/' + _file_name))

posts_df = pd.concat(posts_df_list)
len(posts_df.index)

94153

### Get tokenized text

In [512]:
posts_df['selftext'] = posts_df['selftext'].astype(str)
posts_df['tokens_text'] = posts_df.apply(get_tokens, axis=1)

In [484]:
# word_count_dict = defaultdict(int)

# for i, r in posts_df.iterrows():

#     test_tokens = r['tokens_text'].split()

#     # Count how many times each type appears in the text
#     for _word in type_keywords_dict['pill']:
#         if len(_word.split()) == 1:
#             word_count_dict[_word] += len([t for t in test_tokens if t == _word])
#         elif len(_word.split()) > 1:
#             word_count_dict[_word] += len(re.findall(_word, ' '.join(test_tokens)))

# found_words = []
# for _word, _count in sorted(word_count_dict.items(), key=lambda x: x[1], reverse=True):
#     if _count > 0:
#         found_words.append(_word)
# ', '.join(sorted(found_words))

In [496]:
# test_tokens = lmw.process_string('pill something').split()

# # Count how many times each type appears in the text
# type_count_dict = defaultdict(int)
# for _type, _keywords in type_keywords_dict.items():
#     for _word in _keywords:
#         if len(_word.split()) == 1:
#             type_count_dict[_type] += len([t for t in test_tokens if t == _word])
#         elif len(_word.split()) > 1:
#             type_count_dict[_type] += len(re.findall(_word, ' '.join(test_tokens)))

# # Get the maximum number of times any type appears in the text
# max_count = max(type_count_dict.values())

# # If there were any type mentions, get the types with the most mentions and randomly return one of them (to break ties)
# if max_count != 0:
#     max_keys = {k for k, v in type_count_dict.items() if v == max_count}
#     if len(max_keys) > 1:
#         print('combo')
#     print(list(max_keys)[0])
#     # return random.sample(max_keys, 1)[0]

# # If there were no type mentions at all, return this
# print('unknown')

### Get birth control type from post text, dropping posts that don't have a type or that aren't in our three target types

In [513]:
posts_df['text_type'] = posts_df.apply(get_type_from_post, axis=1)

In [516]:
pd.set_option("display.max_rows", None)
posts_df['text_type'].value_counts()

pill                                                        39217
iud                                                         26274
implant                                                      7782
unknown                                                      4310
shot                                                         3191
ring                                                         2355
barrier                                                      2329
emergency                                                    1644
patch                                                        1304
combo_iud_pill                                               1020
combo_barrier_pill                                            686
combo_implant_pill                                            525
combo_emergency_pill                                          343
combo_implant_iud                                             326
withdrawal                                                    235
combo_pill

In [518]:
posts_df['text_type'].value_counts(normalize=True)

pill                                                        0.416524
iud                                                         0.279056
implant                                                     0.082653
unknown                                                     0.045777
shot                                                        0.033892
ring                                                        0.025012
barrier                                                     0.024736
emergency                                                   0.017461
patch                                                       0.013850
combo_iud_pill                                              0.010833
combo_barrier_pill                                          0.007286
combo_implant_pill                                          0.005576
combo_emergency_pill                                        0.003643
combo_implant_iud                                           0.003462
withdrawal                        

In [519]:
len(posts_df.index)

94153

In [520]:
4367/94153

0.04638195277898739

In [521]:
post_type_dict = {str(r['id']): r['text_type'] for i, r in posts_df.iterrows()}
# pickle.dump(post_type_dict, open(data_directory_path + '/labeling/reddit_posts.id_type_dict.pickle', 'wb'))

In [522]:
posts_df = posts_df[posts_df['text_type'].isin(['pill', 'iud', 'implant'])]
len(posts_df.index)

73273

In [523]:
posts_df['text_type'].value_counts()

pill       39217
iud        26274
implant     7782
Name: text_type, dtype: int64

In [524]:
76365/94153

0.8110734655295104

### Get year and month and remove comments whose years can't be found

In [525]:
posts_df['year'] = posts_df['created_utc'].apply(get_year)
posts_df['month'] = posts_df['created_utc'].apply(get_month)
posts_df = posts_df[(posts_df['year'] != 'Unknown') & (posts_df['year'] != 1970)]
len(posts_df.index)

73273

In [526]:
# for i, r in posts_df[posts_df['text_type'] == 'implant'].sample(10).iterrows():
#     print(r['title'])
#     print()
#     print(r['text'])
#     print()
#     print('============================================')
#     print()

In [527]:
posts_df['year'].value_counts()

2020    23981
2019    18551
2018    11060
2017     6619
2016     4356
2015     3443
2014     2495
2013     1699
2012      948
2011      121
Name: year, dtype: int64

### Remove short posts

In [528]:
def get_num_tokens(text):
    if pd.isnull(text):
        return 0
    return len(text.split())

posts_df['num_tokens'] = posts_df['selftext'].apply(get_num_tokens)

In [529]:
# The token length requirement also handles deleted comments, which are replaced with the single "[deleted]" token.
posts_df = posts_df[(posts_df['num_tokens'] >= 3)]
len(posts_df.index)

73209

### Remove duplicate posts

In [530]:
posts_df = posts_df.drop_duplicates(subset='selftext')
len(posts_df)

72898

### Remove unnecessary columns, rename columns

In [531]:
posts_df = posts_df[['id', 'created_utc', 'selftext', 'title', 'year', 'month', 'url', 'link_flair_text', 'tokens_text', 'text_type']]
posts_df = posts_df.rename(columns={'selftext': 'text'})
posts_df['source'] = 'reddit-posts'

### Remove November-December 2020 to match WebMD

In [532]:
posts_df['year'].value_counts()

2020    23772
2019    18460
2018    11010
2017     6612
2016     4355
2015     3441
2014     2492
2013     1692
2012      944
2011      120
Name: year, dtype: int64

In [533]:
posts_df = posts_df[~((posts_df['year'] == 2020) & (posts_df['month'].isin([11, 12])))]
len(posts_df.index)

68958

In [534]:
posts_df['year'].value_counts()

2020    19832
2019    18460
2018    11010
2017     6612
2016     4355
2015     3441
2014     2492
2013     1692
2012      944
2011      120
Name: year, dtype: int64

### Final dataframe

In [535]:
len(posts_df.index)

68958

In [536]:
posts_df['text_type'].value_counts()

pill       36921
iud        24657
implant     7380
Name: text_type, dtype: int64

In [537]:
posts_df['year'].value_counts()

2020    19832
2019    18460
2018    11010
2017     6612
2016     4355
2015     3441
2014     2492
2013     1692
2012      944
2011      120
Name: year, dtype: int64

In [538]:
posts_df.sample(3)

Unnamed: 0,id,created_utc,text,title,year,month,url,link_flair_text,tokens_text,text_type,source
711,9odah6,1539614664,"I usually have a really normal period, and it’...",Spotting with Paraguard IUD,2018,10,https://www.reddit.com/r/birthcontrol/comments...,Mistake or Risk?,spotting paraguard iud usually really normal p...,iud,reddit-posts
88,b6zlql,1553877994,I got my Kyleena inserted yesterday the insert...,Questions and support with Kyleena,2019,3,https://www.reddit.com/r/birthcontrol/comments...,Side Effects!?,questions support kyleena got kyleena inserted...,iud,reddit-posts
584,7510ty,1507459253,Im using 21 days pills and was wondering if sh...,Can I shorten my pill free week safely?,2017,10,https://www.reddit.com/r/birthcontrol/comments...,How to?,shorten pill free week safely im using NUM day...,pill,reddit-posts


In [539]:
posts_df.to_csv(data_directory_path + '/final-data/reddit_posts.csv')

<br><br>

# Process comments

### Load the scraped comments

In [541]:
comments_df_list = []
for _subdir, _dirs, _files in os.walk(scraped_directory_path + '/comments'):
    for _file_name in _files:
        if _file_name.endswith('.csv'):
            comments_df_list.append(pd.read_csv(_subdir + '/' + _file_name))

comments_df = pd.concat(comments_df_list)
len(comments_df.index)

492713

### Tokenize text

In [542]:
comments_df['body'] = comments_df['body'].astype(str)
comments_df['tokens_text'] = comments_df.apply(get_tokens_from_comment, axis=1)

### Trace parent tree

In [543]:
comment_parent_dict = {}
for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type == 't3':
            comment_parent_dict[str(r['id'])] = _parent

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

for i, r in comments_df.iterrows():
    if len(str(r['parent_id']).split('_')) > 1:
        _type, _parent = str(r['parent_id']).split('_')
        if _type != 't3' and _parent in comment_parent_dict:
            comment_parent_dict[str(r['id'])] = comment_parent_dict[_parent]

print(len(comment_parent_dict))

251584
365373
425784
454244
469090
476854
481292
483858
485462
486484
487195
487683


### Get the birth control type from either the comment or the parent post (drop if neither method works)

In [544]:
# # for i, r in comments_df[comments_df['text_type'] == 'unknown'].sample(10).iterrows():

# # print(' '.join(r['body'].split()))

# # test_tokens = str(r['tokens_text']).split()

# test_tokens = 'Gotcha! Well in that case I say try out the Nexplanon. You also may be able to get your option through Planned Parenthood or Nurx. They may be able to supply you with enough to last you the entire year if you want to try a pill, ring, or patch. But since hormones are not an issue and if you still have insurance that will cover it, I say for for the Nexplanon. If that doesn\'t work, try the other options I listed. I hope you find something that works for you!'
# test_tokens = lmw.process_string(test_tokens).split()
# print(test_tokens)

# # Count how many times each type appears in the text
# type_count_dict = defaultdict(int)
# for _type, _keywords in type_keywords_dict.items():
#     for _word in _keywords:
#         if len(_word.split()) == 1:
#             type_count_dict[_type] += len([t for t in test_tokens if t == _word])
#         elif len(_word.split()) > 1:
#             type_count_dict[_type] += len(re.findall(_word, ' '.join(test_tokens)))

# print(type_count_dict)

# # Get the maximum number of times any type appears in the text 
# max_count = max(type_count_dict.values())
# if max_count != 0:
#     max_keys = {k for k, v in type_count_dict.items() if v == max_count}
#     print(random.sample(max_keys, 1)[0])

# # If no assignment, then assign to parent post type
# # elif len(str(r['parent_id']).split('_')) > 1:
# #     print(r['parent_id'])
# #     print(r['id'])
# #     parent_id = str(r['parent_id']).split('_')[1] 
# #     if parent_id in post_type_dict:
# #         print(post_type_dict[parent_id])

# # If there were no type mentions at all, and the parent post couldn't be found, return this 
# print('unknown')

# print()

In [545]:
comments_df['text_type'] = comments_df.apply(get_type_from_comment, axis=1)

In [546]:
comments_df['parent_type'] = comments_df.apply(get_parent_type, axis=1)

In [547]:
comments_df['text_type'].value_counts()

unknown                                                               230488
pill                                                                   91705
iud                                                                    80701
implant                                                                19275
barrier                                                                11455
shot                                                                    8673
emergency                                                               6697
combo_iud_pill                                                          6087
ring                                                                    5416
combo_implant_pill                                                      2667
combo_implant_iud                                                       2506
patch                                                                   2285
withdrawal                                                              2239

In [548]:
comments_df['text_type'].value_counts(normalize=True)

unknown                                                               0.467794
pill                                                                  0.186123
iud                                                                   0.163789
implant                                                               0.039120
barrier                                                               0.023249
shot                                                                  0.017603
emergency                                                             0.013592
combo_iud_pill                                                        0.012354
ring                                                                  0.010992
combo_implant_pill                                                    0.005413
combo_implant_iud                                                     0.005086
patch                                                                 0.004638
withdrawal                                          

In [549]:
comments_df['parent_type'].value_counts()

pill                                                        153596
iud                                                         146570
unknown                                                      66614
implant                                                      38325
barrier                                                      17397
shot                                                         13862
ring                                                          9733
emergency                                                     8454
combo_iud_pill                                                5823
combo_barrier_pill                                            4020
patch                                                         4006
combo_implant_pill                                            2688
combo_implant_iud                                             1895
combo_emergency_pill                                          1821
sterilization                                                 

In [550]:
comments_df['parent_type'].value_counts(normalize=True)

pill                                                        0.311735
iud                                                         0.297475
unknown                                                     0.135198
implant                                                     0.077784
barrier                                                     0.035309
shot                                                        0.028134
ring                                                        0.019754
emergency                                                   0.017158
combo_iud_pill                                              0.011818
combo_barrier_pill                                          0.008159
patch                                                       0.008130
combo_implant_pill                                          0.005456
combo_implant_iud                                           0.003846
combo_emergency_pill                                        0.003696
sterilization                     

In [551]:
len(comments_df.index)

492713

In [552]:
# comments_df = comments_df[(comments_df['text_type'].isin(['pill', 'iud', 'implant'])) | ((comments_df['text_type'] == 'unknown') & (comments_df['parent_type'].isin(['pill', 'iud', 'implant'])))]

def add_parent_type(r):
    if r['text_type'] == 'unknown':
        return r['parent_type']
    return r['text_type']

comments_df['text_type'] = comments_df.apply(add_parent_type, axis=1)
comments_df = comments_df[comments_df['text_type'].isin(['pill', 'iud', 'implant'])]

len(comments_df.index)

345371

In [553]:
309366/492713

0.6278827633936997

In [554]:
# for i, r in comments_df[comments_df['text_type'] == 'implant'].sample(10).iterrows():
#     print(r['body'])
#     print()
#     print('============================================')
#     print()

In [555]:
id_type_dict = {r['id']: r['text_type'] for i, r in comments_df.iterrows()}
pickle.dump(id_type_dict, open(data_directory_path + '/labeling/reddit_comments.id_type_dict.pickle', 'wb'))

### Get the year and month and remove comments whose years cannot be found.

In [556]:
comments_df['year'] = comments_df['created_utc'].apply(get_year)
comments_df['month'] = comments_df['created_utc'].apply(get_month)
comments_df = comments_df[(comments_df['year'] != 'Unknown') & (comments_df['year'] != 1970)]
len(comments_df.index)

345371

In [557]:
comments_df['year'].value_counts()

2020    97970
2019    80610
2018    55147
2017    38356
2016    25015
2015    19588
2014    14310
2013     9048
2012     4889
2011      438
Name: year, dtype: int64

### Remove comments by the OP, stickied comments, and comments where the user was removed.

In [558]:
comments_df = comments_df[(comments_df['is_submitter']) != True & (comments_df['stickied'] == False) & (comments_df['user_removed'] != False)]
len(comments_df.index)

291280

### Remove comments without a date

In [559]:
comments_df = comments_df.dropna(subset=['created_utc'])
len(comments_df.index)

291280

### Remove short comments.

In [560]:
def get_num_tokens(text):
    if pd.isnull(text):
        return 0
    return len(text.split())

comments_df['num_tokens'] = comments_df['body'].apply(get_num_tokens)

In [561]:
# The token length requirement also handles deleted comments, which are replaced with the single "[deleted]" token.
comments_df = comments_df[(comments_df['num_tokens'] >= 3)]
len(comments_df.index)

279158

### Remove duplicate comments

In [562]:
comments_df = comments_df.drop_duplicates(subset='body')
len(comments_df)

277299

### Drop all the unnecessary columns, rename columns, add source column

In [563]:
comments_df = comments_df[['id', 'parent_id', 'created_utc', 'body', 'tokens_text', 'text_type', 'year', 'month']]
comments_df = comments_df.rename(columns={'body': 'text'})
comments_df['source'] = 'reddit-comments'

In [564]:
comments_df['year'].value_counts()

2020    70958
2019    63532
2018    40456
2017    34065
2016    23979
2015    18439
2014    13047
2013     8073
2012     4364
2011      386
Name: year, dtype: int64

### Remove November-December 2020 to match WebMD

In [565]:
comments_df['year'].value_counts()

2020    70958
2019    63532
2018    40456
2017    34065
2016    23979
2015    18439
2014    13047
2013     8073
2012     4364
2011      386
Name: year, dtype: int64

In [566]:
comments_df = comments_df[~((comments_df['year'] == 2020) & (comments_df['month'].isin([11, 12])))]
len(comments_df.index)

264912

In [567]:
comments_df['year'].value_counts()

2019    63532
2020    58571
2018    40456
2017    34065
2016    23979
2015    18439
2014    13047
2013     8073
2012     4364
2011      386
Name: year, dtype: int64

### Final dataframe

In [568]:
len(comments_df.index)

264912

In [569]:
comments_df.sample(3)

Unnamed: 0,id,parent_id,created_utc,text,tokens_text,text_type,year,month,source
515,c6kp01d,t1_c6k1nhj,1349920388,"Ah okay, thanks. I did a bit more googling aft...",ah okay thanks bit googling posting question s...,pill,2012,10,reddit-comments
6560,ewkfp2k,t1_ewjrzkr,1565483173,Yeah you definitely want them long. I had a n...,yeah definitely want long nurse practitioner i...,iud,2019,8,reddit-comments
2414,e4k1y52,t3_990i79,1534829918,"From what I've learned, placebo is supposed to...",learned placebo supposed period people sex period,pill,2018,8,reddit-comments


In [570]:
comments_df['text_type'].value_counts()

iud        117631
pill       117283
implant     29998
Name: text_type, dtype: int64

In [571]:
comments_df['year'].value_counts()

2019    63532
2020    58571
2018    40456
2017    34065
2016    23979
2015    18439
2014    13047
2013     8073
2012     4364
2011      386
Name: year, dtype: int64

In [572]:
comments_df.to_csv(data_directory_path + '/final-data/reddit_comments.csv')

In [573]:
comments_df.sample(3)

Unnamed: 0,id,parent_id,created_utc,text,tokens_text,text_type,year,month,source
2453,d525xyl,t1_d524m86,1467851061,"Yeah, I've never heard of getting heavier peri...",yeah never heard getting heavier periods inser...,iud,2016,7,reddit-comments
3341,dyyltim,t3_8jc7k5,1526308931,I'm terrified and I just got mine put in about...,m terrified got mine put half hour ago experie...,pill,2018,5,reddit-comments
5459,dwnj54b,t3_88ucx1,1522630653,I'll start by saying that I am not a medical p...,start saying medical professional read lot ins...,pill,2018,4,reddit-comments


In [574]:
# target_df = comments_df[comments_df['parent_type'] != 'unknown']
# for i, r in target_df[target_df['text_type'] != target_df['parent_type']].sample(10).iterrows():
#     print('Text Type:', r['text_type'])
#     print('Parent Type:', r['parent_type'])
#     print(' '.join(r['text'].split()))
#     print()

In [575]:
# target_df = comments_df[comments_df['text_type'] == 'unknown']
# for i, r in target_df.sample(10).iterrows():
#     print('Text Type:', r['text_type'])
#     print('Parent Type:', r['parent_type'])
#     print(' '.join(r['text'].split()))
#     print()