In [90]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
from operator import itemgetter
import os
import pickle
import random
import re
import subprocess
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from scipy.spatial.distance import jensenshannon

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

import tomotopy as tp

In [91]:
# Root directories for inputs and outputs. The defaults are the original
# machine-specific locations; set BIRTH_CONTROL_DATA_DIR and/or
# BIRTH_CONTROL_OUTPUT_DIR to run the notebook elsewhere without editing this cell.
data_directory_path   = os.environ.get('BIRTH_CONTROL_DATA_DIR', '/Volumes/Passport-1/data/birth-control')
output_directory_path = os.environ.get('BIRTH_CONTROL_OUTPUT_DIR', '/Volumes/Passport-1/output/birth-control')

<br><br>

# Load datasets

In [3]:
# Read the five per-source CSV exports from the final-data folder.
_final_data_path = data_directory_path + '/final-data'

reddit_posts_df = pd.read_csv(_final_data_path + '/reddit_posts.csv')
reddit_comments_df = pd.read_csv(_final_data_path + '/reddit_comments.csv')
webmd_df = pd.read_csv(_final_data_path + '/webmd.csv')
twitter_posts_df = pd.read_csv(_final_data_path + '/twitter_posts.csv')
twitter_replies_df = pd.read_csv(_final_data_path + '/twitter_replies.csv')

In [4]:
# Stack every source into one frame; each frame keeps its own row index labels.
_source_frames = [reddit_posts_df, reddit_comments_df, twitter_posts_df, twitter_replies_df, webmd_df]
combined_df = pd.concat(_source_frames)
combined_df.shape[0]

1063672

In [5]:
# Count the rows contributed by each source.
combined_df['source'].value_counts()

twitter-posts      499796
reddit-comments    264912
twitter-replies    211896
reddit-posts        68958
webmd-reviews       18110
Name: source, dtype: int64

In [6]:
# Preview three random rows (no random_state is passed, so the rows differ between runs).
combined_df.sample(3)

Unnamed: 0.2,Unnamed: 0,id,created_utc,text,title,year,month,url,link_flair_text,tokens_text,...,date,conversation_id,retweet_count,reply_count,like_count,quote_count,num_tokens,Unnamed: 0.1,Unnamed: 0.1.1,name
45918,68,cxdtem,1567152000.0,This past Wednesday I had my kyleena inserted ...,Thank god i did not read any IUD insertions st...,2019,8.0,https://www.reddit.com/r/birthcontrol/comments...,Experience,thank god read iud insertions stories getting ...,...,,,,,,,,,,
95335,4212,dibvdfu,1496343000.0,"If you are opposed to hormonal BC, then the co...",,2017,6.0,,,opposed hormonal bc copper iud really best opt...,...,,,,,,,,,,
127412,155109,577142315317374976,,Women deserve the full range of repro options....,,2015,3.0,,,women deserve full range repro options iud get...,...,2015-03-15T16:20:25.000Z,5.771423e+17,1.0,0.0,0.0,0.0,14.0,,,


<br><br>

# Sample data by birth control type (`text_type`)

In [7]:
# Draw 800 rows per text_type from Twitter posts and from replies, then stack and persist them.
_twitter_samples = [_frame.groupby('text_type').sample(n=800, random_state=1)
                    for _frame in (twitter_posts_df, twitter_replies_df)]
sampled_twitter_df = pd.concat(_twitter_samples)
sampled_twitter_df.to_csv(data_directory_path + '/final-data/topic-modeling/twitter.ready_for_modeling.csv')
len(sampled_twitter_df)

4800

In [8]:
# Draw 800 rows per text_type from Reddit posts and from comments, then stack and persist them.
_reddit_samples = [_frame.groupby('text_type').sample(n=800, random_state=1)
                   for _frame in (reddit_posts_df, reddit_comments_df)]
sampled_reddit_df = pd.concat(_reddit_samples)
sampled_reddit_df.to_csv(data_directory_path + '/final-data/topic-modeling/reddit.ready_for_modeling.csv')
len(sampled_reddit_df)

4800

In [9]:
# Draw 800 WebMD reviews per text_type and persist them.
_webmd_by_type = webmd_df.groupby('text_type')
sampled_webmd_df = _webmd_by_type.sample(n=800, random_state=1)
sampled_webmd_df.to_csv(data_directory_path + '/final-data/topic-modeling/webmd.ready_for_modeling.csv')
len(sampled_webmd_df)

2400

<br><br><br><br>

# Train topic models

In [10]:
def prepare_authorless_sampling(topics_output_directory_path, num_topics, input_df):
    """Write the training-text and vocabulary files for one community.

    Two files are created inside ``topics_output_directory_path``:

    * ``authorless.texts.<num_topics>.txt`` -- one line per row of ``input_df``,
      formatted as ``id<TAB>text_type<TAB>tokens_text``.
    * ``authorless.vocab.<num_topics>.txt`` -- every distinct whitespace-separated
      token of ``tokens_text``, one per line, in order of first appearance.

    Parameters
    ----------
    topics_output_directory_path : str
        Directory to write into (it is not created here).
    num_topics : int
        Only used to build the two file names.
    input_df : pandas.DataFrame
        Must contain the columns ``id``, ``text_type`` and ``tokens_text``.

    Returns
    -------
    None

    Notes
    -----
    Every value is passed through ``str()`` and written unescaped, so a missing
    ``tokens_text`` becomes the literal token ``nan`` and a tab or newline inside
    a value would break the one-line-per-document layout.
    """

    training_path = topics_output_directory_path + '/authorless.texts.' + str(num_topics) + '.txt'
    vocab_path = topics_output_directory_path + '/authorless.vocab.' + str(num_topics) + '.txt'

    # Pull the three columns directly instead of materialising a Series per row.
    texts = input_df['tokens_text'].tolist()
    ids = input_df['id'].tolist()
    bc_methods = input_df['text_type'].tolist()

    # dict keys keep insertion order, so the vocabulary is listed first-seen first.
    vocab = list(dict.fromkeys(w for t in texts for w in str(t).split()))

    # Context managers close the files even if a write fails part-way through.
    with open(training_path, 'w') as output_file:
        for _text, _id, _method in zip(texts, ids, bc_methods):
            output_file.write(str(_id) + '\t' + str(_method) + '\t' + str(_text) + '\n')

    with open(vocab_path, 'w') as output_file:
        for w in vocab:
            output_file.write(w + '\n')


def run_topic_modeling(topics_output_directory_path, num_topics, seed=None):
    """Train an LDA model on the downsampled texts of one community.

    Reads ``authorless.texts.downsampled.<num_topics>.txt`` (one
    ``id<TAB>method<TAB>text`` line per document) from
    ``topics_output_directory_path``, trains a tomotopy ``LDAModel`` for 100
    iterations, and writes two files to the same directory:
    ``topics.<num_topics>.mdl.bin`` (the saved model) and
    ``keywords.<num_topics>.txt`` (the ten top words of each topic, one topic
    per line). The same keyword lines are also printed.

    Parameters
    ----------
    topics_output_directory_path : str
        Directory holding the downsampled text file; outputs are written here too.
    num_topics : int
        Number of topics ``k``; also part of every file name.
    seed : int, optional
        Forwarded to ``tp.LDAModel`` when given so that a run can be repeated.
        With the default ``None`` the model is built exactly as before.
        NOTE(review): full reproducibility may also depend on the number of
        training workers -- confirm against the tomotopy documentation.

    Returns
    -------
    (top_words, word_distribution) : tuple of lists
        ``top_words[k]`` is the list of the ten highest-ranked words of topic k.
        ``word_distribution[k]`` is a list of ``(word, probability)`` tuples for
        topic k, sorted alphabetically by word.

    Raises
    ------
    ValueError
        If a line of the input file has fewer than three tab-separated fields.
    """

    downsampled_path = topics_output_directory_path + '/authorless.texts.downsampled.' + str(num_topics) + '.txt'

    documents = []
    with open(downsampled_path, 'r') as downsampled_file:
        for _line in downsampled_file:
            # maxsplit=2 keeps a stray tab inside the text instead of failing the unpack.
            _id, _method, _text = _line.rstrip('\n').split('\t', 2)
            documents.append(_text.split())

    model_kwargs = {'k': num_topics}
    if seed is not None:
        model_kwargs['seed'] = seed
    mdl = tp.LDAModel(**model_kwargs)
    for _words in documents:
        mdl.add_doc(_words)

    # 100 iterations in ten chunks of ten; the chunking leaves room for progress logging.
    for _ in range(10):
        mdl.train(10)
        # print('Log-likelihood: {}'.format(mdl.ll_per_word))

    mdl.save(topics_output_directory_path + '/topics.' + str(num_topics) + '.mdl.bin')

    top_words = []
    word_distribution = []

    with open(topics_output_directory_path + '/keywords.' + str(num_topics) + '.txt', 'w') as keyword_file:
        for k in range(mdl.k):

            _word_prob_tuples = mdl.get_topic_words(k, top_n=10)
            _keywords = ', '.join([w for w, p in _word_prob_tuples])
            print('Topic ' + str(k) + ': ' + _keywords)
            keyword_file.write(_keywords + '\n')

            # Request the whole topic-word distribution.
            # NOTE(review): num_words is used as the upper bound for top_n; confirm it is
            # never smaller than the vocabulary size (tomotopy also exposes num_vocabs).
            _word_prob_tuples = mdl.get_topic_words(k, top_n=mdl.num_words)
            top_words.append([w for w, p in _word_prob_tuples[:10]])
            # Alphabetical order gives every topic of this model the same word order.
            word_distribution.append([(w, p) for w, p in sorted(_word_prob_tuples, key=lambda x: x[0])])

    return (top_words, word_distribution)

In [11]:
# Number of LDA topics trained per community; it is also embedded in every topic-model file name.
num_topics = 30

In [17]:
# Write the text and vocabulary files for each community into its own output folder.
for _community_name, _sampled_df in [('reddit', sampled_reddit_df),
                                     ('twitter', sampled_twitter_df),
                                     ('webmd', sampled_webmd_df)]:
    prepare_authorless_sampling(output_directory_path + '/new-topics-by-me/' + _community_name,
                                num_topics,
                                _sampled_df)

In [13]:
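# Run ./run_authorless_tms.sh on the command line before executing the next cell.
# run_topic_modeling reads authorless.texts.downsampled.<num_topics>.txt, which no cell
# in this notebook writes (presumably the script produces it from the files written above).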

In [18]:
# Train one model per community; each call prints its topics' keywords under the banner.
_topics_root = output_directory_path + '/new-topics-by-me'

print('REDDIT')
_reddit_result = run_topic_modeling(_topics_root + '/reddit', num_topics)
reddit_top_words, reddit_word_distribution = _reddit_result

print('\nTWITTER')
_twitter_result = run_topic_modeling(_topics_root + '/twitter', num_topics)
twitter_top_words, twitter_word_distribution = _twitter_result

print('\nWEBMD')
_webmd_result = run_topic_modeling(_topics_root + '/webmd', num_topics)
webmd_top_words, webmd_word_distribution = _webmd_result

REDDIT
Topic 0: got, doctor, said, told, one, went, put, really, first, never
Topic 1: acne, anxiety, skin, depression, bad, months, worse, also, help, go
Topic 2: pregnancy, pregnant, sex, test, negative, know, inside, tests, worried, even
Topic 3: pill, NUM, pills, take, week, taking, day, pack, days, took
Topic 4: hormonal, m, bc, experience, mood, experiences, side, want, effects, people
Topic 5: side, effects, pill, effect, mini, estrogen, progestin, low, hormone, progesterone
Topic 6: im, like, symptoms, feel, dont, usually, may, could, sometimes, well
Topic 7: iud, strings, feel, pain, check, cervix, uterus, ultrasound, place, right
Topic 8: get, doctor, m, appointment, told, make, said, another, sure, see
Topic 9: insertion, NUM, pain, took, experience, cramps, felt, iud, got, painful
Topic 10: implant, nexplanon, m, like, know, want, anyone, would, one, something
Topic 11: like, feel, time, didn, bit, really, would, got, back, much
Topic 12: weight, gain, implanon, side, effec

<br><br><br><br>

# Find Jensen-Shannon divergence between all topics

In [19]:
# Every topic of a model lists the same words, so topic 0 is enough to read off each vocabulary.
reddit_vocab = [w for w, p in reddit_word_distribution[0]]
twitter_vocab = [w for w, p in twitter_word_distribution[0]]
webmd_vocab = [w for w, p in webmd_word_distribution[0]]

# Membership is tested against sets (constant time) rather than lists; the result is
# the same list as before: the WebMD words, in WebMD order, that also occur in the other two.
_reddit_vocab_set = set(reddit_vocab)
_twitter_vocab_set = set(twitter_vocab)
shared_vocab = [w for w in webmd_vocab if w in _reddit_vocab_set and w in _twitter_vocab_set]

len(shared_vocab)

3244

In [20]:
# Keep only the (word, probability) pairs whose word is in the shared vocabulary.
# A set makes each membership test constant time; the filtered lists are unchanged.
_shared_vocab_set = set(shared_vocab)

filtered_reddit_word_distribution = [[(w,p) for (w,p) in _distribution if w in _shared_vocab_set] for _distribution in reddit_word_distribution]
filtered_twitter_word_distribution = [[(w,p) for (w,p) in _distribution if w in _shared_vocab_set] for _distribution in twitter_word_distribution]
filtered_webmd_word_distribution = [[(w,p) for (w,p) in _distribution if w in _shared_vocab_set] for _distribution in webmd_word_distribution]

len(filtered_reddit_word_distribution), len(filtered_twitter_word_distribution), len(filtered_webmd_word_distribution)

(30, 30, 30)

In [21]:
# Sanity check: after filtering, topics from different communities should have the same number of entries.
len(filtered_reddit_word_distribution[0]), len(filtered_twitter_word_distribution[1]), len(filtered_webmd_word_distribution[2])

(3244, 3244, 3244)

In [22]:
# Spot check that the same position holds the same word in all three communities.
for _position in range(500, 510):
    _reddit_entry = filtered_reddit_word_distribution[0][_position]
    _twitter_entry = filtered_twitter_word_distribution[1][_position]
    _webmd_entry = filtered_webmd_word_distribution[2][_position]
    print(_reddit_entry, _twitter_entry, _webmd_entry)

('clear', 1.344364250144281e-06) ('clear', 6.901168035255978e-06) ('clear', 3.013400601048488e-06)
('cleared', 1.344364250144281e-06) ('cleared', 6.901168035255978e-06) ('cleared', 3.013400601048488e-06)
('clearly', 1.344364250144281e-06) ('clearly', 6.901168035255978e-06) ('clearly', 3.013400601048488e-06)
('click', 1.344364250144281e-06) ('click', 6.901168035255978e-06) ('click', 3.013400601048488e-06)
('climbing', 1.344364250144281e-06) ('climbing', 6.901168035255978e-06) ('climbing', 3.013400601048488e-06)
('clinic', 1.344364250144281e-06) ('clinic', 6.901168035255978e-06) ('clinic', 3.013400601048488e-06)
('clinical', 1.344364250144281e-06) ('clinical', 6.901168035255978e-06) ('clinical', 3.013400601048488e-06)
('clock', 1.344364250144281e-06) ('clock', 6.901168035255978e-06) ('clock', 3.013400601048488e-06)
('close', 1.344364250144281e-06) ('close', 6.901168035255978e-06) ('close', 3.013400601048488e-06)
('closely', 1.344364250144281e-06) ('closely', 6.901168035255978e-06) ('clos

In [23]:
# Probability vectors per community and topic, built once instead of inside the loop.
# The filtered distributions share one alphabetical word order, so positions line up.
_probability_vectors = {
    'reddit': [np.array([p for (w, p) in _distribution]) for _distribution in filtered_reddit_word_distribution],
    'twitter': [np.array([p for (w, p) in _distribution]) for _distribution in filtered_twitter_word_distribution],
    'webmd': [np.array([p for (w, p) in _distribution]) for _distribution in filtered_webmd_word_distribution],
}
# (reddit, twitter), (reddit, webmd), (twitter, webmd)
_community_pairs = list(combinations(_probability_vectors, 2))

# Divergences between the topics that share an index k in the three models.
# NOTE(review): the three models are trained independently, so topic k of one community
# has no built-in correspondence to topic k of another; these values compare arbitrary
# topic pairs. They are kept because the cells below read this dict.
topic_divergences_dict = defaultdict(list)
for k in range(0, num_topics):
    for _community1, _community2 in _community_pairs:
        topic_divergences_dict[k].append(jensenshannon(_probability_vectors[_community1][k],
                                                       _probability_vectors[_community2][k]))

# Divergence between every topic of one community and every topic of another:
# pairwise_topic_divergences[(a, b)][i, j] compares topic i of community a with topic j of
# community b, so the closest counterpart of a topic can be looked up with argmin.
pairwise_topic_divergences = {}
for _community1, _community2 in _community_pairs:
    pairwise_topic_divergences[(_community1, _community2)] = np.array(
        [[jensenshannon(_p, _q) for _q in _probability_vectors[_community2]]
         for _p in _probability_vectors[_community1]])

len(topic_divergences_dict), len(topic_divergences_dict[0])

(30, 3)

In [24]:
# Rank topic indices by mean divergence (highest first) and show the five lowest.
_ranked_topics = sorted(topic_divergences_dict.items(), key=lambda x: np.mean(x[1]), reverse=True)
for _topic, _divergences in _ranked_topics[-5:]:
    print(_topic, np.mean(_divergences), _divergences, sep='\n')
    for _label, _community_top_words in (('Reddit:', reddit_top_words),
                                         ('Twitter:', twitter_top_words),
                                         ('WebMD:', webmd_top_words)):
        print(_label, ' '.join(_community_top_words[_topic]))
    print()

18
0.7599219776096247
[0.7698436068949125, 0.7646427198381593, 0.7452796060958019]
Reddit: nexplanon NUM bleeding months got implant since anyone year removed
Twitter: iud like copper hormones get one better love body make
WebMD: im like ive dont NUM cant get alot didnt moody

25
0.7559485794071588
[0.7283577980861338, 0.7907556367152896, 0.7487323034200529]
Reddit: get know good think getting re people say read one
Twitter: iud nexplanon would getting get made got thing lol lmao
WebMD: started body bad headaches doctor feeling get never migraines still

27
0.7413139457825154
[0.7017789727016788, 0.7731165473584376, 0.74904631728743]
Reddit: m NUM since time got last thanks nexplanon years hey
Twitter: NUM nexplanon implant years like get implanon arm one got
WebMD: use easy period get one went product completely five wasn

9
0.7402659603876874
[0.7224855225311801, 0.7536567880734265, 0.7446555705584553]
Reddit: insertion NUM pain took experience cramps felt iud got painful
Twitter: iu

In [25]:
# Rank topic indices by mean divergence (highest first) and show the five highest.
_ranked_topics = sorted(topic_divergences_dict.items(), key=lambda x: np.mean(x[1]), reverse=True)
for _topic, _divergences in _ranked_topics[:5]:
    print(_topic, np.mean(_divergences), sep='\n')
    for _label, _community_top_words in (('Reddit:', reddit_top_words),
                                         ('Twitter:', twitter_top_words),
                                         ('WebMD:', webmd_top_words)):
        print(_label, ' '.join(_community_top_words[_topic]))
    print()

24
0.813546184471219
Reddit: women ovulation NUM //www https ovulate studies contraceptive http rate
Twitter: birth control pill rt u men makes pop recall pull
WebMD: acne face skin clear also broke issue iâ bad happy

12
0.808748831527323
Reddit: weight gain implanon side effects gained lose pounds issues loss
Twitter: //t http contraceptive pill https oral risk cancer may male
WebMD: iud paragard inserted hormones painful little pain know let free

20
0.8033948289972357
Reddit: m get like feel really go getting months know ago
Twitter: //t https birth control implant http via essure safety fda
WebMD: pill bc taking weight pills gain take also love breast

0
0.8033256647981504
Reddit: got doctor said told one went put really first never
Twitter: pill women contraceptive also many health need taking use pills
WebMD: mood swings depression like severe feel pms symptoms crazy life

22
0.8012963067731803
Reddit: period NUM days bleeding spotting week normal weeks first day
Twitter: //t ht

<br><br>

# Remove "clout" tweets

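Remove tweets that are not retweets, quote tweets, or exact duplicates, but that nonetheless repeat a joke told by many other users. (The cells in this section are currently commented out, so this filter is not applied.)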

In [15]:
# joke_texts = ['fists closed',
#               'fist closed',
#               'born laughingj',
#               'put in her mouth', 
#               'second best thing',
#               '2nd best thing',
#               'woody allen',
#               'i asked a girl to',
#               'a fast word about oral contraception',
#               'a terrific story about',
#               'bullets out of a gun',
#               'bulletproof vest',
#               'popped a molly',
#               'fancy app']

In [153]:
# joke_texts = ['fancy app']
# jokes = []
# for i, r in twitter_posts_df.iterrows():
#     if any(_text in r['text'].lower() for _text in joke_texts):
#         jokes.append(r['text'])
# len(jokes)

In [154]:
# for j in random.sample(jokes, 5):
#     print(j)
#     print()

In [16]:
# print(len(twitter_posts_df.index), len(twitter_replies_df.index))

# twitter_posts_df = twitter_posts_df[~twitter_posts_df.text.str.contains('|'.join(joke_texts), case=False)]
# twitter_replies_df = twitter_replies_df[~twitter_replies_df.text.str.contains('|'.join(joke_texts), case=False)]

# print(len(twitter_posts_df.index), len(twitter_replies_df.index))

# Create word intrusion task

In [92]:
# CSV of labeled topics; the cells below use its 'Topic', 'Top Words' and 'Community' columns.
top_words_path = output_directory_path + '/topics-by-leann/topic_top_words.csv'

In [93]:
# Load the labeled topics. 'Top Words' holds each word list as text, which later cells split by hand.
top_words_df = pd.read_csv(top_words_path)

In [94]:
# Preview three random rows (no random_state is passed, so the rows differ between runs).
top_words_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Topic,Top Words,Community
7,7,"starting new packs, skipping pills","['pills', 'NUM', 'week', 'pill', 'pack', 'plac...",reddit
41,41,acne and skin concerns,"['acne', 'face', 'skin', 'never', 'also', 'cys...",webmd
102,102,"changes in period, implant","['NUM', 'years', 'nexplanon', 'got', 'period',...",twitter


In [100]:
# Words dropped from every topic's word list before display
# (the same filter appears in the other word-list cells of this notebook).
_excluded_words = {'num', 'birth', 'control', 'use', 'get', 'would', 'got', 'http', 'https',
                   'like', 'also', 'ever', 'however', 'haven', 'really', 'dont', 'ive'}

# Print one LaTeX table row ("label & five words \\") per WebMD topic.
for _row_index, _row in top_words_df.iterrows():

    if _row['Community'] != 'webmd':
        continue

    # 'Top Words' is a list rendered as text: peel off the brackets, then the quotes.
    _raw_entries = _row['Top Words'].strip('[').strip(']').split(',')
    _top_words = [w.strip().strip("'") for w in _raw_entries]
    _top_words = [w for w in _top_words if w.lower() not in _excluded_words][:5]

    # Shade the rows whose DataFrame index label is even.
    if _row_index % 2 == 0:
        print('\\rowcolor[gray]{0.9}')
    print(_row['Topic'] + ' & ' + ', '.join(_top_words) + ' \\\\')

rebutting negative reviews  & reviews, read, reading, experience, people \\
\rowcolor[gray]{0.9}
pregnancy-prevention efficacy & medication, pregnancy, preventing, effective, medications \\
blood clots, blood pressure, stroke, migraines & blood, pressure, high, yeast, infection \\
\rowcolor[gray]{0.9}
implant insertion/removal experience & implant, arm, nexplanon, shot, depo \\
changes in sex drive & sex, drive, low, husband, zero \\
\rowcolor[gray]{0.9}
positive experiences & take, love, little, everyday, thing \\
acne and skin concerns & acne, face, skin, never, cystic \\
\rowcolor[gray]{0.9}
irregular periods, bleeding, highly recommended & periods, recommend, years, product, long \\
anxiety, depression, panic attacks, mood swings & anxiety, nexplanon, depression, implant, began \\
\rowcolor[gray]{0.9}
leg, back pain, stomach aches & pain, pains, back, stomach, legs \\
anxiety, depression, mood swings, fatigue, weight gain & mood, swings, weight, depression, gain \\
\rowcolor[gray]{

In [74]:
# Build the word-intrusion task: for each topic, four of its five top words plus one
# "intruder" word taken from the top words of the other topics in the same community.

# Words dropped from every topic's word list before sampling
# (the same filter appears in the other word-list cells of this notebook).
_excluded_words = {'num', 'birth', 'control', 'use', 'get', 'would', 'got', 'http', 'https',
                   'like', 'also', 'ever', 'however', 'haven', 'really', 'dont', 'ive'}

# Dedicated, seeded generator so that re-running the cell regenerates the same task;
# the seed matches the random_state=1 used for the sampling cells of this notebook.
_intrusion_rng = random.Random(1)

dicts_for_labeling = []

for _community in top_words_df['Community'].unique():

    print()
    print(_community)
    print()

    _df = top_words_df[top_words_df['Community'] == _community]

    # Parse and filter the word list of every topic in this community.
    _top_words_lists = []
    for i, r in _df.iterrows():
        # 'Top Words' is a list rendered as text: peel off the brackets, then the quotes.
        _top_words = [w.strip().strip("'") for w in r['Top Words'].strip('[').strip(']').split(',')]
        _top_words = [w for w in _top_words if w.lower() not in _excluded_words]
        _top_words_lists.append(_top_words)

    # Intruder pool: the first ten remaining words of every topic (repeats are kept,
    # so a word that is prominent in several topics is more likely to be drawn).
    _all_words = [w for _top_words in _top_words_lists for w in _top_words[:10]]
    for i, _top_words in enumerate(_top_words_lists):
        # An intruder must not occur anywhere in this topic's own word list.
        _intruder_candidates = [w for w in _all_words if w not in _top_words]
        _intruder = _intrusion_rng.sample(_intruder_candidates, 1)[0]
        _sampled_words = _intrusion_rng.sample(_top_words[:5], 4)
        _sampled_words.append(_intruder)
        _intrusion_rng.shuffle(_sampled_words)
        print(_sampled_words, _intruder)

        # 'Topic' is the topic's position within its community, not its row label.
        dicts_for_labeling.append({'Community': _community,
                                   'Topic': i,
                                   'Word List': ', '.join(_sampled_words),
                                   'Intruder Word': _intruder})

df_for_labeling = pd.DataFrame(dicts_for_labeling)


reddit

['pregnant', 'day', 'pregnancy', 'test', 'negative'] day
['sex', 'took', 'pill', 'day', 'bleeding'] bleeding
['condoms', 'sex', 'pregnant', 'pregnancy', 'bleeding'] bleeding
['studies', 'pill', 'effectiveness', 'antibiotics', 'reddit'] studies
['antibiotics', 'infection', 'iud', 'infections', 'pregnant'] pregnant
['loss', 'bruising', 'skin', 'face', 'hair'] bruising
['mirena', 'iud', 'hormones', 'start', 'copper'] start
['pills', 'pack', 'bleeding', 'pill', 'week'] bleeding
['day', 'feel', 'boobs', 'nausea', 'night'] boobs
['implant', 'anyone', 'pill', 'getting', 'know'] pill
['months', 'nexplanon', 'bleeding', 'pregnant', 'irregular'] pregnant
['pay', 'parenthood', 'planned', 'cysts', 'health'] cysts
['lost', 'pill', 'pounds', 'gained', 'weight'] pill
['blood', 'effective', 'spotting', 'period', 'discharge'] effective
['pain', 'cysts', 'didn', 'anyone', 'else'] didn
['feel', 'iud', 'check', 'progesterone', 'cervix'] progesterone
['nexplanon', 'low', 'implant', 'effect', 'effe

In [75]:
# Preview ten random rows of the labeling task (no random_state is passed, so the rows differ between runs).
df_for_labeling.sample(10)

Unnamed: 0,Community,Topic,Word List,Intruder Word
40,webmd,5,"went, take, love, little, everyday",went
23,reddit,23,"might, pill, generic, switched, taking",might
16,reddit,16,"nexplanon, low, implant, effect, effects",low
98,twitter,28,"hormone, time, copper, device, intrauterine",time
99,twitter,29,"implant, essure, remote, contraceptive, best",best
8,reddit,8,"day, feel, boobs, nausea, night",boobs
29,reddit,29,"removed, mood, years, nexplanon, months",mood
24,reddit,24,"risk, insertion, migraines, pill, estrogen",insertion
68,webmd,33,"day, well, began, one, every",began
81,twitter,11,"hormonal, time, pill, taking, take",hormonal


In [76]:
# df_for_labeling.to_csv(output_directory_path + '/topics-by-leann/intruder_test.katherine.csv')

In [67]:
# NOTE(review): unlabeled arithmetic -- presumably a word-intrusion accuracy of the form
# (35 topics - 8 misses) / 35; confirm which annotator/community this refers to and label it.
(35-8) / 35

0.7714285714285715

In [62]:
# NOTE(review): unlabeled arithmetic -- presumably a word-intrusion accuracy of the form
# (35 topics - 17 misses) / 35; confirm which annotator/community this refers to and label it.
(35-17) / 35

0.5142857142857142

In [68]:
# NOTE(review): presumably the total number of labeled topics (35 per community x 3 communities) -- confirm.
35*3

105

In [78]:
# NOTE(review): unlabeled arithmetic -- presumably a word-intrusion accuracy of the form
# (35 topics - 12 misses) / 35; confirm which annotator/community this refers to and label it.
(35-12) / 35

0.6571428571428571

In [79]:
# NOTE(review): unlabeled arithmetic -- presumably a word-intrusion accuracy of the form
# (35 topics - 19 misses) / 35; confirm which annotator/community this refers to and label it.
(35-19) / 35

0.45714285714285713

In [80]:
# NOTE(review): unlabeled arithmetic -- presumably a word-intrusion accuracy of the form
# (35 topics - 9 misses) / 35; confirm which annotator/community this refers to and label it.
(35-9) / 35

0.7428571428571429

In [81]:
# English stopword list plus a few short fragments ('s', 't', 've', 'll', 'amp').
# In the visible part of this notebook it is only joined into a display string.
# NOTE(review): presumably mirrors the list used when `tokens_text` was produced -- confirm.
STOPS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
         'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
         'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
         'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
         'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
         'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
         'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
         'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
         'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
         'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
         'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
         'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 've', 'll', 'amp']

In [82]:
# Render the stopword list as one comma-separated string.
', '.join(STOPS)

'i, me, my, myself, we, our, ours, ourselves, you, your, yours, yourself, yourselves, he, him, his, himself, she, her, hers, herself, it, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, should, now, ve, ll, amp'

In [83]:
# Render the extra display-filter words as one comma-separated string.
# Note: unlike the filter lists used in the word-list cells, this copy omits 'num'.
', '.join(['birth', 'control', 'use', 'get', 'would', 'got', 'http', 'https', 'like', 'also', 'ever', 'however', 'haven', 'really', 'dont', 'ive'])

'birth, control, use, get, would, got, http, https, like, also, ever, however, haven, really, dont, ive'

In [89]:
# Words dropped from every topic's word list before display
# (the same filter appears in the other word-list cells of this notebook).
_excluded_words = {'num', 'birth', 'control', 'use', 'get', 'would', 'got', 'http', 'https',
                   'like', 'also', 'ever', 'however', 'haven', 'really', 'dont', 'ive'}

# For each community, print the filtered word list of every topic.
for _community in top_words_df['Community'].unique():

    # Community name framed by blank lines.
    print('', _community, '', sep='\n')

    _df = top_words_df[top_words_df['Community'] == _community]

    _top_words_lists = []
    for i, r in _df.iterrows():
        # 'Top Words' is a list rendered as text: peel off the brackets, then the quotes.
        _raw_entries = r['Top Words'].strip('[').strip(']').split(',')
        _top_words = [w.strip().strip("'") for w in _raw_entries]
        _top_words = [w for w in _top_words if w.lower() not in _excluded_words]
        print(_top_words)


reddit

['pregnancy', 'test', 'pregnant', 'negative', 'period', 'tests', 'symptoms', 'sex', 'take', 'weeks', 'took', 'could', 'late', 'days', 'week', 'since', 'know']
['pill', 'sex', 'day', 'took', 'last', 'days', 'period', 'take', 'week', 'taking', 'today', 'started', 'night', 'unprotected', 'morning', 'first', 'pills', 'time', 'plan']
['condoms', 'sex', 'pregnant', 'condom', 'pregnancy', 'using', 'effective', 'boyfriend', 'method', 'used', 'pill', 'time', 'still', 'inside', 'without', 'know', 'want', 'partner']
['www', 'antibiotics', 'pill', 'effectiveness', 'reddit', 'antibiotic', 'contraceptive', 'effective', 'taking', 'nih', 'nlm', 'make', 'ncbi', 'less']
['infection', 'yeast', 'infections', 'iud', 'antibiotics', 'uti', 'symptoms', 'issues', 'discharge', 'could', 'vagina', 'smell', 'cause', 'tested', 'never', 'vaginal', 'thyroid', 'caused']
['acne', 'hair', 'skin', 'loss', 'face', 'back', 'months', 'cystic', 'clear', 'pill', 'years', 'started', 'hormonal', 'mirena', 'worse', 'get