In [2]:
from collections import defaultdict
from datetime import datetime
import dill
from itertools import permutations, combinations
import json
from operator import itemgetter
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

import little_mallet_wrapper as lmw

In [3]:
# Machine-specific absolute locations of the project's input data and generated output.
data_directory_path = '/Users/maria/Documents/data/birth-control'
output_directory_path = '/Users/maria/Documents/output/birth-control'

# The topic files read by the loading cell live in a sub-folder of the output directory.
topics_directory_path = output_directory_path + '/topics-by-leann/revision-final'

<br><br><br><br>

# **Load topics**

In [4]:
def _read_nonblank_lines(path):
    """Return every line of the text file at `path` that is not whitespace-only.

    Lines are returned exactly as read (trailing newline included). The file is
    opened inside a `with` block so the handle is closed as soon as reading ends.
    """
    with open(path, 'r') as _file:
        return [_line for _line in _file if _line.strip()]


# Per-source containers, keyed by source name ('reddit', 'twitter', 'webmd').
source_topic_keys_dict = {}      # source -> list of topics, each a list of up to 10 words
source_distributions_dict = {}   # source -> list of documents, each a list of string values
source_training_dict = {}        # source -> list of document texts

for _source in ['reddit', 'twitter', 'webmd']:

    _source_directory_path = topics_directory_path + '/' + _source

    # Third tab-separated field of each line, split on whitespace, first 10 tokens kept.
    # NOTE(review): the '.35' suffix presumably denotes a 35-topic model - confirm.
    source_topic_keys_dict[_source] = [
        _line.split('\t')[2].split()[:10]
        for _line in _read_nonblank_lines(_source_directory_path + '/mallet.topic_keys.35')
    ]

    # Every tab-separated field from the third onward, kept as strings (the last
    # one still carries the line's newline); float() is applied where they are used.
    source_distributions_dict[_source] = [
        _line.split('\t')[2:]
        for _line in _read_nonblank_lines(_source_directory_path + '/mallet.topic_distributions.35')
    ]

    # Drop the first two whitespace-separated tokens of each line and re-join the rest
    # with single spaces. NOTE(review): presumably those two tokens are an id and a
    # label column - confirm against the training file format.
    source_training_dict[_source] = [
        ' '.join(_line.split()[2:])
        for _line in _read_nonblank_lines(_source_directory_path + '/training.txt')
    ]

# Sanity check: number of reddit topics, then distribution/document counts per source.
len(source_topic_keys_dict['reddit']), len(source_distributions_dict['reddit']), len(source_training_dict['reddit']), len(source_distributions_dict['twitter']), len(source_training_dict['twitter']), len(source_distributions_dict['webmd']), len(source_training_dict['webmd'])

(35, 48000, 48000, 48000, 48000, 2640, 2640)

In [5]:
# Display the first loaded reddit document as a quick check of the parsed text.
source_training_dict['reddit'][0]

'girlfriend NUM years old sexually active majority relationship implant never unprotected sex ejeculated near vagina today unprotected sex first time couldn pull time realize pull method effective help piece mind currently implant left arm nexplanon super bad anxiety currently medicate worst anxiety situation graduated got accepted one countries best pre law programs need little comfort want worry seems bothered sorry rant induced freaking words advice would appreciated'

In [6]:
# Print each source's topics: a ruled header with the source name, then one
# comma-separated line of top words per topic, then two blank lines.
_rule = '-----------------------------------------------'

for _source in source_topic_keys_dict:
    # Header block: rule, source name, rule, blank line.
    print(_rule, _source, _rule, '', sep='\n')

    for _keys in source_topic_keys_dict[_source]:
        print(', '.join(_keys))

    # A single print of '\n' emits the two trailing blank lines.
    print('\n')

-----------------------------------------------
reddit
-----------------------------------------------

pregnancy, test, pregnant, negative, NUM, period, tests, symptoms, sex, take
pill, NUM, sex, day, took, last, days, period, take, week
condoms, sex, use, pregnant, condom, pregnancy, using, effective, boyfriend, method
www, https, http, antibiotics, NUM, control, birth, use, pill, effectiveness
infection, yeast, infections, iud, antibiotics, uti, symptoms, also, issues, discharge
acne, hair, skin, NUM, loss, face, back, months, cystic, clear
iud, mirena, copper, hormonal, hormones, paragard, periods, want, get, would
pills, NUM, week, pill, pack, placebo, taking, take, period, start
like, NUM, day, feel, nausea, night, feeling, stomach, felt, got
know, like, really, anyone, get, would, thanks, implant, getting, want
bleeding, nexplanon, stop, irregular, months, implant, bleed, periods, pill, take
insurance, NUM, planned, parenthood, get, health, would, pay, covered, parents
weight, N

In [17]:
# Build one row per (source, topic) for manual labeling: the topic's top words
# plus the texts of the documents with the highest probability for that topic.

# Number of highest-probability documents shown per topic.
_N_TOP_DOCS = 10

dicts_to_label = []
for _source, _topic_keys in source_topic_keys_dict.items():

    for _topic, _keys in enumerate(_topic_keys):

        # Map each unique document text to its probability for this topic.
        # The same text can appear more than once in the training data; keep the
        # HIGHEST probability seen for it, so a later lower-scoring copy does not
        # overwrite an earlier higher-scoring one and push the text out of the top list.
        _doc_probability_dict = {}
        for _distribution, _document in zip(source_distributions_dict[_source], source_training_dict[_source]):
            _probability = float(_distribution[_topic])
            if _document not in _doc_probability_dict or _probability > _doc_probability_dict[_document]:
                _doc_probability_dict[_document] = _probability

        # Highest probability first; sorted() is stable, so ties keep first-seen order.
        _ranked_docs = sorted(_doc_probability_dict.items(), key=lambda x: x[1], reverse=True)
        _top_docs = '\n\n'.join([_doc for _doc, _probability in _ranked_docs[:_N_TOP_DOCS]])

        dicts_to_label.append({'source': _source,
                               'topic_index': _topic,
                               'top_words': ', '.join(_keys),
                               'top_docs': _top_docs})

df_to_label = pd.DataFrame(dicts_to_label)

# Number of rows: one per (source, topic) pair.
len(df_to_label.index)

105

In [18]:
# Spot-check 5 randomly chosen rows; no random_state is passed, so the rows shown differ between runs.
df_to_label.sample(5)

Unnamed: 0,source,topic_index,top_words,top_docs
72,webmd,2,"blood, pressure, high, yeast, infection, clots...",num put apri help regulate periods etc using d...
27,reddit,27,"NUM, period, got, days, since, month, started,...",got nexplanon implant august NUM NUM months go...
28,reddit,28,"arm, implant, nexplanon, feel, insertion, got,...",got nexplanon pretty easy kind nervous though ...
16,reddit,16,"side, effects, effect, nexplanon, implant, exp...",good idea long term side effects NUM months us...
25,reddit,25,"experience, stories, people, experiences, read...",glad posted read million horror stories nexpla...


In [19]:
# Write the labeling table to a CSV file in the output directory.
_topics_to_label_path = output_directory_path + '/topics_to_label.csv'
df_to_label.to_csv(_topics_to_label_path)

In [63]:
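# NOTE: every line of this cell is commented out, so running it as-is prints nothing.
# The "35 / 35 / 35" output recorded below the cell was presumably produced before
# the code was commented out (one label per topic, 35 topics per source).
# reddit_strings = """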

#                 pregnancy tests, symptoms & pregnancy, test, pregnant, negative, period \\
#         missed pills, unprotected sex & pill, sex, day, took, last \\
        
#         questions about condom-use, pregnancy concerns & condoms, sex, pregnant, condom, pregnancy \\
#         informative outlinks, efficacy & www, antibiotics, pill, effectiveness, reddit \\
        
#         infections & infection, yeast, infections, iud, antibiotics \\
#         acne, skin and hair concerns & acne, hair, skin, loss, face \\
        
#         iud, hormonal vs. copper & iud, mirena, copper, hormonal, hormones \\
#         starting new packs, skipping pills & pills, week, pill, pack, placebo \\
        
#         nausea, upset stomach & day, feel, nausea, night, feeling \\
#         implant experience, advice-seeking & know, anyone, thanks, implant, getting \\
        
#         implant irregular bleeding, spotting & bleeding, nexplanon, stop, irregular, months \\
#         costs, insurance, planned parenthood & insurance, planned, parenthood, health, pay \\
        
#         weight gain, losing weight & weight, gain, gained, pounds, lost \\
#         bleeding, spotting, discharge & blood, bleeding, period, spotting, discharge \\
        
#         pain cramping, cysts & pain, cramps, anyone, else, cysts \\
#         iud string concerns & iud, strings, feel, cervix, check \\
        
#         discussion of side effects & side, effects, effect, nexplanon, implant \\
#         anxiety, depression, mood swings & feel, anxiety, mood, depression, time \\
        
#         considering new methods & pill, pills, taking, hormonal, take \\
#         menstrual products & cup, menstrual, iud, using, tampons \\
        
#         missing pills, effectiveness & pill, take, time, taking, day \\
#         hormone dosages & pill, estrogen, dose, pills, progestin \\
        
#         doctor appointments, interactions & doctor, said, told, appointment, went \\
#         switching pills, generics & pill, taking, tri, switched, generic \\
        
#         blood clots, migraines, risk of stroke & pill, migraines, blood, risk, estrogen \\
#         experience-sharing, rebutting negative stories & experience, stories, people, experiences, read \\
        
#         iud insertion experience & pain, insertion, iud, cramps, felt \\
#         changes in period, spotting & period, days, since, month, started \\
        
#         implant insertion experience & arm, implant, nexplanon, feel, insertion \\
#         implant removal & nexplanon, implant, years, removed, months \\
        
#         method efficacy & effective, women, implant, nexplanon, years \\
#         changes in sex drive & sex, drive, libido, want, feel \\
        
#         ovulation & ovulation, cycle, ovulate, fertility, days \\
#         recommending to talk with doctor & doctor, think, body, good, know \\
        
#         heavy, light period, spotting, time periods & period, periods, months, days, first \\

#                  """

# twitter_strings = """

#                         implant experiences & arm, nexplanon, implant, implanon, rod \\
#         iud insertion experiences & iud, pain, insertion, uterus, doctor \\
        
#         iud discourse and humor & iud, thanks, baby, day, today \\
#         iud viral stories & baby, iud, born, fingers, tiny \\
        
#         male contraception development & pill, contraceptive, male, men, scientists \\
#         experience-sharing & nexplanon, implanon, implant, best, arm \\
        
#         informative news stories & iud, women, iuds, contraception, long \\
#         pill viral quotes & pill, pregnancy, best, thing, put \\
        
#         iud political news & iud, abortion, anti, must, healthcare \\
#         iud viral stories  & implant, contraceptive, school, girl, without \\
        
#         costs, insurance, planned parenthood & iud, insurance, free, planned, health \\
#         pill viral tweets & pill, take, implant, taking, time \\
        
#         pill informative news & pill, male, bit, new, men \\
#         iud misc tweets & iud, god, thank, bless, house \\
        
#         weight gain, loss & weight, gain, nexplanon, gained, implanon \\
#         FDA news, Woody Allen quote & oral, contraceptive, contraception, fda, new \\
        
#         iud discourse, humor & iud, looks, ied, sex, make \\
#         contraceptive studies & contraceptive, risk, oral, pill, cancer \\
        
#         contraception misinformation & pill, contraceptive, women, oral, first \\
#         FDA announcements & implant, fda, essure, safety, review \\
        
#         pill viral quote & pill, men, male, take, makes \\
#         upcoming iud procedure & iud, getting, nexplanon, today, appointment \\
        
#         pill news & pill, dies, djerassi, father, carl \\
#         iud viral tweets & iud, know, tubes, need, pregnant \\
        
#         changes in periods, cramps & iud, nexplanon, period, periods, cramps \\
#         adverse side effects & side, effects, pill, effect, contraceptive \\
        
#         misc tweets & years, effective, iud, rate, implanon \\
#         humor & iud, implanon, nexplanon, lol, shit \\
        
#         iud informative tweets & iud, copper, device, intrauterine, hormone \\
#         implant complications & implant, contraceptive, essure, bayer, remote \\
        
#         abortion, contraception, political discourse & pill, iud, women, abortion, contraceptive \\
#         LARC recommendations, advice & iud, nexplanon, one, know, people \\
        
#         changes in period, implant & years, nexplanon, period, months, year \\
#         emergency contraception & contraceptive, pill, oral, pills, contraception \\
        
#         LARC discourse, news  & iud, implanon, question, bit, post \\

#                   """

# webmd_strings = """

#                 rebutting negative reviews  & reviews, read, reading, experience, people \\
        
#         pregnancy-prevention & medication, pregnancy, preventing, effective, medications \\
#         blood clots, blood pressure, stroke, migraines & blood, pressure, high, yeast, infection \\
        
#         implant insertion/removal experience & implant, arm, nexplanon, shot, depo \\
#         changes in sex drive & sex, drive, low, husband, zero \\
        
#         positive experiences & take, love, little, everyday, thing \\
#         acne and skin concerns & acne, face, skin, never, cystic \\
        
#         irregular periods, bleeding, highly recommended & periods, recommend, years, product, long \\
#         anxiety, depression, panic attacks, mood swings & anxiety, nexplanon, depression, implant, began \\
        
#         leg, back pain, stomach aches & pain, pains, back, stomach, legs \\
#         anxiety, depression, mood swings, fatigue, weight gain & mood, swings, weight, depression, gain \\
        
#         years, months, duration of use & years, year, almost, old, one \\
#         switching pills, generic & switched, generic, pharmacy, years, ortho \\
        
#         iud, ovarian cysts, surgery & pain, mirena, removed, iud, painful \\
#         heavy bleeding, spotting & heavy, tampons, wear, needed, pads \\
        
#         pill acne, headaches, breast pain, cramps & pill, taking, noticed, acne, breast \\
#         risk-benefit analysis & cannot, low, dose, gyn, device \\
        
#         iud insertion experience & pain, insertion, cramping, iud, inserted \\
#         experience-sharing & months, recommend, form, different, taking \\
        
#         life interferences & feel, time, never, feeling, made \\
#         bleeding, spotting, changes in cycle & months, period, days, periods, weeks \\
        
#         comments on pregnancy-prevention & pregnant, time, getting, pregnancy, yet \\
#         doctor visits, communication & doctor, never, said, bad, could \\
        
#         doctor interactions, visits & back, going, work, still, even \\
#         presence/absence of "side effects" & side, effects, effect, headaches, drug \\
        
#         implant changes in cycle, bleeding & bleeding, since, months, implanon, removed \\
#         weight gain, losing weight & weight, gained, gain, pounds, lbs \\
        
#         bleeding, cramps & worse, cramps, bleeding, bad, super \\
#         side effects after taking the pill & pill, taking, pills, started, take \\
        
#         iud string concerns & pain, strings, went, find, told \\
#         users with children & mirena, put, years, inserted, child \\
        
#         costs, insurance & insurance, loestrin, pay, cost, cover \\
#         hair loss, facial hair growth & hair, loss, started, falling, facial \\
        
#         experience-sharing pt 2 & day, take, every, one, well \\
#         spotting for extended periods of time & period, month, first, two, spotting \\

#                 """

# source_string_dict = {}

# source_string_dict['reddit'] = [s.strip() for s in reddit_strings.split('\n') if s.strip()]
# source_string_dict['reddit']  = [s.split('&')[0].strip() for s in source_string_dict['reddit'] ]

# source_string_dict['twitter'] = [s.strip() for s in twitter_strings.split('\n') if s.strip()]
# source_string_dict['twitter']  = [s.split('&')[0].strip() for s in source_string_dict['twitter'] ]

# source_string_dict['webmd'] = [s.strip() for s in webmd_strings.split('\n') if s.strip()]
# source_string_dict['webmd']  = [s.split('&')[0].strip() for s in source_string_dict['webmd'] ]

# for _source, _string in source_string_dict.items():
#         print(len(_string))

35
35
35
