# Instructions

Use "prodigyEnv" conda environment for this notebook.

To set up Prodigy environment, download the wheel file from the Prodigy email (which you receive after purchasing a license). 

Then run `pip install ./prodigy*.whl`

Instructions: https://prodi.gy/docs/install

Database is stored at /

<br><br>

# Imports

In [55]:
from collections import defaultdict
import random
import re

import pandas as pd
from prodigy.components.db import connect

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='ticks', font_scale=1.2)

In [56]:
def sort_by_mean(df, by, column, rot=0):
    # use dict comprehension to create new dataframe from the iterable groupby object
    # each group name becomes a column in the new dataframe
    df2 = pd.DataFrame({col:vals[column] for col, vals in df.groupby(by)})
    # find and sort the median values in this new dataframe
    means = df2.mean().sort_values()
    # use the columns in the dataframe, ordered sorted by median value
    # return axes so changes can be made outside the function
#     return df2[meds.index].boxplot(rot=rot, return_type="axes")
    return means

<br><br><br><br>

---

<br><br><br><br>


# Connect to database

In [100]:
db = connect()

db.datasets # This will list all of your prodigy databases

['bc-reddit-posts',
 'bc-reddit-comments',
 'bc-twitter-posts',
 'bc-twitter-replies',
 'discourse-webmd-reviews',
 'discourse-twitter-replies',
 'discourse-reddit-posts',
 'discourse-twitter-posts',
 'discourse-reddit-comments']

In [101]:
# db.drop_dataset('discourse-reddit-comments')  # Only do this if you want to delete all your annotations!!!!!!!!!!!

<br><br><br><br>

---

<br><br><br><br>

# Explore REDDIT posts

In [102]:
examples = db.get_dataset('discourse-reddit-posts')

print(len(examples))

104


In [103]:
label_count_dict = defaultdict(int)
method_label_count_dict = defaultdict(lambda: defaultdict(int))
label_texts_dict = defaultdict(list)
for e in examples:
    for _label in e['accept']:
        label_count_dict[_label] += 1
        method_label_count_dict[e['meta']['Method']][_label] += 1
        label_texts_dict[_label].append(e['text'])
    if len(e['accept']) < 1:
        label_count_dict['NONE'] += 1
        label_texts_dict['NONE'].append(e['text'])

print('------------------------------------------------------')
print('total number of posts labeled')
print('------------------------------------------------------')
print()
for _label, _count in sorted(label_count_dict.items(), key=lambda x: x[1], reverse=True):
    print(_count, '\t', _label)

------------------------------------------------------
total number of posts labeled
------------------------------------------------------

37 	 SHARING EXPERIENCES
15 	 NONE
14 	 SEEKING INFORMATION
12 	 SHARING CAUSAL REASONING / HYPOTHESIZING
11 	 SHARING FUTURE PLANS
8 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
7 	 SHARING NEGATIVE EMOTIONS
7 	 SHARING OPINIONS AND PREFERENCES
6 	 SHARING PERSONAL BACKGROUND
5 	 SEEKING EXPERIENCES
2 	 SEEKING ADVICE
2 	 SEEKING NORMALITY
1 	 SHARING ADVICE
1 	 SHARING INFORMATION
1 	 META DISCUSSION
1 	 SEEKING EMOTIONAL SUPPORT
1 	 SHARING NORMALITY
1 	 SHARING POSITIVE EMOTIONS


In [104]:
for _method, _label_count_dict in method_label_count_dict.items():
    print('--------------------------------')
    print(_method)
    print('--------------------------------')
    for _label, _count in sorted(_label_count_dict.items(), key=lambda x: x[1], reverse=True):
        print(_count, '\t', _label)
    print()

--------------------------------
pill
--------------------------------
12 	 SHARING EXPERIENCES
6 	 SHARING CAUSAL REASONING / HYPOTHESIZING
4 	 SHARING PERSONAL BACKGROUND
4 	 SEEKING INFORMATION
3 	 SHARING FUTURE PLANS
2 	 SHARING OPINIONS AND PREFERENCES
2 	 SHARING NEGATIVE EMOTIONS
1 	 SHARING ADVICE
1 	 SHARING INFORMATION
1 	 SEEKING EXPERIENCES
1 	 SEEKING EMOTIONAL SUPPORT
1 	 SEEKING NORMALITY
1 	 SHARING NORMALITY

--------------------------------
iud
--------------------------------
12 	 SHARING EXPERIENCES
7 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
5 	 SHARING FUTURE PLANS
4 	 SEEKING EXPERIENCES
4 	 SEEKING INFORMATION
3 	 SHARING CAUSAL REASONING / HYPOTHESIZING
2 	 SHARING PERSONAL BACKGROUND
2 	 SEEKING ADVICE
1 	 SHARING NEGATIVE EMOTIONS
1 	 META DISCUSSION
1 	 SHARING POSITIVE EMOTIONS

--------------------------------
implant
--------------------------------
13 	 SHARING EXPERIENCES
6 	 SEEKING INFORMATION
5 	 SHARING OPINIONS AND PREFERENCES
4 	 SHARING NEGATIVE 

In [105]:
label_percent_dict = {_label: _count/float(len(examples)) for _label, _count in label_count_dict.items()}

print('------------------------------')
print('percent of posts with label')
print('------------------------------')
print()
for _label, _percent in sorted(label_percent_dict.items(), key=lambda x: x[1], reverse=True):
    print(str(round(_percent*100, 1)) + '%', '\t', label_count_dict[_label], '\t', _label)

------------------------------
percent of posts with label
------------------------------

35.6% 	 37 	 SHARING EXPERIENCES
14.4% 	 15 	 NONE
13.5% 	 14 	 SEEKING INFORMATION
11.5% 	 12 	 SHARING CAUSAL REASONING / HYPOTHESIZING
10.6% 	 11 	 SHARING FUTURE PLANS
7.7% 	 8 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
6.7% 	 7 	 SHARING NEGATIVE EMOTIONS
6.7% 	 7 	 SHARING OPINIONS AND PREFERENCES
5.8% 	 6 	 SHARING PERSONAL BACKGROUND
4.8% 	 5 	 SEEKING EXPERIENCES
1.9% 	 2 	 SEEKING ADVICE
1.9% 	 2 	 SEEKING NORMALITY
1.0% 	 1 	 SHARING ADVICE
1.0% 	 1 	 SHARING INFORMATION
1.0% 	 1 	 META DISCUSSION
1.0% 	 1 	 SEEKING EMOTIONAL SUPPORT
1.0% 	 1 	 SHARING NORMALITY
1.0% 	 1 	 SHARING POSITIVE EMOTIONS


In [106]:
for _label, _texts in label_texts_dict.items():
    if _label == 'SHARING CAUSAL REASONING / HYPOTHESIZING':
        print('------------------------------------------')
        print(_label)
        print('------------------------------------------')
        print()
        for e in _texts:
            print(' '.join(e.split()))

------------------------------------------
SHARING CAUSAL REASONING / HYPOTHESIZING
------------------------------------------

It lasted for a few weeks but I figured it was because of the change .
But I know it will probably take up to 6 months for my body to re-adjust.
I thought I would be fine since I was on the depo so long beforehand, and didn't even realize this could possibly be a symptom of the implant.
This lasted for MONTHS, so I read somewhere online that vitamin e and zinc help with this, and it did stop the bleeding for a couple weeks, but I just started spotting again today.
I'm wondering if this is a Mirena crash or just really bad PMS since my period is due Monday.
I am taking the pill continuously to not have a period but this spotting is pretty much a period.
And seeing as I wasn’t on it long I’m assuming my libido should be back to normal in a few days?
I also feel that the days before my period I struggle with body dysmorphia and generally low self-esteem.
Probably

<br><br>

# Explore REDDIT comments

In [127]:
examples = db.get_dataset('discourse-reddit-comments')

print(len(examples))

100


In [128]:
label_count_dict = defaultdict(int)
method_label_count_dict = defaultdict(lambda: defaultdict(int))
label_texts_dict = defaultdict(list)
for e in examples:
    for _label in e['accept']:
        label_count_dict[_label] += 1
        method_label_count_dict[e['meta']['Method']][_label] += 1
        label_texts_dict[_label].append(e['text'])
    if len(e['accept']) < 1:
        label_count_dict['NONE'] += 1
        label_texts_dict['NONE'].append(e['text'])

print('------------------------------------------------------')
print('total number of posts labeled')
print('------------------------------------------------------')
print()
for _label, _count in sorted(label_count_dict.items(), key=lambda x: x[1], reverse=True):
    print(_count, '\t', _label)

------------------------------------------------------
total number of posts labeled
------------------------------------------------------

23 	 SHARING EXPERIENCES
21 	 SHARING INFORMATION
13 	 NONE
9 	 SHARING ADVICE
8 	 SHARING OPINIONS AND PREFERENCES
5 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
5 	 SEEKING EXPERIENCES
5 	 META DISCUSSION
4 	 SHARING CAUSAL REASONING / HYPOTHESIZING
4 	 SHARING NORMALITY
3 	 SHARING EMOTIONAL SUPPORT
3 	 SHARING FUTURE PLANS
2 	 SEEKING INFORMATION
1 	 SHARING NEGATIVE EMOTIONS
1 	 SHARING PERSONAL BACKGROUND


In [129]:
for _method, _label_count_dict in method_label_count_dict.items():
    print('--------------------------------')
    print(_method)
    print('--------------------------------')
    for _label, _count in sorted(_label_count_dict.items(), key=lambda x: x[1], reverse=True):
        print(_count, '\t', _label)
    print()

--------------------------------
implant
--------------------------------
9 	 SHARING INFORMATION
7 	 SHARING EXPERIENCES
3 	 META DISCUSSION
2 	 SEEKING EXPERIENCES
2 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
2 	 SHARING CAUSAL REASONING / HYPOTHESIZING
1 	 SHARING OPINIONS AND PREFERENCES
1 	 SHARING ADVICE
1 	 SHARING FUTURE PLANS
1 	 SEEKING INFORMATION
1 	 SHARING NORMALITY

--------------------------------
iud
--------------------------------
7 	 SHARING EXPERIENCES
6 	 SHARING INFORMATION
5 	 SHARING OPINIONS AND PREFERENCES
3 	 SEEKING EXPERIENCES
2 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
2 	 SHARING FUTURE PLANS
2 	 SHARING NORMALITY
2 	 META DISCUSSION
1 	 SHARING ADVICE
1 	 SHARING CAUSAL REASONING / HYPOTHESIZING
1 	 SHARING EMOTIONAL SUPPORT
1 	 SHARING PERSONAL BACKGROUND

--------------------------------
pill
--------------------------------
9 	 SHARING EXPERIENCES
7 	 SHARING ADVICE
6 	 SHARING INFORMATION
2 	 SHARING OPINIONS AND PREFERENCES
2 	 SHARING EMOTIONAL SUPPO

In [130]:
label_percent_dict = {_label: _count/float(len(examples)) for _label, _count in label_count_dict.items()}

print('------------------------------')
print('percent of posts with label')
print('------------------------------')
print()
for _label, _percent in sorted(label_percent_dict.items(), key=lambda x: x[1], reverse=True):
    print(str(round(_percent*100, 1)) + '%', '\t', label_count_dict[_label], '\t', _label)

------------------------------
percent of posts with label
------------------------------

23.0% 	 23 	 SHARING EXPERIENCES
21.0% 	 21 	 SHARING INFORMATION
13.0% 	 13 	 NONE
9.0% 	 9 	 SHARING ADVICE
8.0% 	 8 	 SHARING OPINIONS AND PREFERENCES
5.0% 	 5 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
5.0% 	 5 	 SEEKING EXPERIENCES
5.0% 	 5 	 META DISCUSSION
4.0% 	 4 	 SHARING CAUSAL REASONING / HYPOTHESIZING
4.0% 	 4 	 SHARING NORMALITY
3.0% 	 3 	 SHARING EMOTIONAL SUPPORT
3.0% 	 3 	 SHARING FUTURE PLANS
2.0% 	 2 	 SEEKING INFORMATION
1.0% 	 1 	 SHARING NEGATIVE EMOTIONS
1.0% 	 1 	 SHARING PERSONAL BACKGROUND


In [132]:
for _label, _texts in label_texts_dict.items():
    if _label == 'SHARING/DESCRIBING ADDITIONAL RESEARCH':
        print('------------------------------------------')
        print(_label)
        print('------------------------------------------')
        print()
        for e in _texts:
            print(' '.join(e.split()))

------------------------------------------
SHARING/DESCRIBING ADDITIONAL RESEARCH
------------------------------------------

My doctor hasn't been concerned by it.
Reading/ watching experience stories has become my nightly routine lol
I really, really want them to do that for me if I decide to get an IUD, especially since I can't take NSAIDS at all, but I haven't heard of anyone getting it.
I’ve just noticed that method seems to have more complaints than others.
Most of the ladies I talked to had their periods for longer than 3 weeks but...


<br><br>

# Explore TWITTER posts

In [112]:
examples = db.get_dataset('discourse-twitter-posts')

print(len(examples))

100


In [113]:
label_count_dict = defaultdict(int)
method_label_count_dict = defaultdict(lambda: defaultdict(int))
label_texts_dict = defaultdict(list)
for e in examples:
    for _label in e['accept']:
        label_count_dict[_label] += 1
        method_label_count_dict[e['meta']['Method']][_label] += 1
        label_texts_dict[_label].append(e['text'])
    if len(e['accept']) < 1:
        label_count_dict['NONE'] += 1
        label_texts_dict['NONE'].append(e['text'])

print('------------------------------------------------------')
print('total number of posts labeled')
print('------------------------------------------------------')
print()
for _label, _count in sorted(label_count_dict.items(), key=lambda x: x[1], reverse=True):
    print(_count, '\t', _label)

------------------------------------------------------
total number of posts labeled
------------------------------------------------------

24 	 META DISCUSSION
21 	 NONE
15 	 SHARING EXPERIENCES
15 	 SHARING INFORMATION
14 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
7 	 SEEKING INFORMATION
5 	 SHARING CAUSAL REASONING / HYPOTHESIZING
5 	 SHARING FUTURE PLANS
5 	 SHARING OPINIONS AND PREFERENCES
4 	 SHARING NEGATIVE EMOTIONS
2 	 SHARING ADVICE
2 	 SHARING PERSONAL BACKGROUND
2 	 SEEKING EXPERIENCES


In [114]:
for _method, _label_count_dict in method_label_count_dict.items():
    print('--------------------------------')
    print(_method)
    print('--------------------------------')
    for _label, _count in sorted(_label_count_dict.items(), key=lambda x: x[1], reverse=True):
        print(_count, '\t', _label)
    print()

--------------------------------
implant
--------------------------------
9 	 SHARING EXPERIENCES
6 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
5 	 SHARING INFORMATION
5 	 META DISCUSSION
4 	 SHARING CAUSAL REASONING / HYPOTHESIZING
4 	 SHARING FUTURE PLANS
3 	 SHARING NEGATIVE EMOTIONS
3 	 SEEKING INFORMATION
2 	 SHARING ADVICE
1 	 SHARING OPINIONS AND PREFERENCES
1 	 SEEKING EXPERIENCES

--------------------------------
iud
--------------------------------
10 	 META DISCUSSION
5 	 SHARING EXPERIENCES
4 	 SHARING OPINIONS AND PREFERENCES
3 	 SHARING INFORMATION
2 	 SHARING PERSONAL BACKGROUND
2 	 SEEKING INFORMATION
1 	 SHARING FUTURE PLANS
1 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
1 	 SEEKING EXPERIENCES
1 	 SHARING CAUSAL REASONING / HYPOTHESIZING
1 	 SHARING NEGATIVE EMOTIONS

--------------------------------
pill
--------------------------------
9 	 META DISCUSSION
7 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
7 	 SHARING INFORMATION
2 	 SEEKING INFORMATION
1 	 SHARING EXPERIENCES



In [115]:
label_percent_dict = {_label: _count/float(len(examples)) for _label, _count in label_count_dict.items()}

print('------------------------------')
print('percent of posts with label')
print('------------------------------')
print()
for _label, _percent in sorted(label_percent_dict.items(), key=lambda x: x[1], reverse=True):
    print(str(round(_percent*100, 1)) + '%', '\t', label_count_dict[_label], '\t', _label)

------------------------------
percent of posts with label
------------------------------

24.0% 	 24 	 META DISCUSSION
21.0% 	 21 	 NONE
15.0% 	 15 	 SHARING EXPERIENCES
15.0% 	 15 	 SHARING INFORMATION
14.0% 	 14 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
7.0% 	 7 	 SEEKING INFORMATION
5.0% 	 5 	 SHARING CAUSAL REASONING / HYPOTHESIZING
5.0% 	 5 	 SHARING FUTURE PLANS
5.0% 	 5 	 SHARING OPINIONS AND PREFERENCES
4.0% 	 4 	 SHARING NEGATIVE EMOTIONS
2.0% 	 2 	 SHARING ADVICE
2.0% 	 2 	 SHARING PERSONAL BACKGROUND
2.0% 	 2 	 SEEKING EXPERIENCES


In [116]:
for _label, _texts in label_texts_dict.items():
    if _label == 'SHARING INFORMATION':
        print('------------------------------------------')
        print(_label)
        print('------------------------------------------')
        print()
        for e in _texts:
            print(' '.join(e.split()))

------------------------------------------
SHARING INFORMATION
------------------------------------------

(1960) First contraceptive pill made available for women, who can now make their https://t.co/xdjj2owDmY https://t.co/LfSuOtTsmB
It cause a lot of hormonal imbalances.
http://t.co/fWeL2X2M IUD Beats Pill at Preventing Pregnancy - WebMD: http://t.co/vpcDmI3X IUD Beats… http://t.co/UvM4gSDM
RT New Birth Control Pill Beyaz Includes Folic Acid, Columnist Writes http://bit.ly/bFCMNr #contraception #prochoice
A male version of the #IUD may finally be on the way!
Unlike some other methods, the contraceptive implant is not affected by common antibiotics, diarrhoea or vomiting.
Future reproductive lifespan may be lessened in oral contraceptive users: Lower measures of ovarian reserve: http://t.co/QD0Yj8msqB
http://t.co/LZybfYES The contraceptive pill could reduce risk of ovarian and uterine cancer in nuns
"Bayer halves the price of its contraceptive implant Jadelle® for women in developing

<br><br>

# Explore Twitter REPLIES

In [117]:
examples = db.get_dataset('discourse-twitter-replies')

print(len(examples))

109


In [118]:
label_count_dict = defaultdict(int)
method_label_count_dict = defaultdict(lambda: defaultdict(int))
label_texts_dict = defaultdict(list)
for e in examples:
    for _label in e['accept']:
        label_count_dict[_label] += 1
        method_label_count_dict[e['meta']['Method']][_label] += 1
        label_texts_dict[_label].append(e['text'])
    if len(e['accept']) < 1:
        label_count_dict['NONE'] += 1
        label_texts_dict['NONE'].append(e['text'])

print('------------------------------------------------------')
print('total number of posts labeled')
print('------------------------------------------------------')
print()
for _label, _count in sorted(label_count_dict.items(), key=lambda x: x[1], reverse=True):
    print(_count, '\t', _label)

------------------------------------------------------
total number of posts labeled
------------------------------------------------------

30 	 SHARING EXPERIENCES
26 	 META DISCUSSION
24 	 NONE
15 	 SHARING INFORMATION
8 	 SHARING OPINIONS AND PREFERENCES
4 	 SHARING PERSONAL BACKGROUND
3 	 SHARING FUTURE PLANS
3 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
3 	 SHARING ADVICE
2 	 SEEKING INFORMATION
2 	 SEEKING EXPERIENCES
1 	 SHARING CAUSAL REASONING / HYPOTHESIZING
1 	 SHARING NEGATIVE EMOTIONS


In [119]:
for _method, _label_count_dict in method_label_count_dict.items():
    print('--------------------------------')
    print(_method)
    print('--------------------------------')
    for _label, _count in sorted(_label_count_dict.items(), key=lambda x: x[1], reverse=True):
        print(_count, '\t', _label)
    print()

--------------------------------
pill
--------------------------------
18 	 META DISCUSSION
7 	 SHARING EXPERIENCES
2 	 SHARING INFORMATION
1 	 SHARING PERSONAL BACKGROUND
1 	 SHARING CAUSAL REASONING / HYPOTHESIZING
1 	 SHARING OPINIONS AND PREFERENCES
1 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
1 	 SEEKING INFORMATION
1 	 SEEKING EXPERIENCES

--------------------------------
iud
--------------------------------
8 	 META DISCUSSION
6 	 SHARING INFORMATION
6 	 SHARING EXPERIENCES
4 	 SHARING OPINIONS AND PREFERENCES
1 	 SHARING FUTURE PLANS
1 	 SEEKING EXPERIENCES
1 	 SHARING ADVICE
1 	 SHARING NEGATIVE EMOTIONS
1 	 SHARING PERSONAL BACKGROUND

--------------------------------
implant
--------------------------------
17 	 SHARING EXPERIENCES
7 	 SHARING INFORMATION
3 	 SHARING OPINIONS AND PREFERENCES
2 	 SHARING FUTURE PLANS
2 	 SHARING PERSONAL BACKGROUND
2 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
2 	 SHARING ADVICE
1 	 SEEKING INFORMATION



In [120]:
label_percent_dict = {_label: _count/float(len(examples)) for _label, _count in label_count_dict.items()}

print('------------------------------')
print('percent of posts with label')
print('------------------------------')
print()
for _label, _percent in sorted(label_percent_dict.items(), key=lambda x: x[1], reverse=True):
    print(str(round(_percent*100, 1)) + '%', '\t', label_count_dict[_label], '\t', _label)

------------------------------
percent of posts with label
------------------------------

27.5% 	 30 	 SHARING EXPERIENCES
23.9% 	 26 	 META DISCUSSION
22.0% 	 24 	 NONE
13.8% 	 15 	 SHARING INFORMATION
7.3% 	 8 	 SHARING OPINIONS AND PREFERENCES
3.7% 	 4 	 SHARING PERSONAL BACKGROUND
2.8% 	 3 	 SHARING FUTURE PLANS
2.8% 	 3 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
2.8% 	 3 	 SHARING ADVICE
1.8% 	 2 	 SEEKING INFORMATION
1.8% 	 2 	 SEEKING EXPERIENCES
0.9% 	 1 	 SHARING CAUSAL REASONING / HYPOTHESIZING
0.9% 	 1 	 SHARING NEGATIVE EMOTIONS


In [121]:
for _label, _texts in label_texts_dict.items():
    if _label == 'SHARING OPINIONS AND PREFERENCES':
        print('------------------------------------------')
        print(_label)
        print('------------------------------------------')
        print()
        for e in _texts:
            print(' '.join(e.split()))

------------------------------------------
SHARING OPINIONS AND PREFERENCES
------------------------------------------

oh helll nahhhhhh I had one with hormones and that shit was horrible.
im just so reluctant to start meds bc it took so many attempts to get to a contraceptive pill that didnt fuck
oh bitch fuck nexplanon.
Miruiana sounds too close to the IUD I have
on my third mirena iud and can't say enough good things about it although I know my experiences aren't universal, but I haven't had my period in over ten years and I'm so grateful
I have the IUD now I’m want something different
Now I’m glad I didn’t.
yea girl nexplanon for life


<br><br><br><br>

# Explore WebMD reviews

In [122]:
examples = db.get_dataset('discourse-webmd-reviews')

print(len(examples))

100


In [123]:
label_count_dict = defaultdict(int)
method_label_count_dict = defaultdict(lambda: defaultdict(int))
label_texts_dict = defaultdict(list)
for e in examples:
    for _label in e['accept']:
        label_count_dict[_label] += 1
        method_label_count_dict[e['meta']['Method']][_label] += 1
        label_texts_dict[_label].append(e['text'])
    if len(e['accept']) < 1:
        label_count_dict['NONE'] += 1
        label_texts_dict['NONE'].append(e['text'])

print('------------------------------------------------------')
print('total number of posts labeled')
print('------------------------------------------------------')
print()
for _label, _count in sorted(label_count_dict.items(), key=lambda x: x[1], reverse=True):
    print(_count, '\t', _label)

------------------------------------------------------
total number of posts labeled
------------------------------------------------------

75 	 SHARING EXPERIENCES
22 	 SHARING OPINIONS AND PREFERENCES
7 	 SHARING PERSONAL BACKGROUND
6 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
6 	 SHARING FUTURE PLANS
3 	 SHARING NEGATIVE EMOTIONS
3 	 SHARING CAUSAL REASONING / HYPOTHESIZING
2 	 NONE
2 	 META DISCUSSION
1 	 SHARING INFORMATION
1 	 SHARING POSITIVE EMOTIONS
1 	 SHARING ADVICE


In [124]:
for _method, _label_count_dict in method_label_count_dict.items():
    print('--------------------------------')
    print(_method)
    print('--------------------------------')
    for _label, _count in sorted(_label_count_dict.items(), key=lambda x: x[1], reverse=True):
        print(_count, '\t', _label)
    print()

--------------------------------
pill
--------------------------------
23 	 SHARING EXPERIENCES
3 	 SHARING OPINIONS AND PREFERENCES
2 	 SHARING PERSONAL BACKGROUND
1 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
1 	 META DISCUSSION
1 	 SHARING FUTURE PLANS

--------------------------------
iud
--------------------------------
25 	 SHARING EXPERIENCES
7 	 SHARING OPINIONS AND PREFERENCES
4 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
4 	 SHARING PERSONAL BACKGROUND
2 	 SHARING NEGATIVE EMOTIONS
2 	 SHARING CAUSAL REASONING / HYPOTHESIZING
1 	 SHARING INFORMATION
1 	 SHARING POSITIVE EMOTIONS
1 	 SHARING FUTURE PLANS

--------------------------------
implant
--------------------------------
27 	 SHARING EXPERIENCES
12 	 SHARING OPINIONS AND PREFERENCES
4 	 SHARING FUTURE PLANS
1 	 SHARING CAUSAL REASONING / HYPOTHESIZING
1 	 META DISCUSSION
1 	 SHARING PERSONAL BACKGROUND
1 	 SHARING ADVICE
1 	 SHARING NEGATIVE EMOTIONS
1 	 SHARING/DESCRIBING ADDITIONAL RESEARCH



In [125]:
label_percent_dict = {_label: _count/float(len(examples)) for _label, _count in label_count_dict.items()}

print('------------------------------')
print('percent of posts with label')
print('------------------------------')
print()
for _label, _percent in sorted(label_percent_dict.items(), key=lambda x: x[1], reverse=True):
    print(str(round(_percent*100, 1)) + '%', '\t', label_count_dict[_label], '\t', _label)

------------------------------
percent of posts with label
------------------------------

75.0% 	 75 	 SHARING EXPERIENCES
22.0% 	 22 	 SHARING OPINIONS AND PREFERENCES
7.0% 	 7 	 SHARING PERSONAL BACKGROUND
6.0% 	 6 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
6.0% 	 6 	 SHARING FUTURE PLANS
3.0% 	 3 	 SHARING NEGATIVE EMOTIONS
3.0% 	 3 	 SHARING CAUSAL REASONING / HYPOTHESIZING
2.0% 	 2 	 NONE
2.0% 	 2 	 META DISCUSSION
1.0% 	 1 	 SHARING INFORMATION
1.0% 	 1 	 SHARING POSITIVE EMOTIONS
1.0% 	 1 	 SHARING ADVICE


In [126]:
for _label, _texts in label_texts_dict.items():
    if _label == 'SHARING/DESCRIBING ADDITIONAL RESEARCH':
        print('------------------------------------------')
        print(_label)
        print('------------------------------------------')
        print()
        for e in _texts:
            print(' '.join(e.split()))

------------------------------------------
SHARING/DESCRIBING ADDITIONAL RESEARCH
------------------------------------------

So if anyone is wondering, I have read all of these comments and they all apply.bloating, nausea, weight/appetite gain (it's been 12 days) extreme rage, breat tenderness and lumps.
So i went back again, gave me different meds, which i couldnt take upset stomach along with my regular symptoms then i kept having this pain, and hardness in my lower left abdomen, painful sex, doctor said my cervix and uterus were swollen, cramping, just horrible pain and very annoying.
While doing my research I read quite a few reviews about how much it hurt.
Before getting this form of birth control, I read tons of reviews.
But that was all explained to me before I chose to get it.
Also hearing some woman got pregnant while on Nexplanon is scary.


<br><br><br><br>

---

<br><br><br><br>

# Backup labeling into a CSV

In [133]:
reddit_post_examples = db.get_dataset('discourse-reddit-posts')
reddit_comment_examples = db.get_dataset('discourse-reddit-comments')
twitter_post_examples = db.get_dataset('discourse-twitter-posts')
twitter_replies_examples = db.get_dataset('discourse-twitter-replies')
webmd_reviews_examples = db.get_dataset('discourse-webmd-reviews')

In [134]:
len(reddit_post_examples), len(reddit_comment_examples), len(twitter_post_examples), len(twitter_replies_examples), len(webmd_reviews_examples)

(104, 100, 100, 109, 100)

In [135]:
label_dicts = []
for e in reddit_post_examples + reddit_comment_examples + twitter_post_examples + twitter_replies_examples + webmd_reviews_examples:
    for _label in e['accept']:
        label_dicts.append({'Source': e['meta']['Source'],
                            'ID': e['meta']['ID'],
                            'Label': _label,
                            'Text': e['text']})
    if len(e['accept']) == 0:
        label_dicts.append({'Source': e['meta']['Source'],
                            'ID': e['meta']['ID'],
                            'Label': 'NONE',
                            'Text': e['text']})
label_df = pd.DataFrame(label_dicts)

In [136]:
len(label_df)

611

In [137]:
label_df['Label'].value_counts()

SHARING EXPERIENCES                         180
NONE                                         75
META DISCUSSION                              58
SHARING INFORMATION                          53
SHARING OPINIONS AND PREFERENCES             50
SHARING/DESCRIBING ADDITIONAL RESEARCH       36
SHARING FUTURE PLANS                         28
SEEKING INFORMATION                          25
SHARING CAUSAL REASONING / HYPOTHESIZING     25
SHARING PERSONAL BACKGROUND                  20
SHARING ADVICE                               16
SHARING NEGATIVE EMOTIONS                    16
SEEKING EXPERIENCES                          14
SHARING NORMALITY                             5
SHARING EMOTIONAL SUPPORT                     3
SEEKING ADVICE                                2
SEEKING NORMALITY                             2
SHARING POSITIVE EMOTIONS                     2
SEEKING EMOTIONAL SUPPORT                     1
Name: Label, dtype: int64

In [138]:
label_df['Source'].value_counts()

reddit-posts       132
webmd-reviews      129
twitter-replies    122
twitter-posts      121
reddit-comments    107
Name: Source, dtype: int64

In [139]:
label_df.sample(3)

Unnamed: 0,Source,ID,Label,Text
401,twitter-replies,188078874276528130,META DISCUSSION,as she proceeds to pop a birth control pill an...
270,twitter-posts,230487448063447040,NONE,Now I got a baby from a drunk dyslexic”
350,twitter-posts,492556993887211500,SHARING FUTURE PLANS,Having my contraceptive implant out today :-) ...


In [140]:
for i, r in label_df[label_df['Label'] == 'NONE'].sample(10).iterrows():
    print(' '.join(r['Text'].split()))

Bitch i got dick wanna fuck?
Until about 5 months in.
but it didn't!!!
I want to reach out and offer help, but I’m not sure how to do that.
multi-tasking is 2 much huh?
But more than ... https://t.co/D2719YX5z9
Been like 5-6 months now!
Now to figure out all that IUD stuff T.T
A birth control pill for men..
My oldest is a HS teacher.


In [142]:
label_df.to_csv('/Users/maria/Documents/data/birth-control/labeling/label-discourse/labeled_by_maria.all.csv')

<br><br><br><br>

---

<br><br><br><br>

# Try training a simple model

In [482]:
data_directory_path   = '/Users/maria/Documents/data/birth-control'
test_df = pd.read_csv(data_directory_path + '/labeling/label-discourse/sampled-sentences.test.csv')
len(test_df)

11993

In [483]:
test_df.sample(3)

Unnamed: 0.1,Unnamed: 0,text,meta
3859,3859,The only thing that works that late is the cop...,"{'ID': 'eyx0ilp', 'Source': 'reddit-comments',..."
11745,11745,i have been on this for 7 months and i bleed f...,"{'ID': 'w11392', 'Source': 'webmd-reviews', 'M..."
10011,10011,The entire time was awful.,"{'ID': 'w12188', 'Source': 'webmd-reviews', 'M..."


In [484]:
len(label_df.index)

895

In [465]:
label_df.sample(3)

Unnamed: 0,Source,ID,Label,Text
516,twitter-posts,300336102865248260,narrating personal experiences,getting the contraceptive implant in possibly ...
96,reddit-posts,gjld30,narrating personal experiences,Although sometimes I feel like I am gonna have...
39,reddit-posts,c2gb30,seeking information (advice),Can I take these steri-strips (butterfly stitc...


In [458]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [460]:
def binarize_label(label, target_label):
    if label == target_label:
        return 1
    return 0

In [510]:
for _target_label in label_df['Label'].unique():

    _binarized_df = label_df.copy()
    _binarized_df['Label'] = label_df['Label'].apply(lambda x: binarize_label(x, _target_label))
    _positive_ids = _binarized_df[_binarized_df['Label'] == 1]['ID'].tolist()
    _binarized_df = _binarized_df[~((_binarized_df['ID'].isin(_positive_ids)) & (_binarized_df['Label'] == 0))]

    _binarized_df = _binarized_df.groupby('Label').sample(n=len(_binarized_df[_binarized_df['Label'] == 1]), random_state=1)

    if len(_binarized_df.index) > 50:

        _train_df, _test_df = train_test_split(_binarized_df, test_size=0.33, random_state=42)

        _train_texts = _train_df['Text']
        _train_labels = _train_df['Label']
        _test_texts = _test_df['Text']
        _test_labels = _test_df['Label']

        _vectorizer = TfidfVectorizer()
        _X_train = _vectorizer.fit_transform(_train_texts)
        _X_test = _vectorizer.transform(_test_texts)

        _model = LogisticRegression(C=10).fit(_X_train, _train_labels)
        _predictions = _model.predict(_X_test)

        print(_target_label)
        print(classification_report(_test_labels, _predictions))

negative self-disclosure
              precision    recall  f1-score   support

           0       0.77      0.59      0.67        17
           1       0.59      0.77      0.67        13

    accuracy                           0.67        30
   macro avg       0.68      0.68      0.67        30
weighted avg       0.69      0.67      0.67        30

narrating personal experiences
              precision    recall  f1-score   support

           0       0.82      0.82      0.82        88
           1       0.83      0.83      0.83        94

    accuracy                           0.82       182
   macro avg       0.82      0.82      0.82       182
weighted avg       0.82      0.82      0.82       182

seeking experiences
              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.79      0.85      0.81        13

    accuracy                           0.82        28
   macro avg       0.82      0.82      0.82        2

In [478]:
_binarized_df['Label'].value_counts()

0    16
1    16
Name: Label, dtype: int64

In [502]:
def process_string(text):
    text = text.lower()
    text = re.sub('[0-9]+', 'NUM', text)
    text = re.sub(r'[^\sA-Za-z0-9À-ÖØ-öø-ÿЀ-ӿ/]', ' \1 ', text)
    text = ' '.join(text.split())
    return text

In [505]:
t = process_string('Does this work? Hmmm,how about this???')

'does this work \x01 hmmm \x01 how about this \x01 \x01 \x01'

In [511]:
for _target_label in label_df['Label'].unique():

    _binarized_df = label_df.copy()
    _binarized_df['Label'] = label_df['Label'].apply(lambda x: binarize_label(x, _target_label))
    _positive_ids = _binarized_df[_binarized_df['Label'] == 1]['ID'].tolist()
    _binarized_df = _binarized_df[~((_binarized_df['ID'].isin(_positive_ids)) & (_binarized_df['Label'] == 0))]

    _binarized_df = _binarized_df.groupby('Label').sample(n=len(_binarized_df[_binarized_df['Label'] == 1]), random_state=1)

    if len(_binarized_df.index) > 50:

        _train_texts = _binarized_df['Text']
        _train_labels = _binarized_df['Label']

        _test_texts = test_df['text']

        _train_texts_processed = [process_string(t) for t in _train_texts]
        _test_texts_processed  = [process_string(t) for t in _test_texts]

        _vectorizer = TfidfVectorizer()
        _X_train = _vectorizer.fit_transform(_train_texts_processed)
        _X_test = _vectorizer.transform(_test_texts_processed)

        _model = LogisticRegression(C=10).fit(_X_train, _train_labels)
        _predictions = _model.predict(_X_test)

        print('---------------------------------')
        print(_target_label)
        print('---------------------------------')
        print()

        _positive_texts = [_text for _prediction, _text in zip(_predictions, _test_texts) if _prediction == 1]
        _negative_texts = [_text for _prediction, _text in zip(_predictions, _test_texts) if _prediction == 0]

        print('POSITIVE')
        for _text in random.sample(_positive_texts, 10):
            print(' '.join(_text.split()))
        
        print()


---------------------------------
negative self-disclosure
---------------------------------

POSITIVE
the implanon has given me MAD mood swings fml
No weight gain either, just got my period a few times/year.
So, I started this pill a month and a half and it's been great , I haven't spotted at all or gotten any acne like some have said, but I have been very depressed since starting and it just occurred to me today to look up side effects because I have never felt like this ever
A day late isn’t really late.
THIS THING!!!good luck and give it a try!~
If it gets knocked out it wasnt put in right RT :
I had the Implanon and found it sent my sex drive plummeting whilst making me put on weight it was awful
Just watched a video of how the nexplanon implant is removed and now I wanna cry😫
The chance of an IUD dislodging is actually really low (despite horror stories on reddit).
I was looking on getting a implanon but the side effects scare me a bit lol

---------------------------------
narra