# Instructions

Make sure you are using a Python environment that has Prodigy installed. You'll need a wheel file from Maria.

Instructions: https://prodi.gy/docs/install

<br><br>

# Imports

In [2]:
from collections import defaultdict
import random

import pandas as pd
from prodigy.components.db import connect

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='ticks', font_scale=1.2)

In [3]:
def sort_by_mean(df, by, column, rot=0):
    """Return the mean of ``column`` within each ``by`` group, sorted ascending.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the grouping key(s) and the value column.
    by : label or list of labels
        Passed unchanged to ``DataFrame.groupby``.
    column : label
        Column whose per-group mean is computed.
    rot : int, optional
        Unused. Kept only so existing calls keep working; it was the tick
        rotation for a box plot that this helper no longer draws.

    Returns
    -------
    pandas.Series
        One entry per group (index = group name), smallest mean first.
    """
    # Aggregate on the groupby directly rather than first spreading every
    # group into its own column of a wide frame: that intermediate frame has
    # to align the groups on row labels, which breaks when labels repeat.
    means = df.groupby(by)[column].mean().sort_values()
    # Return an unnamed Series with an unnamed index, as callers received before.
    means.index.names = [None] * means.index.nlevels
    means.name = None
    return means

<br><br>

# Connect to database

In [4]:
# Open a handle to the Prodigy database. connect() is called with no arguments,
# so it uses whatever database Prodigy is configured with
# (presumably via prodigy.json — confirm).
db = connect()

db.datasets # Lists the names of all Prodigy datasets stored in this database

['bc-reddit-posts',
 'bc-reddit-comments',
 'bc-twitter-posts',
 'bc-twitter-replies',
 'discourse-webmd-reviews',
 'discourse-twitter-replies',
 'discourse-reddit-posts',
 'discourse-twitter-posts',
 'discourse-reddit-comments']

In [None]:
# CAUTION: Only do this if you want to delete all your annotations!!!!!!!!!!!
# db.drop_dataset('dataset_name_here')  

<br><br>

# Explore REDDIT comments

In [5]:
# Fetch everything stored in the Reddit-comments dataset and report how many
# examples came back.
_dataset_name = 'discourse-reddit-comments'
examples = db.get_dataset(_dataset_name)

print(len(examples))

0


In [6]:
# Tally the labels attached to each example in `examples`:
#   label_count_dict         label -> number of examples carrying it
#   method_label_count_dict  e['meta']['Method'] -> label -> count
#   label_texts_dict         label -> texts of the examples carrying it
# An example with an empty 'accept' list is filed under 'NONE' in the overall
# tallies only; it does not appear in the per-method breakdown.
label_count_dict = defaultdict(int)
label_texts_dict = defaultdict(list)
method_label_count_dict = defaultdict(lambda: defaultdict(int))

for e in examples:
    _accepted = e['accept']
    if len(_accepted) < 1:
        label_count_dict['NONE'] += 1
        label_texts_dict['NONE'].append(e['text'])
        continue
    for _label in _accepted:
        label_count_dict[_label] += 1
        method_label_count_dict[e['meta']['Method']][_label] += 1
        label_texts_dict[_label].append(e['text'])

_rule = '------------------------------------------------------'
print(_rule)
print('total number of posts labeled')
print(_rule)
print()
# Most frequent label first; ties keep first-seen order (stable sort).
_ranked_counts = sorted(label_count_dict.items(), key=lambda item: item[1], reverse=True)
for _label, _count in _ranked_counts:
    print(_count, '\t', _label)

------------------------------------------------------
total number of posts labeled
------------------------------------------------------



In [7]:
# Per-method breakdown: for every value of e['meta']['Method'] seen while
# tallying, list its labels from most to least frequent.
_rule = '--------------------------------'
for _method, _label_count_dict in method_label_count_dict.items():
    print(_rule, _method, _rule, sep='\n')
    _ranked_counts = sorted(_label_count_dict.items(), key=lambda item: item[1], reverse=True)
    for _label, _count in _ranked_counts:
        print(_count, '\t', _label)
    print()

In [8]:
# Fraction of all examples that carry each label. An example can carry several
# labels, so the fractions are not expected to sum to 1.
_n_examples = float(len(examples))
label_percent_dict = {}
for _label, _count in label_count_dict.items():
    label_percent_dict[_label] = _count / _n_examples

_rule = '------------------------------'
print(_rule)
print('percent of posts with label')
print(_rule)
print()
_ranked_percents = sorted(label_percent_dict.items(), key=lambda item: item[1], reverse=True)
for _label, _percent in _ranked_percents:
    # Shown as a percentage rounded to one decimal place.
    print('{}%'.format(round(_percent*100, 1)), '\t', _label)

------------------------------
percent of posts with label
------------------------------



In [9]:
# Print every text tagged with one label of interest, whitespace-normalised.
# NOTE(review): the recorded run of this notebook loaded 0 examples for this
# dataset, and the label sets printed for the other datasets use upper-case
# names such as 'SHARING EXPERIENCES'. This label looks like it belongs to an
# older scheme — confirm and update it.
_target_label = 'providing personal experiences'

# .get() rather than [] so a miss does not add an empty entry to the defaultdict.
_texts = label_texts_dict.get(_target_label)
if _texts:
    print('------------------------------------------')
    print(_target_label)
    print('------------------------------------------')
    print()
    for e in _texts:
        # Collapse newlines and runs of whitespace so each text prints on one line.
        print(' '.join(e.split()))
else:
    # Previously a non-matching label produced no output at all, which hid typos
    # and stale label names.
    print('No texts found for label {!r}. Available labels: {}'.format(
        _target_label, sorted(label_texts_dict)))

<br><br>

# Explore REDDIT posts

In [10]:
# Fetch everything stored in the Reddit-posts dataset and report how many
# examples came back.
_dataset_name = 'discourse-reddit-posts'
examples = db.get_dataset(_dataset_name)

print(len(examples))

104


In [11]:
# Tally the labels attached to each example in `examples`:
#   label_count_dict         label -> number of examples carrying it
#   method_label_count_dict  e['meta']['Method'] -> label -> count
#   label_texts_dict         label -> texts of the examples carrying it
# An example with an empty 'accept' list is filed under 'NONE' in the overall
# tallies only; it does not appear in the per-method breakdown.
label_count_dict = defaultdict(int)
label_texts_dict = defaultdict(list)
method_label_count_dict = defaultdict(lambda: defaultdict(int))

for e in examples:
    _accepted = e['accept']
    if len(_accepted) < 1:
        label_count_dict['NONE'] += 1
        label_texts_dict['NONE'].append(e['text'])
        continue
    for _label in _accepted:
        label_count_dict[_label] += 1
        method_label_count_dict[e['meta']['Method']][_label] += 1
        label_texts_dict[_label].append(e['text'])

_rule = '------------------------------------------------------'
print(_rule)
print('total number of posts labeled')
print(_rule)
print()
# Most frequent label first; ties keep first-seen order (stable sort).
_ranked_counts = sorted(label_count_dict.items(), key=lambda item: item[1], reverse=True)
for _label, _count in _ranked_counts:
    print(_count, '\t', _label)

------------------------------------------------------
total number of posts labeled
------------------------------------------------------

37 	 SHARING EXPERIENCES
15 	 NONE
14 	 SEEKING INFORMATION
12 	 SHARING CAUSAL REASONING / HYPOTHESIZING
11 	 SHARING FUTURE PLANS
8 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
7 	 SHARING NEGATIVE EMOTIONS
7 	 SHARING OPINIONS AND PREFERENCES
6 	 SHARING PERSONAL BACKGROUND
5 	 SEEKING EXPERIENCES
2 	 SEEKING ADVICE
2 	 SEEKING NORMALITY
1 	 SHARING ADVICE
1 	 SHARING INFORMATION
1 	 META DISCUSSION
1 	 SEEKING EMOTIONAL SUPPORT
1 	 SHARING NORMALITY
1 	 SHARING POSITIVE EMOTIONS


In [12]:
# Per-method breakdown: for every value of e['meta']['Method'] seen while
# tallying, list its labels from most to least frequent.
_rule = '--------------------------------'
for _method, _label_count_dict in method_label_count_dict.items():
    print(_rule, _method, _rule, sep='\n')
    _ranked_counts = sorted(_label_count_dict.items(), key=lambda item: item[1], reverse=True)
    for _label, _count in _ranked_counts:
        print(_count, '\t', _label)
    print()

--------------------------------
pill
--------------------------------
12 	 SHARING EXPERIENCES
6 	 SHARING CAUSAL REASONING / HYPOTHESIZING
4 	 SHARING PERSONAL BACKGROUND
4 	 SEEKING INFORMATION
3 	 SHARING FUTURE PLANS
2 	 SHARING OPINIONS AND PREFERENCES
2 	 SHARING NEGATIVE EMOTIONS
1 	 SHARING ADVICE
1 	 SHARING INFORMATION
1 	 SEEKING EXPERIENCES
1 	 SEEKING EMOTIONAL SUPPORT
1 	 SEEKING NORMALITY
1 	 SHARING NORMALITY

--------------------------------
iud
--------------------------------
12 	 SHARING EXPERIENCES
7 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
5 	 SHARING FUTURE PLANS
4 	 SEEKING EXPERIENCES
4 	 SEEKING INFORMATION
3 	 SHARING CAUSAL REASONING / HYPOTHESIZING
2 	 SHARING PERSONAL BACKGROUND
2 	 SEEKING ADVICE
1 	 SHARING NEGATIVE EMOTIONS
1 	 META DISCUSSION
1 	 SHARING POSITIVE EMOTIONS

--------------------------------
implant
--------------------------------
13 	 SHARING EXPERIENCES
6 	 SEEKING INFORMATION
5 	 SHARING OPINIONS AND PREFERENCES
4 	 SHARING NEGATIVE 

In [13]:
# Fraction of all examples that carry each label. An example can carry several
# labels, so the fractions are not expected to sum to 1.
_n_examples = float(len(examples))
label_percent_dict = {}
for _label, _count in label_count_dict.items():
    label_percent_dict[_label] = _count / _n_examples

_rule = '------------------------------'
print(_rule)
print('percent of posts with label')
print(_rule)
print()
_ranked_percents = sorted(label_percent_dict.items(), key=lambda item: item[1], reverse=True)
for _label, _percent in _ranked_percents:
    # Shown as a percentage rounded to one decimal place.
    print('{}%'.format(round(_percent*100, 1)), '\t', _label)

------------------------------
percent of posts with label
------------------------------

35.6% 	 SHARING EXPERIENCES
14.4% 	 NONE
13.5% 	 SEEKING INFORMATION
11.5% 	 SHARING CAUSAL REASONING / HYPOTHESIZING
10.6% 	 SHARING FUTURE PLANS
7.7% 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
6.7% 	 SHARING NEGATIVE EMOTIONS
6.7% 	 SHARING OPINIONS AND PREFERENCES
5.8% 	 SHARING PERSONAL BACKGROUND
4.8% 	 SEEKING EXPERIENCES
1.9% 	 SEEKING ADVICE
1.9% 	 SEEKING NORMALITY
1.0% 	 SHARING ADVICE
1.0% 	 SHARING INFORMATION
1.0% 	 META DISCUSSION
1.0% 	 SEEKING EMOTIONAL SUPPORT
1.0% 	 SHARING NORMALITY
1.0% 	 SHARING POSITIVE EMOTIONS


In [14]:
# Print every text tagged with one label of interest, whitespace-normalised.
# NOTE(review): the label counts recorded for this dataset use upper-case names
# such as 'SHARING ADVICE'; the label below is not among them, so the recorded
# run printed nothing. It looks like it belongs to an older scheme — confirm
# and update it.
_target_label = 'providing information (advice)'

# .get() rather than [] so a miss does not add an empty entry to the defaultdict.
_texts = label_texts_dict.get(_target_label)
if _texts:
    print('------------------------------------------')
    print(_target_label)
    print('------------------------------------------')
    print()
    for e in _texts:
        # Collapse newlines and runs of whitespace so each text prints on one line.
        print(' '.join(e.split()))
else:
    # Previously a non-matching label produced no output at all, which hid typos
    # and stale label names.
    print('No texts found for label {!r}. Available labels: {}'.format(
        _target_label, sorted(label_texts_dict)))

<br><br>

# Explore TWITTER posts

In [15]:
# Fetch everything stored in the Twitter-posts dataset and report how many
# examples came back.
_dataset_name = 'discourse-twitter-posts'
examples = db.get_dataset(_dataset_name)

print(len(examples))

100


In [16]:
# Tally the labels attached to each example in `examples`:
#   label_count_dict         label -> number of examples carrying it
#   method_label_count_dict  e['meta']['Method'] -> label -> count
#   label_texts_dict         label -> texts of the examples carrying it
# An example with an empty 'accept' list is filed under 'NONE' in the overall
# tallies only; it does not appear in the per-method breakdown.
label_count_dict = defaultdict(int)
label_texts_dict = defaultdict(list)
method_label_count_dict = defaultdict(lambda: defaultdict(int))

for e in examples:
    _accepted = e['accept']
    if len(_accepted) < 1:
        label_count_dict['NONE'] += 1
        label_texts_dict['NONE'].append(e['text'])
        continue
    for _label in _accepted:
        label_count_dict[_label] += 1
        method_label_count_dict[e['meta']['Method']][_label] += 1
        label_texts_dict[_label].append(e['text'])

_rule = '------------------------------------------------------'
print(_rule)
print('total number of posts labeled')
print(_rule)
print()
# Most frequent label first; ties keep first-seen order (stable sort).
_ranked_counts = sorted(label_count_dict.items(), key=lambda item: item[1], reverse=True)
for _label, _count in _ranked_counts:
    print(_count, '\t', _label)

------------------------------------------------------
total number of posts labeled
------------------------------------------------------

24 	 META DISCUSSION
21 	 NONE
15 	 SHARING EXPERIENCES
15 	 SHARING INFORMATION
14 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
7 	 SEEKING INFORMATION
5 	 SHARING CAUSAL REASONING / HYPOTHESIZING
5 	 SHARING FUTURE PLANS
5 	 SHARING OPINIONS AND PREFERENCES
4 	 SHARING NEGATIVE EMOTIONS
2 	 SHARING ADVICE
2 	 SHARING PERSONAL BACKGROUND
2 	 SEEKING EXPERIENCES


In [17]:
# Per-method breakdown: for every value of e['meta']['Method'] seen while
# tallying, list its labels from most to least frequent.
_rule = '--------------------------------'
for _method, _label_count_dict in method_label_count_dict.items():
    print(_rule, _method, _rule, sep='\n')
    _ranked_counts = sorted(_label_count_dict.items(), key=lambda item: item[1], reverse=True)
    for _label, _count in _ranked_counts:
        print(_count, '\t', _label)
    print()

--------------------------------
implant
--------------------------------
9 	 SHARING EXPERIENCES
6 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
5 	 SHARING INFORMATION
5 	 META DISCUSSION
4 	 SHARING CAUSAL REASONING / HYPOTHESIZING
4 	 SHARING FUTURE PLANS
3 	 SHARING NEGATIVE EMOTIONS
3 	 SEEKING INFORMATION
2 	 SHARING ADVICE
1 	 SHARING OPINIONS AND PREFERENCES
1 	 SEEKING EXPERIENCES

--------------------------------
iud
--------------------------------
10 	 META DISCUSSION
5 	 SHARING EXPERIENCES
4 	 SHARING OPINIONS AND PREFERENCES
3 	 SHARING INFORMATION
2 	 SHARING PERSONAL BACKGROUND
2 	 SEEKING INFORMATION
1 	 SHARING FUTURE PLANS
1 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
1 	 SEEKING EXPERIENCES
1 	 SHARING CAUSAL REASONING / HYPOTHESIZING
1 	 SHARING NEGATIVE EMOTIONS

--------------------------------
pill
--------------------------------
9 	 META DISCUSSION
7 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
7 	 SHARING INFORMATION
2 	 SEEKING INFORMATION
1 	 SHARING EXPERIENCES



In [18]:
# Fraction of all examples that carry each label. An example can carry several
# labels, so the fractions are not expected to sum to 1.
_n_examples = float(len(examples))
label_percent_dict = {}
for _label, _count in label_count_dict.items():
    label_percent_dict[_label] = _count / _n_examples

_rule = '------------------------------'
print(_rule)
print('percent of posts with label')
print(_rule)
print()
_ranked_percents = sorted(label_percent_dict.items(), key=lambda item: item[1], reverse=True)
for _label, _percent in _ranked_percents:
    # Shown as a percentage rounded to one decimal place.
    print('{}%'.format(round(_percent*100, 1)), '\t', _label)

------------------------------
percent of posts with label
------------------------------

24.0% 	 META DISCUSSION
21.0% 	 NONE
15.0% 	 SHARING EXPERIENCES
15.0% 	 SHARING INFORMATION
14.0% 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
7.0% 	 SEEKING INFORMATION
5.0% 	 SHARING CAUSAL REASONING / HYPOTHESIZING
5.0% 	 SHARING FUTURE PLANS
5.0% 	 SHARING OPINIONS AND PREFERENCES
4.0% 	 SHARING NEGATIVE EMOTIONS
2.0% 	 SHARING ADVICE
2.0% 	 SHARING PERSONAL BACKGROUND
2.0% 	 SEEKING EXPERIENCES


In [19]:
# Print every text tagged with one label of interest, whitespace-normalised.
# NOTE(review): the label counts recorded for this dataset use upper-case names
# such as 'SHARING EXPERIENCES'; the label below is not among them, so the
# recorded run printed nothing. It looks like it belongs to an older scheme —
# confirm and update it.
_target_label = 'providing other experiences'

# .get() rather than [] so a miss does not add an empty entry to the defaultdict.
_texts = label_texts_dict.get(_target_label)
if _texts:
    print('------------------------------------------')
    print(_target_label)
    print('------------------------------------------')
    print()
    for e in _texts:
        # Collapse newlines and runs of whitespace so each text prints on one line.
        print(' '.join(e.split()))
else:
    # Previously a non-matching label produced no output at all, which hid typos
    # and stale label names.
    print('No texts found for label {!r}. Available labels: {}'.format(
        _target_label, sorted(label_texts_dict)))

<br><br>

# Explore TWITTER replies

In [20]:
# Fetch everything stored in the Twitter-replies dataset and report how many
# examples came back.
_dataset_name = 'discourse-twitter-replies'
examples = db.get_dataset(_dataset_name)

print(len(examples))

109


In [21]:
# Tally the labels attached to each example in `examples`:
#   label_count_dict         label -> number of examples carrying it
#   method_label_count_dict  e['meta']['Method'] -> label -> count
#   label_texts_dict         label -> texts of the examples carrying it
# An example with an empty 'accept' list is filed under 'NONE' in the overall
# tallies only; it does not appear in the per-method breakdown.
label_count_dict = defaultdict(int)
label_texts_dict = defaultdict(list)
method_label_count_dict = defaultdict(lambda: defaultdict(int))

for e in examples:
    _accepted = e['accept']
    if len(_accepted) < 1:
        label_count_dict['NONE'] += 1
        label_texts_dict['NONE'].append(e['text'])
        continue
    for _label in _accepted:
        label_count_dict[_label] += 1
        method_label_count_dict[e['meta']['Method']][_label] += 1
        label_texts_dict[_label].append(e['text'])

_rule = '------------------------------------------------------'
print(_rule)
print('total number of posts labeled')
print(_rule)
print()
# Most frequent label first; ties keep first-seen order (stable sort).
_ranked_counts = sorted(label_count_dict.items(), key=lambda item: item[1], reverse=True)
for _label, _count in _ranked_counts:
    print(_count, '\t', _label)

------------------------------------------------------
total number of posts labeled
------------------------------------------------------

30 	 SHARING EXPERIENCES
26 	 META DISCUSSION
24 	 NONE
15 	 SHARING INFORMATION
8 	 SHARING OPINIONS AND PREFERENCES
4 	 SHARING PERSONAL BACKGROUND
3 	 SHARING FUTURE PLANS
3 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
3 	 SHARING ADVICE
2 	 SEEKING INFORMATION
2 	 SEEKING EXPERIENCES
1 	 SHARING CAUSAL REASONING / HYPOTHESIZING
1 	 SHARING NEGATIVE EMOTIONS


In [22]:
# Per-method breakdown: for every value of e['meta']['Method'] seen while
# tallying, list its labels from most to least frequent.
_rule = '--------------------------------'
for _method, _label_count_dict in method_label_count_dict.items():
    print(_rule, _method, _rule, sep='\n')
    _ranked_counts = sorted(_label_count_dict.items(), key=lambda item: item[1], reverse=True)
    for _label, _count in _ranked_counts:
        print(_count, '\t', _label)
    print()

--------------------------------
pill
--------------------------------
18 	 META DISCUSSION
7 	 SHARING EXPERIENCES
2 	 SHARING INFORMATION
1 	 SHARING PERSONAL BACKGROUND
1 	 SHARING CAUSAL REASONING / HYPOTHESIZING
1 	 SHARING OPINIONS AND PREFERENCES
1 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
1 	 SEEKING INFORMATION
1 	 SEEKING EXPERIENCES

--------------------------------
iud
--------------------------------
8 	 META DISCUSSION
6 	 SHARING INFORMATION
6 	 SHARING EXPERIENCES
4 	 SHARING OPINIONS AND PREFERENCES
1 	 SHARING FUTURE PLANS
1 	 SEEKING EXPERIENCES
1 	 SHARING ADVICE
1 	 SHARING NEGATIVE EMOTIONS
1 	 SHARING PERSONAL BACKGROUND

--------------------------------
implant
--------------------------------
17 	 SHARING EXPERIENCES
7 	 SHARING INFORMATION
3 	 SHARING OPINIONS AND PREFERENCES
2 	 SHARING FUTURE PLANS
2 	 SHARING PERSONAL BACKGROUND
2 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
2 	 SHARING ADVICE
1 	 SEEKING INFORMATION



In [23]:
# Fraction of all examples that carry each label. An example can carry several
# labels, so the fractions are not expected to sum to 1.
_n_examples = float(len(examples))
label_percent_dict = {}
for _label, _count in label_count_dict.items():
    label_percent_dict[_label] = _count / _n_examples

_rule = '------------------------------'
print(_rule)
print('percent of posts with label')
print(_rule)
print()
_ranked_percents = sorted(label_percent_dict.items(), key=lambda item: item[1], reverse=True)
for _label, _percent in _ranked_percents:
    # Shown as a percentage rounded to one decimal place.
    print('{}%'.format(round(_percent*100, 1)), '\t', _label)

------------------------------
percent of posts with label
------------------------------

27.5% 	 SHARING EXPERIENCES
23.9% 	 META DISCUSSION
22.0% 	 NONE
13.8% 	 SHARING INFORMATION
7.3% 	 SHARING OPINIONS AND PREFERENCES
3.7% 	 SHARING PERSONAL BACKGROUND
2.8% 	 SHARING FUTURE PLANS
2.8% 	 SHARING/DESCRIBING ADDITIONAL RESEARCH
2.8% 	 SHARING ADVICE
1.8% 	 SEEKING INFORMATION
1.8% 	 SEEKING EXPERIENCES
0.9% 	 SHARING CAUSAL REASONING / HYPOTHESIZING
0.9% 	 SHARING NEGATIVE EMOTIONS


In [24]:
# Print every text tagged with one label of interest, whitespace-normalised.
# NOTE(review): the label counts recorded for this dataset use upper-case names
# such as 'META DISCUSSION'; the label below is not among them, so the recorded
# run printed nothing. It looks like it belongs to an older scheme — confirm
# and update it.
_target_label = 'rant'

# .get() rather than [] so a miss does not add an empty entry to the defaultdict.
_texts = label_texts_dict.get(_target_label)
if _texts:
    print('------------------------------------------')
    print(_target_label)
    print('------------------------------------------')
    print()
    for e in _texts:
        # Collapse newlines and runs of whitespace so each text prints on one line.
        print(' '.join(e.split()))
else:
    # Previously a non-matching label produced no output at all, which hid typos
    # and stale label names.
    print('No texts found for label {!r}. Available labels: {}'.format(
        _target_label, sorted(label_texts_dict)))

<br><br>

# Back up labels to a CSV

In [25]:
# Reload the four discourse datasets under their own names so they can be
# combined for the CSV backup (the exploration cells all reuse `examples`).
(reddit_post_examples,
 reddit_comment_examples,
 twitter_post_examples,
 twitter_replies_examples) = [
    db.get_dataset(_name)
    for _name in ('discourse-reddit-posts',
                  'discourse-reddit-comments',
                  'discourse-twitter-posts',
                  'discourse-twitter-replies')
]

In [26]:
# Example counts in the order: reddit posts, reddit comments, twitter posts, twitter replies.
tuple(map(len, (reddit_post_examples,
                reddit_comment_examples,
                twitter_post_examples,
                twitter_replies_examples)))

(104, 0, 100, 109)

In [27]:
# Flatten the annotations to one row per (example, label) pair.
# NOTE(review): examples with an empty 'accept' list contribute no rows, so the
# 'NONE' examples counted in the exploration cells are absent from this table —
# confirm that is intended for a backup.
_all_examples = (reddit_post_examples
                 + reddit_comment_examples
                 + twitter_post_examples
                 + twitter_replies_examples)
label_dicts = [
    {'Source': e['meta']['Source'],
     'ID': e['meta']['ID'],
     'Label': _label,
     'Text': e['text']}
    for e in _all_examples
    for _label in e['accept']
]
label_df = pd.DataFrame(label_dicts)

In [28]:
# Number of (example, label) rows in the table.
label_df.shape[0]

315

In [29]:
# Spot-check a few rows. The seed is fixed so that Restart & Run All shows the
# same rows every time, and the sample size is capped so the cell still runs
# when the table has fewer than three rows.
label_df.sample(min(3, len(label_df)), random_state=0)

Unnamed: 0,Source,ID,Label,Text
82,reddit-posts,j3ajl2,SHARING/DESCRIBING ADDITIONAL RESEARCH,She thinks it might be GI related...
121,twitter-posts,2972858716258304,SHARING EXPERIENCES,I could feel it under her arm skin =/
81,reddit-posts,6gzy7v,SEEKING INFORMATION,"Like how long to go without sex after, if taki..."


In [None]:
# Write the flattened labels to disk.
# NOTE(review): this is an absolute path on one machine's external volume; the
# cell fails anywhere that volume is not mounted. Consider a configurable
# output directory.
_backup_csv_path = '/Volumes/Passport-1/data/birth-control/labeling/label-sentences/labeled_by_leann.all.csv'
label_df.to_csv(_backup_csv_path)