In [1]:
from collections import defaultdict
import random

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [2]:
data_directory_path   = '/Volumes/Passport-1/data/birth-control'
output_directory_path = '/Volumes/Passport-1/output/birth-control'

In [3]:
maria_path         = data_directory_path + '/labeling/labeled_by_maria.all.csv'
leann_reddit_path  = data_directory_path + '/labeling/labeled_by_leann.reddit-all.csv'
leann_twitter_path = data_directory_path + '/labeling/labeled_by_leann.twitter-all.csv'
leann_extra_path   = data_directory_path + '/labeling/labeled_by_leann.reddit_extra_examples.csv'
roz_path           = data_directory_path + '/labeling/labeled_by_roz.all.csv'

<br><br>

# Load labeled data

## MARIA

In [4]:
maria_df = pd.read_csv(maria_path)

In [5]:
len(maria_df.index)

2570

In [6]:
maria_df['Labeler'] = 'Maria'

In [7]:
def rename_source(text):
    if text == 'reddit':
        return 'reddit-posts'
    return text

maria_df['Source'] = maria_df['Source'].apply(rename_source)

In [8]:
maria_df['Source'].value_counts()

reddit-comments    711
reddit-posts       696
twitter-replies    594
twitter-posts      569
Name: Source, dtype: int64

In [9]:
maria_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
812,812,reddit-comments,dnnjkyj,PROVIDING EXPERIENCES,I just literally got Mirena today and my nexpl...,Maria
1069,1069,reddit-comments,erp3kie,PROVIDING EXPERIENCES,I am on both. My clinic doc said they would wo...,Maria
903,903,reddit-comments,euzcfix,PROVIDING INFORMATIONAL SUPPORT,That sounds awful! I know that copper IUDs are...,Maria


## LEANN

In [10]:
leann_df = pd.concat([pd.read_csv(leann_reddit_path), pd.read_csv(leann_twitter_path), pd.read_csv(leann_extra_path)])

In [11]:
len(leann_df.index)

2459

In [12]:
leann_df['Labeler'] = 'LeAnn'

In [13]:
leann_df['Source'] = leann_df['Source'].apply(rename_source)

In [14]:
leann_df['Source'].value_counts()

reddit-posts       666
reddit-comments    621
twitter-replies    621
twitter-posts      551
Name: Source, dtype: int64

In [15]:
leann_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
1006,1006,reddit-comments,dtu9z3p,PROVIDING EXPERIENCES,Damn beat me to it\n\nSo I can only vouch a-sp...,LeAnn
719,719,reddit-comments,ffthgin,PROVIDING EXPERIENCES,I supposed it’s possible but I’d think unlikel...,LeAnn
589,589,twitter-replies,957870938338611200,DISCOURSE,@FireCap203 @SlythSeeker2017 @DaddyLou13 @John...,LeAnn


## ROZ

In [16]:
roz_df = pd.read_csv(roz_path)

In [17]:
len(roz_df.index)

1617

In [18]:
roz_df['Labeler'] = 'Roz'

In [19]:
roz_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
462,462,reddit-posts,4se0en,SEEKING INFORMATIONAL SUPPORT,[TITLE: skyla iud moderate cramping and bleedi...,Roz
1073,1073,twitter-posts,804534043438174200,PROVIDING INFORMATIONAL SUPPORT,#IUD appointments up 82% after Trump's win. Re...,Roz
190,190,reddit-posts,60jyh8,SEEKING INFORMATIONAL SUPPORT,[TITLE: implantation bleeding??] \n\nOkay so l...,Roz


In [20]:
roz_df['Source'].value_counts()

reddit-posts       487
reddit-comments    447
twitter-replies    344
twitter-posts      339
Name: Source, dtype: int64

## COMBINED

In [21]:
combined_df = pd.concat([maria_df, leann_df, roz_df])
len(combined_df.index)

6646

In [22]:
combined_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
1776,1776,twitter-posts,224376666414260220,DISCOURSE,Urgh....Sitting in this airport has reminded m...,Maria
423,423,twitter-posts,3686275807,DISCOURSE,Thinking about getting the Mirena IUD or alrea...,LeAnn
1502,1502,twitter-posts,319207933743333400,PROVIDING EXPERIENCES,Implanon is the best birth control ever!!! Not...,Maria


In [23]:
combined_df['Label'].value_counts(normalize=True)

PROVIDING EXPERIENCES              0.252784
PROVIDING INFORMATIONAL SUPPORT    0.225399
DISCOURSE                          0.177550
SEEKING INFORMATIONAL SUPPORT      0.157087
SEEKING EXPERIENCES                0.089377
SEEKING EMOTIONAL SUPPORT          0.064851
PROVIDING EMOTIONAL SUPPORT        0.032952
Name: Label, dtype: float64

In [24]:
combined_df['ID'] = combined_df['ID'].astype(str)

In [26]:
combined_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
288,288,reddit-posts,hq2x0o,SEEKING EXPERIENCES,"[TITLE: hello, i am looking for advice with th...",Maria
577,577,reddit-posts,4qaku3,SEEKING INFORMATIONAL SUPPORT,[TITLE: brown spotting with copper iud - is it...,Maria
486,486,reddit-comments,exeejsv,PROVIDING EMOTIONAL SUPPORT,You should take it easy for the first week at ...,Roz


In [27]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
id_data_dict = {}
for i, r in combined_df.iterrows():
    id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
    id_data_dict[r['ID']] = r

dicts_for_comparison = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _dict = {'ID': _id,
             'Source': id_data_dict[_id]['Source'],
             'Text': id_data_dict[_id]['Text']}
    for _labeler, _labels in _labeler_labels_dict.items():
        _dict[_labeler] = _labels
    dicts_for_comparison.append(_dict)
df_for_comparison = pd.DataFrame(dicts_for_comparison)

df_for_comparison.sample(5)

Unnamed: 0,ID,Source,Text,Maria,LeAnn,Roz
1092,403249844200177660,twitter-posts,I'm so nervous to get this nexplanon put it 😰,[SEEKING EMOTIONAL SUPPORT],"[DISCOURSE, SEEKING EMOTIONAL SUPPORT]",[SEEKING EMOTIONAL SUPPORT]
1479,329387870299246600,twitter-posts,VOA Literary News: FDA Makes Contraceptive Pil...,[PROVIDING INFORMATIONAL SUPPORT],[PROVIDING INFORMATIONAL SUPPORT],
658,e3t0jvx,reddit-comments,If the IUD isn't sitting in the fundus(near th...,[PROVIDING INFORMATIONAL SUPPORT],[PROVIDING INFORMATIONAL SUPPORT],[PROVIDING INFORMATIONAL SUPPORT]
111,a7w4d5,reddit-posts,[TITLE: nexplanon &amp; mental health] \n\nI'v...,[PROVIDING EXPERIENCES],[PROVIDING EXPERIENCES],"[SEEKING EMOTIONAL SUPPORT, PROVIDING EXPERIEN..."
1622,59009287917281280,twitter-replies,"@bellawrites Yes, Mirena is an IUD which also ...",[PROVIDING INFORMATIONAL SUPPORT],[PROVIDING INFORMATIONAL SUPPORT],[PROVIDING INFORMATIONAL SUPPORT]


In [28]:
df_for_comparison.to_csv(data_directory_path + '/labeling/labels_for_comparison.csv')

<br><br>

# Compare labels for REDDIT POSTS

In [271]:
target_df = combined_df[combined_df['Source'] == 'reddit-posts']

In [272]:
id_text_dict = {}
for i, r in target_df.iterrows():
    id_text_dict[r['ID']] = r['Text']

In [273]:
len(id_text_dict)

502

In [274]:
ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))
len(ids_labeled_by_roz)

301

In [275]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
len(id_labeler_labels_dict)

301

In [276]:
disagreements = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _maria_labels = _labeler_labels_dict['Maria']
    _leann_labels = _labeler_labels_dict['LeAnn']
    if sorted(_maria_labels) != sorted(_leann_labels):
        disagreements.append(_id)
len(disagreements)

105

In [277]:
disagreements = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _maria_labels = _labeler_labels_dict['Maria']
    _roz_labels = _labeler_labels_dict['Roz']
    if sorted(_maria_labels) != sorted(_roz_labels):
        disagreements.append(_id)
len(disagreements)

118

In [278]:
for _id in disagreements:
    # print(id_text_dict[_id])
    print(_id)
    print('Maria:', sorted(id_labeler_labels_dict[_id]['Maria']))
    print('Roz:', sorted(id_labeler_labels_dict[_id]['Roz']))
    print()

ch4aux
Maria: ['SEEKING EMOTIONAL SUPPORT', 'SEEKING INFORMATIONAL SUPPORT']
Roz: ['SEEKING INFORMATIONAL SUPPORT']

b9f1dj
Maria: ['SEEKING INFORMATIONAL SUPPORT']
Roz: ['SEEKING EXPERIENCES', 'SEEKING INFORMATIONAL SUPPORT']

8d8u3h
Maria: ['SEEKING INFORMATIONAL SUPPORT']
Roz: ['SEEKING EMOTIONAL SUPPORT', 'SEEKING INFORMATIONAL SUPPORT']

fhewxq
Maria: ['SEEKING EXPERIENCES', 'SEEKING INFORMATIONAL SUPPORT']
Roz: ['SEEKING EMOTIONAL SUPPORT', 'SEEKING EXPERIENCES', 'SEEKING INFORMATIONAL SUPPORT']

6k0fpw
Maria: ['SEEKING INFORMATIONAL SUPPORT']
Roz: ['PROVIDING EXPERIENCES', 'SEEKING INFORMATIONAL SUPPORT']

icyygr
Maria: ['SEEKING EMOTIONAL SUPPORT', 'SEEKING INFORMATIONAL SUPPORT']
Roz: ['SEEKING INFORMATIONAL SUPPORT']

d4auko
Maria: ['SEEKING EMOTIONAL SUPPORT', 'SEEKING EXPERIENCES']
Roz: ['SEEKING EMOTIONAL SUPPORT', 'SEEKING EXPERIENCES', 'SEEKING INFORMATIONAL SUPPORT']

hrp6tb
Maria: ['SEEKING EMOTIONAL SUPPORT', 'SEEKING INFORMATIONAL SUPPORT']
Roz: ['SEEKING EMOTIONAL S

In [279]:
id_text_dict['96ldyj']

'[TITLE: i\'ve been keeping nuvaring in for 4 weeks at a time to prevent "period"/withdrawal bleeding - am i reducing effectiveness?] \n\nHey guys, I know I should speak to my gyno at my next appointment about this, but I was wondering if anyone here would have any info in the meantime. \n\nThe past few months I have not been taking my Nuvaring out after 3 weeks, but instead leaving it in for 4 and then immediately putting in a new one  to prevent withdrawal bleeding. It has been awesome lol.\n\nMy one concern is if I\'m reducing its effectiveness. Intuitively it seems like it would not effect it at all (similar to taking back-to-back oral contraceptive), but I was wondering if anyone here has a definitive answer on this. \n\nI\'ve lurked here a while - this really is a great resource. TAI for any info!'

In [280]:
complete_disagreements = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _maria_labels = _labeler_labels_dict['Maria']
    _roz_labels = _labeler_labels_dict['Roz']
    if not any(_label in _roz_labels for _label in _maria_labels):
        complete_disagreements.append(_id)
len(complete_disagreements)

3

In [281]:
for _id in complete_disagreements:
    # print(id_text_dict[_id])
    print(_id)
    print('Maria:', sorted(id_labeler_labels_dict[_id]['Maria']))
    print('Roz:', sorted(id_labeler_labels_dict[_id]['Roz']))
    print()

bjlpuv
Maria: ['SEEKING INFORMATIONAL SUPPORT']
Roz: ['SEEKING EMOTIONAL SUPPORT', 'SEEKING EXPERIENCES']

eetc19
Maria: ['SEEKING EXPERIENCES']
Roz: ['SEEKING INFORMATIONAL SUPPORT']

amtxdt
Maria: ['SEEKING INFORMATIONAL SUPPORT']
Roz: ['SEEKING EXPERIENCES']



In [282]:
id_majority_labels_dict = {}
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _label_count_dict = defaultdict(int)
    for _labeler, _labels in _labeler_labels_dict.items():
        for _label in _labels:
            _label_count_dict[_label] += 1
    _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
    if _majority_labels:
        id_majority_labels_dict[_id] = _majority_labels
len(id_majority_labels_dict)

301

In [283]:
for _id, _labels in id_majority_labels_dict.items():
    print(_id)
    print(_labels)
    print(id_text_dict[_id])
    print('=============================================')

1jsiij
['SEEKING INFORMATIONAL SUPPORT', 'SEEKING EXPERIENCES']
[TITLE: starting/getting birth control online with online health center?] 

New to all things birth control. Well, except condoms. Which my bf and I have been using up until this point. However, we both would like to ditch them or at least go without them once in awhile so I'm looking into birth control and ways to obtain it.  My preferred choice would be those ones implanted in the arm that I've read about, but due to expenses and being a poor college student at the moment... the pill seems as though the better solution to go for now. 

So being that I'll be at college without a mode of transportation and just overall being a socially awkward person that doesn't like to go to new places and talk to strangers all that often... I was quite intrigued in finding out Planned Parenthood has an Online Health Center and I was wondering if anyone here uses this or can give me any and all information about it? 

-If one has never g

In [284]:
final_dicts = []
for _id, _labels in id_majority_labels_dict.items():
    for _label in _labels:
        final_dicts.append({'ID': _id,
                            'Label': _label,
                            'Text': id_text_dict[_id],
                            'Source': 'reddit-posts',
                            'Decision': 'two or more'})
final_df = pd.DataFrame(final_dicts)
final_df.to_csv(data_directory_path + '/labeling/majority.reddit_posts.csv')

In [285]:
final_df.sample(3)

Unnamed: 0,ID,Label,Text,Source,Decision
274,7ss2cp,SEEKING EXPERIENCES,[TITLE: those who have switched from the patch...,reddit-posts,two or more
161,a7w4d5,PROVIDING EXPERIENCES,[TITLE: nexplanon &amp; mental health] \n\nI'v...,reddit-posts,two or more
238,3gt02l,SEEKING INFORMATIONAL SUPPORT,[TITLE: stopping period day of start by taking...,reddit-posts,two or more


In [286]:
maria_labels = []
leann_labels = []
for _id, _maria_labels in maria_id_labels_dict.items():
    _maria_labels = ['_'.join(_label.split()) for _label in _maria_labels]
    _leann_labels = ['_'.join(_label.split()) for _label in leann_id_labels_dict[_id]]
    maria_labels.append(' '.join(_maria_labels))
    leann_labels.append(' '.join(_leann_labels))

vectorizer = CountVectorizer().fit(maria_labels)
maria_Y = vectorizer.transform(maria_labels)
leann_Y = vectorizer.transform(leann_labels)
label_names = vectorizer.get_feature_names()

print(maria_Y.shape, leann_Y.shape)

print(classification_report(maria_Y, leann_Y, target_names=label_names))

(500, 7) (500, 7)
                                 precision    recall  f1-score   support

                      discourse       0.20      0.25      0.22         4
    providing_emotional_support       1.00      0.25      0.40         4
          providing_experiences       0.83      0.94      0.88        32
providing_informational_support       1.00      0.40      0.57         5
      seeking_emotional_support       0.84      0.48      0.61       123
            seeking_experiences       0.74      0.89      0.81       170
  seeking_informational_support       0.93      0.90      0.92       355

                      micro avg       0.85      0.82      0.83       693
                      macro avg       0.79      0.59      0.63       693
                   weighted avg       0.86      0.82      0.83       693
                    samples avg       0.89      0.86      0.85       693



In [287]:
maria_labels = []
roz_labels = []
for _id, _roz_labels in roz_id_labels_dict.items():
    _maria_labels = ['_'.join(_label.split()) for _label in maria_id_labels_dict[_id]]
    _roz_labels = ['_'.join(_label.split()) for _label in _roz_labels]
    maria_labels.append(' '.join(_maria_labels))
    roz_labels.append(' '.join(_roz_labels))

vectorizer = CountVectorizer().fit(maria_labels)
maria_Y = vectorizer.transform(maria_labels)
roz_Y = vectorizer.transform(roz_labels)
label_names = vectorizer.get_feature_names()

print(maria_Y.shape, roz_Y.shape)

print(classification_report(maria_Y, roz_Y, target_names=label_names))

(300, 7) (300, 7)
                                 precision    recall  f1-score   support

                      discourse       0.25      0.25      0.25         4
    providing_emotional_support       0.40      0.67      0.50         3
          providing_experiences       0.79      0.96      0.87        27
providing_informational_support       0.80      0.80      0.80         5
      seeking_emotional_support       0.59      0.82      0.69        77
            seeking_experiences       0.72      0.95      0.82        96
  seeking_informational_support       0.94      0.95      0.94       203

                      micro avg       0.78      0.92      0.84       415
                      macro avg       0.64      0.77      0.70       415
                   weighted avg       0.80      0.92      0.85       415
                    samples avg       0.84      0.94      0.86       415



<br><br>

<br><br>

# Get majority labels for REDDIT COMMENTS

In [288]:
target_df = combined_df[combined_df['Source'] == 'reddit-comments']

In [289]:
id_text_dict = {}
for i, r in target_df.iterrows():
    id_text_dict[r['ID']] = r['Text']
len(id_text_dict)

553

In [290]:
ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))
len(ids_labeled_by_roz)

345

In [291]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
len(id_labeler_labels_dict)

345

In [292]:
id_majority_labels_dict = {}
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _label_count_dict = defaultdict(int)
    for _labeler, _labels in _labeler_labels_dict.items():
        for _label in _labels:
            _label_count_dict[_label] += 1
    _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
    if _majority_labels:
        id_majority_labels_dict[_id] = _majority_labels
len(id_majority_labels_dict)

342

In [293]:
for _id, _labels in id_majority_labels_dict.items():
    print(_id)
    print(_labels)
    print(id_text_dict[_id])
    print('=============================================')

exeejsv
['PROVIDING INFORMATIONAL SUPPORT', 'PROVIDING EXPERIENCES', 'PROVIDING EMOTIONAL SUPPORT']
You should take it easy for the first week at least, you'll have bruising and it'll be tender. 

Just go with your body! I've had mine for almost 3 months and I don't notice it at all, the insertion mark is a tiny dot on my arm. Unless you were actively looking/feeling for it - you'd never know!
ew0eara
['PROVIDING EXPERIENCES']
It didn't feel good, but I'd do it again. It was a 9 on the pain scale, but it only lasted 5 seconds. When the sound was in, it was like a light cramp that got more intense. Nothing I hadn't felt before from normal menstrual cramps. Then during the insertion of the IUD, the pain got worse. It was at it's height for 5 seconds, and I put my jacket over my mouth because I was going "owwwwWWWW." Then my doc said "you're all done!" and helped me get up, it was immediate relief of pain. The recovery was pretty easy and well worth it. It was very fast, I was probably on

In [294]:
final_dicts = []
for _id, _labels in id_majority_labels_dict.items():
    for _label in _labels:
        final_dicts.append({'ID': _id,
                            'Label': _label,
                            'Text': id_text_dict[_id],
                            'Source': 'reddit-comments',
                            'Decision': 'two or more'})
final_df = pd.DataFrame(final_dicts)
final_df.to_csv(data_directory_path + '/labeling/majority.reddit_comments.csv')

<br><br>

# Get majority labels for TWITTER POSTS

In [295]:
target_df = combined_df[combined_df['Source'] == 'twitter-posts']

In [296]:
id_text_dict = {}
for i, r in target_df.iterrows():
    id_text_dict[r['ID']] = r['Text']
len(id_text_dict)

497

In [297]:
ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))
len(ids_labeled_by_roz)

292

In [298]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
len(id_labeler_labels_dict)

292

In [299]:
id_majority_labels_dict = {}
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _label_count_dict = defaultdict(int)
    for _labeler, _labels in _labeler_labels_dict.items():
        for _label in _labels:
            _label_count_dict[_label] += 1
    _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
    if _majority_labels:
        id_majority_labels_dict[_id] = _majority_labels
len(id_majority_labels_dict)

285

In [300]:
for _id, _labels in id_majority_labels_dict.items():
    print(_id)
    print(_labels)
    print(id_text_dict[_id])
    print('=============================================')

1266030580036259800
['SEEKING EMOTIONAL SUPPORT']
I want the IUD so bad but I’m scared 🥺🙄
1215895024107507700
['DISCOURSE']
i cannot believe there’s a non-hormonal oral contraceptive that’s been on the market for thirty YEARS but it’s only available in india. this world is a sick joke
1208819982840344600
['SEEKING EXPERIENCES']
Hey ladies, Do any of y’all have an IUD and use a menstrual cup?
155399192255283200
['DISCOURSE']
My cousin's wife just told me about a friend who was injured when an "IUD" exploded in Afghanistan.
392974329991790600
['DISCOURSE']
I said I have the birth control implant in my arm and Petty Officer Burdette is like "what's that have to do anything w/your vagina?!"
964617929894527000
['PROVIDING INFORMATIONAL SUPPORT']
Check out this interesting factoid from our friends at the Oral Contraception Over the Counter Coalition! #FreeThePill https://t.co/nncjs2Abe9
248648850909315070
['SEEKING INFORMATIONAL SUPPORT']
So has anyone heard about the Mirena IUD Recall???
49

In [301]:
final_dicts = []
for _id, _labels in id_majority_labels_dict.items():
    for _label in _labels:
        final_dicts.append({'ID': _id,
                            'Label': _label,
                            'Text': id_text_dict[_id],
                            'Source': 'twitter-posts',
                            'Decision': 'two or more'})
final_df = pd.DataFrame(final_dicts)
final_df.to_csv(data_directory_path + '/labeling/majority.twitter_posts.csv')

<br><br>

# Get majority labels for TWITTER COMMENTS

In [302]:
target_df = combined_df[combined_df['Source'] == 'twitter-replies']

In [303]:
id_text_dict = {}
for i, r in target_df.iterrows():
    id_text_dict[r['ID']] = r['Text']
len(id_text_dict)

502

In [304]:
ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))
len(ids_labeled_by_roz)

284

In [305]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
len(id_labeler_labels_dict)

284

In [306]:
id_majority_labels_dict = {}
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _label_count_dict = defaultdict(int)
    for _labeler, _labels in _labeler_labels_dict.items():
        for _label in _labels:
            _label_count_dict[_label] += 1
    _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
    if _majority_labels:
        id_majority_labels_dict[_id] = _majority_labels
len(id_majority_labels_dict)

283

In [307]:
for _id, _labels in id_majority_labels_dict.items():
    print(_id)
    print(_labels)
    print(id_text_dict[_id])
    print('=============================================')

1306801969118445600
['PROVIDING EXPERIENCES']
@roxiqt Had severe panic attacks after getting an IUD so I had to have it removed, was on hormonal bc for so long that I had swelling of the brain, and now I get a shot every 3 months so a man can cum in me… fuck being female
473233002987020300
['PROVIDING INFORMATIONAL SUPPORT']
@NinoDemayo plan b is hormones, usually a higher dosage than in birth control, if it's toxic after a while birth control pill wld b deadly
1160981232349528000
['PROVIDING EXPERIENCES']
@IvetteBrianna_ it’s soo convenient! i’ve been on it for a few months now. it’s called the depo shot. i didn’t want an IUD or implant and i’m too forgetful for the pills lol
664898788327206900
['DISCOURSE']
.@absmom1 I do know that. You cannot force every woman in the country to take a hormonal pill/get an IUD against her will or her family's
1223701281417695200
['PROVIDING EXPERIENCES']
Anyways, I’m coming up on my IUD’s third birthday next month and I’d like to report that I have s

In [308]:
final_dicts = []
for _id, _labels in id_majority_labels_dict.items():
    for _label in _labels:
        final_dicts.append({'ID': _id,
                            'Label': _label,
                            'Text': id_text_dict[_id],
                            'Source': 'twitter-replies',
                            'Decision': 'two or more'})
final_df = pd.DataFrame(final_dicts)
final_df.to_csv(data_directory_path + '/labeling/majority.twitter_replies.csv')

<br><br>

# Calculate agreement

In [309]:
import nltk
from nltk.metrics import agreement
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics import masi_distance

In [310]:
target_df = combined_df[combined_df['Source'] == 'reddit-posts']
target_df['Label'].value_counts()

SEEKING INFORMATIONAL SUPPORT      908
SEEKING EXPERIENCES                502
SEEKING EMOTIONAL SUPPORT          301
PROVIDING EXPERIENCES              102
DISCOURSE                           13
PROVIDING INFORMATIONAL SUPPORT     12
PROVIDING EMOTIONAL SUPPORT         11
Name: Label, dtype: int64

In [311]:
target_df = combined_df[combined_df['Source'] == 'reddit-posts']
target_labels = ['SEEKING INFORMATIONAL SUPPORT', 'SEEKING EXPERIENCES', 'SEEKING EMOTIONAL SUPPORT', 'PROVIDING EXPERIENCES']

ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz and r['Label'] in target_labels:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.6169686157184409

In [312]:
target_df = combined_df[combined_df['Source'] == 'reddit-comments']

ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.6221652101674353

In [313]:
target_df = combined_df[combined_df['Source'] == 'twitter-posts']

ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.6007451355858668

In [314]:
target_df = combined_df[combined_df['Source'] == 'twitter-replies']

ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.6241422358848049

In [202]:
import pingouin as pg

In [203]:
target_df = combined_df[combined_df['Source'] == 'twitter-replies']

ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

labels_dicts = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        labels_dicts.append({'Labeler': _labeler,
                             'Item': _id,
                             'Labels': _labels})
labels_df = pd.DataFrame(labels_dicts)

icc = pg.intraclass_corr(data=labels_df, 
                         targets='Item', 
                         raters='Labeler',
                         ratings='Labels')

icc

DataError: No numeric types to aggregate