In [1]:
from collections import defaultdict
import random

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [2]:
data_directory_path   = '/Volumes/Passport-1/data/birth-control'
output_directory_path = '/Volumes/Passport-1/output/birth-control'

In [3]:
maria_path = data_directory_path + '/labeling/label-sentences/labeled_by_maria.all.csv'
leann_path = data_directory_path + '/labeling/label-sentences-leann/labeled_by_leann.all.csv'

<br><br>

# Load labeled data

## MARIA

In [4]:
maria_df = pd.read_csv(maria_path)

In [5]:
len(maria_df.index)

895

In [6]:
maria_df['Labeler'] = 'Maria'

In [7]:
def rename_source(text):
    if text == 'reddit':
        return 'reddit-posts'
    return text

maria_df['Source'] = maria_df['Source'].apply(rename_source)

In [8]:
maria_df['Source'].value_counts()

reddit-posts       239
twitter-replies    224
reddit-comments    219
twitter-posts      213
Name: Source, dtype: int64

In [9]:
maria_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
98,98,reddit-posts,bbkgb9,narrating personal experiences,"I’ve been feeling weak, probably because of th...",Maria
97,97,reddit-posts,4d4caa,seeking experiences,Has anyone else had this experience?,Maria
751,751,twitter-replies,1130183302294442000,NONE,"If you don’t want to get pregnant , 1.",Maria


## LEANN

In [10]:
leann_df = pd.read_csv(leann_path)

In [11]:
len(leann_df.index)

571

In [12]:
leann_df['Labeler'] = 'LeAnn'

In [13]:
leann_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
116,116,reddit-comments,d518hd9,providing personal experiences,I've had mine for 4 months and in the beginnin...,LeAnn
302,302,webmd-reviews,w9678,providing information (educational),Bayer is claiming this is a known risk.,LeAnn
522,522,twitter-replies,126397201638363140,providing information (educational),The annoluvant (?) birth control pill is also ...,LeAnn


In [14]:
leann_df['Source'].value_counts()

twitter-replies    122
reddit-comments    118
webmd-reviews      115
reddit-posts       109
twitter-posts      107
Name: Source, dtype: int64

## COMBINED

In [41]:
combined_df = pd.concat([maria_df, leann_df])
len(combined_df.index)

1466

In [42]:
combined_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
3,3,reddit-posts,j0cfeu,narrating personal experiences,The gap isn’t too big and no arm pain either.,Maria
455,455,reddit-comments,dv14oa9,NONE,"agreed, sorry I wasn’t sure if you meant anoth...",Maria
42,42,reddit-posts,irfgws,negative self-disclosure,"I lost all my confidence, had zero libido, and...",Maria


In [43]:
combined_df['Label'].value_counts(normalize=True)

narrating personal experiences         0.345839
providing information (educational)    0.146658
NONE                                   0.082538
other discourse                        0.079127
seeking experiences                    0.047067
seeking information (educational)      0.045703
negative self-disclosure               0.043656
providing information (advice)         0.038881
weighing options                       0.028649
humor                                  0.026603
providing other experiences            0.023192
providing personal experiences         0.021828
seeking information (advice)           0.019782
positive self-disclosure               0.018417
politics                               0.017053
providing emotional support            0.008186
seeking emotional support              0.006821
Name: Label, dtype: float64

In [44]:
combined_df['ID'] = combined_df['ID'].astype(str)

In [45]:
combined_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
336,336,webmd-reviews,w10937,narrating personal experiences,I haven't had a period in a little over a year...,LeAnn
264,264,webmd-reviews,w4533,narrating personal experiences,I've had it for a year and a half and have had...,LeAnn
293,293,reddit-comments,dy4zbsq,NONE,Long periods with nothing (max for me maybe 6 ...,Maria


In [39]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
id_data_dict = {}
for i, r in combined_df.iterrows():
    id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
    id_data_dict[r['ID']] = r

dicts_for_comparison = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _dict = {'ID': _id,
                'Source': id_data_dict[_id]['Source'],
                'Text': id_data_dict[_id]['Text']}
    for _labeler, _labels in _labeler_labels_dict.items():
        _dict[_labeler] = _labels
    dicts_for_comparison.append(_dict)
df_for_comparison = pd.DataFrame(dicts_for_comparison)

df_for_comparison.sample(5)

Unnamed: 0,ID,Source,Text,Maria,LeAnn
463,317670432931196900,twitter-posts,My wife just got a birth control implant that ...,[providing other experiences],[humor]
213,fabpmiw,reddit-comments,The first 2 weeks or so I had a bit of cystic ...,"[negative self-disclosure, narrating personal ...",[narrating personal experiences]
820,1235942125847502800,twitter-replies,the liberal obamacare terrorists hand out IUD'...,[politics],"[politics, other discourse]"
866,1235942125847502800,twitter-replies,the liberal obamacare terrorists hand out IUD'...,[politics],"[politics, other discourse]"
156,hy7n90,reddit-posts,When will it stop?,[seeking information (educational)],"[seeking emotional support, seeking informatio..."


In [47]:
print(len(df_for_comparison.index))
df_for_comparison = df_for_comparison.dropna()
len(df_for_comparison.index)

900


900

In [53]:
df_for_comparison.sample(5)

Unnamed: 0,ID,Source,Text,Maria,LeAnn
631,1316159727051698200,twitter-replies,"Your party is so extremist, you'd even ban the...",[politics],[politics]
725,1255582428523094000,twitter-replies,I use the Nexplanon and haven’t had any issues...,[narrating personal experiences],[narrating personal experiences]
679,1210782062543700000,twitter-replies,Omg I was on Nexplanon and I HAD to get off as...,[narrating personal experiences],[narrating personal experiences]
215,feyw642,reddit-comments,"But, as always, please call your doctor to ver...",[providing information (advice)],[providing information (advice)]
61,dxulro,reddit-posts,"Also, if I end up getting it out would I be li...","[weighing options, seeking information (educat...",[seeking information (educational)]


In [54]:
df_for_comparison.to_csv(data_directory_path + '/labeling/label-sentences/labels_for_comparison.csv')

<br><br>

# Get majority labels for REDDIT POSTS

In [60]:
# target_df = combined_df[combined_df['Source'] == 'reddit-posts']
target_df = df_for_comparison[df_for_comparison['Source'] == 'reddit-posts']

In [73]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
id_data_dict = {}
for i, r in target_df.iterrows():
    id_labeler_labels_dict[r['ID']]['Maria'] = sorted(r['Maria'])
    id_labeler_labels_dict[r['ID']]['LeAnn'] = sorted(r['LeAnn'])
    id_data_dict[r['ID']] = r

len(id_labeler_labels_dict)

100

In [61]:
id_text_dict = {}
for i, r in target_df.iterrows():
    id_text_dict[r['ID']] = r['Text']

In [62]:
len(id_text_dict)

100

In [74]:
disagreements = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _maria_labels = _labeler_labels_dict['Maria']
    _leann_labels = _labeler_labels_dict['LeAnn']
    if sorted(_maria_labels) != sorted(_leann_labels):
        disagreements.append(_id)
len(disagreements)

36

In [76]:
for _id in random.sample(disagreements, 10):
    # print(id_text_dict[_id])
    print(_id)
    print('Maria:', sorted(id_labeler_labels_dict[_id]['Maria']))
    print('Leann:', sorted(id_labeler_labels_dict[_id]['LeAnn']))
    print()

g7t3b4
Maria: ['narrating personal experiences', 'weighing options']
Leann: ['weighing options']

c9cyb4
Maria: ['narrating personal experiences']
Leann: ['other discourse']

5bpgso
Maria: ['narrating personal experiences', 'negative self-disclosure']
Leann: ['narrating personal experiences']

4vuprz
Maria: ['narrating personal experiences', 'negative self-disclosure', 'seeking emotional support']
Leann: ['seeking emotional support']

i10pmi
Maria: ['narrating personal experiences']
Leann: ['negative self-disclosure', 'seeking emotional support']

c2gb30
Maria: ['seeking information (advice)']
Leann: ['seeking information (educational)']

hy7n90
Maria: ['seeking information (educational)']
Leann: ['seeking emotional support', 'seeking information (educational)']

98qfc0
Maria: ['narrating personal experiences', 'negative self-disclosure']
Leann: ['narrating personal experiences']

ejn5r6
Maria: ['narrating personal experiences']
Leann: ['other discourse']

afba7b
Maria: ['narrating per

In [279]:
id_text_dict['96ldyj']

'[TITLE: i\'ve been keeping nuvaring in for 4 weeks at a time to prevent "period"/withdrawal bleeding - am i reducing effectiveness?] \n\nHey guys, I know I should speak to my gyno at my next appointment about this, but I was wondering if anyone here would have any info in the meantime. \n\nThe past few months I have not been taking my Nuvaring out after 3 weeks, but instead leaving it in for 4 and then immediately putting in a new one  to prevent withdrawal bleeding. It has been awesome lol.\n\nMy one concern is if I\'m reducing its effectiveness. Intuitively it seems like it would not effect it at all (similar to taking back-to-back oral contraceptive), but I was wondering if anyone here has a definitive answer on this. \n\nI\'ve lurked here a while - this really is a great resource. TAI for any info!'

In [77]:
complete_disagreements = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _maria_labels = _labeler_labels_dict['Maria']
    _roz_labels = _labeler_labels_dict['LeAnn']
    if not any(_label in _roz_labels for _label in _maria_labels):
        complete_disagreements.append(_id)
len(complete_disagreements)

14

In [78]:
for _id in complete_disagreements:
    # print(id_text_dict[_id])
    print(_id)
    print('Maria:', sorted(id_labeler_labels_dict[_id]['Maria']))
    print('LeAnn:', sorted(id_labeler_labels_dict[_id]['LeAnn']))
    print()

c9cyb4
Maria: ['narrating personal experiences']
LeAnn: ['other discourse']

1gnud9
Maria: ['seeking experiences', 'weighing options']
LeAnn: ['seeking emotional support']

872sqt
Maria: ['seeking information (advice)']
LeAnn: ['seeking information (educational)']

c2gb30
Maria: ['seeking information (advice)']
LeAnn: ['seeking information (educational)']

1jqaak
Maria: ['seeking experiences']
LeAnn: ['seeking emotional support']

jigegd
Maria: ['seeking information (advice)']
LeAnn: ['seeking information (educational)']

f8iqrv
Maria: ['providing other experiences']
LeAnn: ['narrating personal experiences']

eiqkoq
Maria: ['seeking information (advice)']
LeAnn: ['seeking information (educational)']

cumfon
Maria: ['narrating personal experiences']
LeAnn: ['other discourse']

cqenox
Maria: ['narrating personal experiences']
LeAnn: ['other discourse']

i10pmi
Maria: ['narrating personal experiences']
LeAnn: ['negative self-disclosure', 'seeking emotional support']

ejn5r6
Maria: ['narra

In [79]:
id_majority_labels_dict = {}
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _label_count_dict = defaultdict(int)
    for _labeler, _labels in _labeler_labels_dict.items():
        for _label in _labels:
            _label_count_dict[_label] += 1
    _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
    if _majority_labels:
        id_majority_labels_dict[_id] = _majority_labels
len(id_majority_labels_dict)

86

In [81]:
for _id, _labels in random.sample(id_majority_labels_dict.items(), 10):
    print(_id)
    print(_labels)
    print(id_text_dict[_id])
    print('=============================================')

hx79t9
['narrating personal experiences']
I started my period yesterday and all was normal.
7ccllo
['seeking information (advice)']
Not a lot, but I was wondering what I should do/how should I take care of this since I'm going to be come from my home town for a few days.
irii49
['narrating personal experiences']
I took the pill after the 7 days of stop eight days ago.
87onga
['seeking experiences']
but I’d just like to seek opinions from other women for my sake of mind.
16djlu
['narrating personal experiences']
I can't just go to a doctor's to find out because I am not a resident yet and it would cost me a lot of money, but it is becoming a big concern so if I must go to the doctor then I will.
f0rsrk
['providing information (educational)']
So for most bc pills, you take it for so many days, then have a period, well, period.
715ctl
['narrating personal experiences']
However, since those first 6 weeks I've had an almost non-stop period--not spotting--full on period.
hy7n90
['seeking inf

In [82]:
final_dicts = []
for _id, _labels in id_majority_labels_dict.items():
    for _label in _labels:
        final_dicts.append({'ID': _id,
                            'Label': _label,
                            'Text': id_text_dict[_id],
                            'Source': 'reddit-posts',
                            'Decision': 'two or more'})
final_df = pd.DataFrame(final_dicts)
final_df.to_csv(data_directory_path + '/labeling/label-sentences/majority.reddit_posts.csv')

In [83]:
final_df.sample(3)

Unnamed: 0,ID,Label,Text,Source,Decision
37,ch4ef1,seeking information (advice),"If it is, should I try using boric acid pills?",reddit-posts,two or more
68,d48krb,seeking information (educational),My luggage with my minipills inside was put in...,reddit-posts,two or more
5,dwss7a,narrating personal experiences,"As a pretext I'm in my mid 30s, married, don't...",reddit-posts,two or more


In [98]:
maria_labels = target_df['Maria'].tolist()
leann_labels = target_df['LeAnn'].tolist()

maria_labels = ['_'.join(_label[0].split()) for _label in maria_labels]
leann_labels = ['_'.join(_label[0].split()) for _label in leann_labels]

maria_labels = ['_'.join(_label.split('-')) for _label in maria_labels]
leann_labels = ['_'.join(_label.split('-')) for _label in leann_labels]

maria_labels = ['_'.join(_label.split('(')) for _label in maria_labels]
leann_labels = ['_'.join(_label.split('(')) for _label in leann_labels]

maria_labels = ['_'.join(_label.split(')')) for _label in maria_labels]
leann_labels = ['_'.join(_label.split(')')) for _label in leann_labels]

# for _id, _maria_labels in maria_id_labels_dict.items():
#     _maria_labels = ['_'.join(_label.split()) for _label in _maria_labels]
#     _leann_labels = ['_'.join(_label.split()) for _label in leann_id_labels_dict[_id]]
#     maria_labels.append(' '.join(_maria_labels))
#     leann_labels.append(' '.join(_leann_labels))

vectorizer = CountVectorizer().fit(maria_labels)
maria_Y = vectorizer.transform(maria_labels)
leann_Y = vectorizer.transform(leann_labels)
label_names = vectorizer.get_feature_names()

print(maria_Y.shape, leann_Y.shape)

print(classification_report(maria_Y, leann_Y, target_names=label_names))

(200, 10) (200, 10)
                                     precision    recall  f1-score   support

     narrating_personal_experiences       0.94      0.77      0.85        66
           negative_self_disclosure       1.00      0.67      0.80         9
                    other_discourse       0.09      1.00      0.17         1
providing_information__educational_       1.00      1.00      1.00         1
        providing_other_experiences       0.00      0.00      0.00         1
          seeking_emotional_support       0.01      1.00      0.03         1
                seeking_experiences       0.90      0.95      0.92        19
       seeking_information__advice_       0.88      0.50      0.64        14
  seeking_information__educational_       0.48      0.13      0.20        77
                   weighing_options       1.00      0.27      0.43        11

                          micro avg       0.49      0.49      0.49       200
                          macro avg       0.63      0.

<br><br>

<br><br>

# Get majority labels for REDDIT COMMENTS

In [288]:
target_df = combined_df[combined_df['Source'] == 'reddit-comments']

In [289]:
id_text_dict = {}
for i, r in target_df.iterrows():
    id_text_dict[r['ID']] = r['Text']
len(id_text_dict)

553

In [290]:
ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))
len(ids_labeled_by_roz)

345

In [291]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
len(id_labeler_labels_dict)

345

In [292]:
id_majority_labels_dict = {}
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _label_count_dict = defaultdict(int)
    for _labeler, _labels in _labeler_labels_dict.items():
        for _label in _labels:
            _label_count_dict[_label] += 1
    _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
    if _majority_labels:
        id_majority_labels_dict[_id] = _majority_labels
len(id_majority_labels_dict)

342

In [293]:
for _id, _labels in id_majority_labels_dict.items():
    print(_id)
    print(_labels)
    print(id_text_dict[_id])
    print('=============================================')

exeejsv
['PROVIDING INFORMATIONAL SUPPORT', 'PROVIDING EXPERIENCES', 'PROVIDING EMOTIONAL SUPPORT']
You should take it easy for the first week at least, you'll have bruising and it'll be tender. 

Just go with your body! I've had mine for almost 3 months and I don't notice it at all, the insertion mark is a tiny dot on my arm. Unless you were actively looking/feeling for it - you'd never know!
ew0eara
['PROVIDING EXPERIENCES']
It didn't feel good, but I'd do it again. It was a 9 on the pain scale, but it only lasted 5 seconds. When the sound was in, it was like a light cramp that got more intense. Nothing I hadn't felt before from normal menstrual cramps. Then during the insertion of the IUD, the pain got worse. It was at it's height for 5 seconds, and I put my jacket over my mouth because I was going "owwwwWWWW." Then my doc said "you're all done!" and helped me get up, it was immediate relief of pain. The recovery was pretty easy and well worth it. It was very fast, I was probably on

In [294]:
final_dicts = []
for _id, _labels in id_majority_labels_dict.items():
    for _label in _labels:
        final_dicts.append({'ID': _id,
                            'Label': _label,
                            'Text': id_text_dict[_id],
                            'Source': 'reddit-comments',
                            'Decision': 'two or more'})
final_df = pd.DataFrame(final_dicts)
final_df.to_csv(data_directory_path + '/labeling/majority.reddit_comments.csv')

<br><br>

# Get majority labels for TWITTER POSTS

In [295]:
target_df = combined_df[combined_df['Source'] == 'twitter-posts']

In [296]:
id_text_dict = {}
for i, r in target_df.iterrows():
    id_text_dict[r['ID']] = r['Text']
len(id_text_dict)

497

In [297]:
ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))
len(ids_labeled_by_roz)

292

In [298]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
len(id_labeler_labels_dict)

292

In [299]:
id_majority_labels_dict = {}
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _label_count_dict = defaultdict(int)
    for _labeler, _labels in _labeler_labels_dict.items():
        for _label in _labels:
            _label_count_dict[_label] += 1
    _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
    if _majority_labels:
        id_majority_labels_dict[_id] = _majority_labels
len(id_majority_labels_dict)

285

In [300]:
for _id, _labels in id_majority_labels_dict.items():
    print(_id)
    print(_labels)
    print(id_text_dict[_id])
    print('=============================================')

1266030580036259800
['SEEKING EMOTIONAL SUPPORT']
I want the IUD so bad but I’m scared 🥺🙄
1215895024107507700
['DISCOURSE']
i cannot believe there’s a non-hormonal oral contraceptive that’s been on the market for thirty YEARS but it’s only available in india. this world is a sick joke
1208819982840344600
['SEEKING EXPERIENCES']
Hey ladies, Do any of y’all have an IUD and use a menstrual cup?
155399192255283200
['DISCOURSE']
My cousin's wife just told me about a friend who was injured when an "IUD" exploded in Afghanistan.
392974329991790600
['DISCOURSE']
I said I have the birth control implant in my arm and Petty Officer Burdette is like "what's that have to do anything w/your vagina?!"
964617929894527000
['PROVIDING INFORMATIONAL SUPPORT']
Check out this interesting factoid from our friends at the Oral Contraception Over the Counter Coalition! #FreeThePill https://t.co/nncjs2Abe9
248648850909315070
['SEEKING INFORMATIONAL SUPPORT']
So has anyone heard about the Mirena IUD Recall???
49

In [301]:
final_dicts = []
for _id, _labels in id_majority_labels_dict.items():
    for _label in _labels:
        final_dicts.append({'ID': _id,
                            'Label': _label,
                            'Text': id_text_dict[_id],
                            'Source': 'twitter-posts',
                            'Decision': 'two or more'})
final_df = pd.DataFrame(final_dicts)
final_df.to_csv(data_directory_path + '/labeling/majority.twitter_posts.csv')

<br><br>

# Get majority labels for TWITTER COMMENTS

In [302]:
target_df = combined_df[combined_df['Source'] == 'twitter-replies']

In [303]:
id_text_dict = {}
for i, r in target_df.iterrows():
    id_text_dict[r['ID']] = r['Text']
len(id_text_dict)

502

In [304]:
ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))
len(ids_labeled_by_roz)

284

In [305]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
len(id_labeler_labels_dict)

284

In [306]:
id_majority_labels_dict = {}
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _label_count_dict = defaultdict(int)
    for _labeler, _labels in _labeler_labels_dict.items():
        for _label in _labels:
            _label_count_dict[_label] += 1
    _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
    if _majority_labels:
        id_majority_labels_dict[_id] = _majority_labels
len(id_majority_labels_dict)

283

In [307]:
for _id, _labels in id_majority_labels_dict.items():
    print(_id)
    print(_labels)
    print(id_text_dict[_id])
    print('=============================================')

1306801969118445600
['PROVIDING EXPERIENCES']
@roxiqt Had severe panic attacks after getting an IUD so I had to have it removed, was on hormonal bc for so long that I had swelling of the brain, and now I get a shot every 3 months so a man can cum in me… fuck being female
473233002987020300
['PROVIDING INFORMATIONAL SUPPORT']
@NinoDemayo plan b is hormones, usually a higher dosage than in birth control, if it's toxic after a while birth control pill wld b deadly
1160981232349528000
['PROVIDING EXPERIENCES']
@IvetteBrianna_ it’s soo convenient! i’ve been on it for a few months now. it’s called the depo shot. i didn’t want an IUD or implant and i’m too forgetful for the pills lol
664898788327206900
['DISCOURSE']
.@absmom1 I do know that. You cannot force every woman in the country to take a hormonal pill/get an IUD against her will or her family's
1223701281417695200
['PROVIDING EXPERIENCES']
Anyways, I’m coming up on my IUD’s third birthday next month and I’d like to report that I have s

In [308]:
final_dicts = []
for _id, _labels in id_majority_labels_dict.items():
    for _label in _labels:
        final_dicts.append({'ID': _id,
                            'Label': _label,
                            'Text': id_text_dict[_id],
                            'Source': 'twitter-replies',
                            'Decision': 'two or more'})
final_df = pd.DataFrame(final_dicts)
final_df.to_csv(data_directory_path + '/labeling/majority.twitter_replies.csv')

<br><br>

# Calculate agreement

In [309]:
import nltk
from nltk.metrics import agreement
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics import masi_distance

In [310]:
target_df = combined_df[combined_df['Source'] == 'reddit-posts']
target_df['Label'].value_counts()

SEEKING INFORMATIONAL SUPPORT      908
SEEKING EXPERIENCES                502
SEEKING EMOTIONAL SUPPORT          301
PROVIDING EXPERIENCES              102
DISCOURSE                           13
PROVIDING INFORMATIONAL SUPPORT     12
PROVIDING EMOTIONAL SUPPORT         11
Name: Label, dtype: int64

In [311]:
target_df = combined_df[combined_df['Source'] == 'reddit-posts']
target_labels = ['SEEKING INFORMATIONAL SUPPORT', 'SEEKING EXPERIENCES', 'SEEKING EMOTIONAL SUPPORT', 'PROVIDING EXPERIENCES']

ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz and r['Label'] in target_labels:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.6169686157184409

In [312]:
target_df = combined_df[combined_df['Source'] == 'reddit-comments']

ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.6221652101674353

In [313]:
target_df = combined_df[combined_df['Source'] == 'twitter-posts']

ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.6007451355858668

In [314]:
target_df = combined_df[combined_df['Source'] == 'twitter-replies']

ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.6241422358848049

In [202]:
import pingouin as pg

In [203]:
target_df = combined_df[combined_df['Source'] == 'twitter-replies']

ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

labels_dicts = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        labels_dicts.append({'Labeler': _labeler,
                             'Item': _id,
                             'Labels': _labels})
labels_df = pd.DataFrame(labels_dicts)

icc = pg.intraclass_corr(data=labels_df, 
                         targets='Item', 
                         raters='Labeler',
                         ratings='Labels')

icc

DataError: No numeric types to aggregate