In [1]:
from collections import defaultdict
import random

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='ticks', font_scale=1.2)
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [2]:
data_directory_path   = '/Users/maria/Documents/data/birth-control'
output_directory_path = '/Users/maria/Documents/output/birth-control'

In [3]:
maria_path = data_directory_path + '/labeling/label-discourse/labeled_by_maria.all.csv'
# leann_path = data_directory_path + '/labeling/label-sentences-leann/labeled_by_leann.all.csv'
leann_path = data_directory_path + '/labeling/label-discourse/labeled_by_leann_take4.csv'

<br><br>

# Load labeled data

## MARIA

In [4]:
maria_df = pd.read_csv(maria_path)

In [5]:
len(maria_df.index)

1243

In [6]:
maria_df['Labeler'] = 'Maria'

In [7]:
def rename_source(text):
    if text == 'reddit':
        return 'reddit-posts'
    return text

maria_df['Source'] = maria_df['Source'].apply(rename_source)

In [8]:
maria_df['Source'].value_counts()

twitter-posts      270
webmd-reviews      264
reddit-posts       250
twitter-replies    230
reddit-comments    229
Name: Source, dtype: int64

In [9]:
maria_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
822,822,twitter-replies,1142335452424396800,SHARING PERSONAL EXPERIENCES,"I have a period maybe 5/6 times a year, on 3/4...",Maria
1147,1147,webmd-reviews,w11992,SHARING PERSONAL EXPERIENCES,"Ive gotten terrible mood swings, almost to the...",Maria
1077,1077,webmd-reviews,w4962,SHARING PERSONAL BACKGROUND,I have been using the implanon for almost 6 mo...,Maria


## LEANN

In [10]:
leann_df = pd.read_csv(leann_path)

In [11]:
len(leann_df.index)

958

In [12]:
leann_df['Labeler'] = 'LeAnn'

In [13]:
leann_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
888,888,webmd-reviews,w15827,SHARING PERSONAL EXPERIENCES,Because I am period free for 3 months all beca...,LeAnn
946,946,webmd-reviews,w10784,SHARING PERSONAL EXPERIENCES,I have a problem worring about the IUD coming ...,LeAnn
951,951,webmd-reviews,w10378,META DISCUSSION,hopefully i'll stop spotting soon so i can hav...,LeAnn


In [14]:
leann_df['Source'].value_counts()

webmd-reviews      228
twitter-posts      219
reddit-posts       183
twitter-replies    172
reddit-comments    156
Name: Source, dtype: int64

## COMBINED

In [15]:
combined_df = pd.concat([maria_df, leann_df])
len(combined_df.index)

2201

In [16]:
combined_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
811,811,webmd-reviews,w11917,SHARING PERSONAL EXPERIENCES,My depression is super bad i really did like t...,LeAnn
661,661,twitter-replies,731274595022442500,SHARING PERSONAL EXPERIENCES,"Used IUD for years, zero side effects, 100% ef...",LeAnn
515,515,twitter-posts,1070473228156584000,META DISCUSSION,do y’all think there’s a listening device in i...,LeAnn


In [17]:
combined_df['Label'].value_counts(normalize=True)

SHARING PERSONAL EXPERIENCES                0.330304
SHARING INFORMATION                         0.107678
META DISCUSSION                             0.092685
SHARING OPINIONS AND PREFERENCES            0.089959
SHARING/DESCRIBING ADDITIONAL RESEARCH      0.074512
NONE                                        0.062699
SEEKING INFORMATION                         0.040436
SHARING PERSONAL BACKGROUND                 0.032712
SHARING FUTURE PLANS                        0.031804
SHARING CAUSAL REASONING / HYPOTHESIZING    0.028623
SHARING ADVICE                              0.027715
SEEKING EXPERIENCES                         0.024534
SHARING SECONDHAND EXPERIENCES              0.021354
SEEKING EMOTIONAL SUPPORT                   0.015902
SEEKING ADVICE                              0.005452
SHARING NORMALITY                           0.005452
SEEKING NORMALITY                           0.004089
SHARING EMOTIONAL SUPPORT                   0.004089
Name: Label, dtype: float64

In [18]:
def fix_slashes(x):
    return x.replace('/', '-')

combined_df['Label'] = combined_df['Label'].apply(fix_slashes)

In [19]:
combined_df['ID'] = combined_df['ID'].astype(str)

In [20]:
combined_df.sample(3)

Unnamed: 0.1,Unnamed: 0,Source,ID,Label,Text,Labeler
270,270,reddit-comments,dbqilc4,SHARING SECONDHAND EXPERIENCES,Most of the ladies I talked to had their perio...,LeAnn
188,188,reddit-comments,dbg02ve,SHARING INFORMATION,"If not grandfathered, the Copper IUD must be 1...",LeAnn
51,51,reddit-posts,9l83t6,SHARING PERSONAL EXPERIENCES,I'm now doing a 5 month Accutane course as sug...,LeAnn


In [21]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
id_data_dict = {}
for i, r in combined_df.iterrows():
    id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
    id_data_dict[r['ID']] = r

dicts_for_comparison = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _dict = {'ID': _id,
                'Source': id_data_dict[_id]['Source'],
                'Text': id_data_dict[_id]['Text']}
    for _labeler, _labels in _labeler_labels_dict.items():
        _dict[_labeler] = _labels
    dicts_for_comparison.append(_dict)
df_for_comparison = pd.DataFrame(dicts_for_comparison)

df_for_comparison.sample(5)

Unnamed: 0,ID,Source,Text,Maria,LeAnn
852,w4534,webmd-reviews,I had no other choice though.,[SHARING PERSONAL EXPERIENCES],
470,1178355829269057500,twitter-posts,How do y’all feel about NEXPLANON ?,[SEEKING EXPERIENCES],"[META DISCUSSION, SEEKING EXPERIENCES]"
372,fe77tri,reddit-comments,I guess I’ll have to bear my stupid periods.,[NONE],
850,w9448,webmd-reviews,"mirena insertion, and removal, were difficult ...","[SHARING PERSONAL EXPERIENCES, SHARING OPINION...","[SHARING NORMALITY, SHARING OPINIONS AND PREFE..."
380,cukhr34,reddit-comments,I don't see anything to worry about.,[SHARING EMOTIONAL SUPPORT],


In [22]:
print(len(df_for_comparison.index))
df_for_comparison = df_for_comparison.dropna()
len(df_for_comparison.index)

1004


777

In [23]:
df_for_comparison.sample(5)

Unnamed: 0,ID,Source,Text,Maria,LeAnn
530,1138203159845908500,twitter-posts,Nexplanon have me like https://t.co/8EaAYM1CwX,[SHARING-DESCRIBING ADDITIONAL RESEARCH],[META DISCUSSION]
877,w4606,webmd-reviews,Do not get the Implanon!,[SHARING ADVICE],[SHARING ADVICE]
811,w9284,webmd-reviews,I had the Mirena IUD inserted three years ago.,[SHARING PERSONAL EXPERIENCES],[SHARING PERSONAL EXPERIENCES]
799,174547174309371900,twitter-replies,i lost my birth control pill,[SHARING PERSONAL EXPERIENCES],[META DISCUSSION]
507,824492301196075000,twitter-posts,"Personally I loved my IUD, never had issues wi...","[SHARING PERSONAL EXPERIENCES, SHARING OPINION...","[SHARING PERSONAL EXPERIENCES, SHARING OPINION..."


In [24]:
df_for_comparison.to_csv(data_directory_path + '/labeling/label-discourse/labels_for_comparison.csv')

<br><br><br><br>

# Get majority labels for all sources

In [25]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
id_data_dict = {}
for i, r in df_for_comparison.iterrows():
    id_labeler_labels_dict[r['ID']]['Maria'] = sorted(r['Maria'])
    id_labeler_labels_dict[r['ID']]['LeAnn'] = sorted(r['LeAnn'])
    id_data_dict[r['ID']] = r

len(id_labeler_labels_dict)

777

In [26]:
id_text_dict = {}
for i, r in df_for_comparison.iterrows():
    id_text_dict[r['ID']] = r['Text']

In [35]:
target_label = 'SHARING PERSONAL EXPERIENCES'

for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _maria_labels = _labeler_labels_dict['Maria']
    _leann_labels = _labeler_labels_dict['LeAnn']
    if target_label in _maria_labels and target_label in _leann_labels:
        print(_id)
        print('Maria:', sorted(id_labeler_labels_dict[_id]['Maria']))
        print('Leann:', sorted(id_labeler_labels_dict[_id]['LeAnn']))
        print(id_text_dict[_id])
        print()


14hem2
Maria: ['SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING PERSONAL EXPERIENCES']
Since I've been so stressed out, I decided that since I had forgot a few pills, I would go off the pill for a month or two and let my body rest during the Christmas university break.

8j8jl4
Maria: ['SHARING PERSONAL BACKGROUND', 'SHARING PERSONAL EXPERIENCES', 'SHARING-DESCRIBING ADDITIONAL RESEARCH']
Leann: ['SHARING PERSONAL BACKGROUND', 'SHARING PERSONAL EXPERIENCES']
(Link for more info: https://www.loloestrin.com/loloestrin/lo-loestrin-faqs)

I recently went in for my annual appointment (at a different doctor since I relocated) and she asked if I was sexually active, to which I said yes

fdzgsu
Maria: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES']
It lasted for a few weeks but I figured it was because of the change .

bqr7xt
Maria: ['SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING P

In [31]:
agreements = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _maria_labels = _labeler_labels_dict['Maria']
    _leann_labels = _labeler_labels_dict['LeAnn']
    if sorted(_maria_labels) == sorted(_leann_labels):
        agreements.append(_id)
len(agreements)

500

In [33]:
target_label = 'SHARING CAUSAL REASONING - HYPOTHESIZING'

for _id in agreements:
    if target_label in id_labeler_labels_dict[_id]['Maria'] and target_label in id_labeler_labels_dict[_id]['LeAnn']:
        print(_id)
        print('Maria:', sorted(id_labeler_labels_dict[_id]['Maria']))
        print('Leann:', sorted(id_labeler_labels_dict[_id]['LeAnn']))
        print(id_text_dict[_id])
        print()

fdzgsu
Maria: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES']
It lasted for a few weeks but I figured it was because of the change .

eejrji
Maria: ['SHARING CAUSAL REASONING - HYPOTHESIZING']
Leann: ['SHARING CAUSAL REASONING - HYPOTHESIZING']
But I know it will probably take up to 6 months for my body to re-adjust.

6nhbkg
Maria: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES', 'SHARING-DESCRIBING ADDITIONAL RESEARCH']
Leann: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES', 'SHARING-DESCRIBING ADDITIONAL RESEARCH']
This lasted for MONTHS, so I read somewhere online that vitamin e and zinc help with this, and it did stop the bleeding for a couple weeks, but I just started spotting again today.

7vj4c9
Maria: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING CAUSAL REASONING -

In [27]:
disagreements = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _maria_labels = _labeler_labels_dict['Maria']
    _leann_labels = _labeler_labels_dict['LeAnn']
    if sorted(_maria_labels) != sorted(_leann_labels):
        disagreements.append(_id)
len(disagreements)

277

In [30]:
target_label = 'SHARING CAUSAL REASONING - HYPOTHESIZING'

for _id in disagreements:
    if (target_label in id_labeler_labels_dict[_id]['Maria'] or target_label in id_labeler_labels_dict[_id]['LeAnn']) and not (target_label in id_labeler_labels_dict[_id]['Maria'] and target_label in id_labeler_labels_dict[_id]['LeAnn']):
        print(_id)
        print('Maria:', sorted(id_labeler_labels_dict[_id]['Maria']))
        print('Leann:', sorted(id_labeler_labels_dict[_id]['LeAnn']))
        print(id_text_dict[_id])
        print()

5a9opz
Maria: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING PERSONAL EXPERIENCES', 'SHARING-DESCRIBING ADDITIONAL RESEARCH']
I thought I would be fine since I was on the depo so long beforehand, and didn't even realize this could possibly be a symptom of the implant.

9l83t6
Maria: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES', 'SHARING-DESCRIBING ADDITIONAL RESEARCH']
Leann: ['SHARING PERSONAL EXPERIENCES', 'SHARING-DESCRIBING ADDITIONAL RESEARCH']
I'm now doing a 5 month Accutane course as suggested by my dermatologist because I'm still breaking out consistently with large cysts.

a3q72g
Maria: ['SEEKING INFORMATION']
Leann: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES']
I'm wondering if this is a Mirena crash or just really bad PMS since my period is due Monday.

93b7vq
Maria: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING PERSONA

In [153]:
id_text_dict['g3930c']

KeyError: 'g3930c'

In [29]:
maria_labels = df_for_comparison['Maria'].tolist()
leann_labels = df_for_comparison['LeAnn'].tolist()

maria_labels = ['_'.join(_label[0].split()) for _label in maria_labels]
leann_labels = ['_'.join(_label[0].split()) for _label in leann_labels]

maria_labels = ['_'.join(_label.split('-')) for _label in maria_labels]
leann_labels = ['_'.join(_label.split('-')) for _label in leann_labels]

maria_labels = ['_'.join(_label.split('(')) for _label in maria_labels]
leann_labels = ['_'.join(_label.split('(')) for _label in leann_labels]

maria_labels = ['_'.join(_label.split(')')) for _label in maria_labels]
leann_labels = ['_'.join(_label.split(')')) for _label in leann_labels]

# for _id, _maria_labels in maria_id_labels_dict.items():
#     _maria_labels = ['_'.join(_label.split()) for _label in _maria_labels]
#     _leann_labels = ['_'.join(_label.split()) for _label in leann_id_labels_dict[_id]]
#     maria_labels.append(' '.join(_maria_labels))
#     leann_labels.append(' '.join(_leann_labels))

vectorizer = CountVectorizer().fit(maria_labels)
maria_Y = vectorizer.transform(maria_labels)
leann_Y = vectorizer.transform(leann_labels)
label_names = vectorizer.get_feature_names()

print(maria_Y.shape, leann_Y.shape)

print(classification_report(maria_Y, leann_Y, target_names=label_names))

(777, 18) (777, 18)
                                          precision    recall  f1-score   support

                         meta_discussion       0.62      0.73      0.67        89
                                    none       0.00      0.00      0.00        32
                          seeking_advice       0.75      0.50      0.60         6
               seeking_emotional_support       0.54      0.64      0.58        11
                     seeking_experiences       0.70      0.74      0.72        19
                     seeking_information       0.76      0.85      0.80        33
                       seeking_normality       1.00      0.75      0.86         4
                          sharing_advice       0.76      0.79      0.78        24
sharing_causal_reasoning___hypothesizing       0.75      0.23      0.35        13
  sharing_describing_additional_research       0.58      0.68      0.62        28
               sharing_emotional_support       1.00      0.50      0.67      

  _warn_prf(average, modifier, msg_start, len(result))


<br><br>

# Get majority labels for REDDIT POSTS

In [38]:
# target_df = combined_df[combined_df['Source'] == 'reddit-posts']
target_df = df_for_comparison[df_for_comparison['Source'] == 'reddit-posts']

In [39]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
id_data_dict = {}
for i, r in target_df.iterrows():
    id_labeler_labels_dict[r['ID']]['Maria'] = sorted(r['Maria'])
    id_labeler_labels_dict[r['ID']]['LeAnn'] = sorted(r['LeAnn'])
    id_data_dict[r['ID']] = r

len(id_labeler_labels_dict)

139

In [40]:
id_text_dict = {}
for i, r in target_df.iterrows():
    id_text_dict[r['ID']] = r['Text']

In [41]:
len(id_text_dict)

139

In [42]:
disagreements = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _maria_labels = _labeler_labels_dict['Maria']
    _leann_labels = _labeler_labels_dict['LeAnn']
    if sorted(_maria_labels) != sorted(_leann_labels):
        disagreements.append(_id)
len(disagreements)

56

In [43]:
for _id in random.sample(disagreements, 10):
    # print(id_text_dict[_id])
    print(_id)
    print('Maria:', sorted(id_labeler_labels_dict[_id]['Maria']))
    print('Leann:', sorted(id_labeler_labels_dict[_id]['LeAnn']))
    print()

d95ptp
Maria: ['SEEKING NORMALITY']
Leann: ['SEEKING EMOTIONAL SUPPORT', 'SEEKING NORMALITY']

axsgz5
Maria: ['SEEKING INFORMATION', 'SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING NORMALITY']
Leann: ['SEEKING INFORMATION']

eexoq2
Maria: ['SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING OPINIONS AND PREFERENCES', 'SHARING PERSONAL EXPERIENCES']

fc20ys
Maria: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING PERSONAL EXPERIENCES']

f8a6jp
Maria: ['SHARING FUTURE PLANS']
Leann: ['SHARING PERSONAL EXPERIENCES']

ej4z6v
Maria: ['SEEKING INFORMATION', 'SHARING CAUSAL REASONING - HYPOTHESIZING']
Leann: ['SEEKING INFORMATION', 'SHARING PERSONAL BACKGROUND', 'SHARING PERSONAL EXPERIENCES']

a7yhtu
Maria: ['SHARING OPINIONS AND PREFERENCES']
Leann: ['SHARING OPINIONS AND PREFERENCES', 'SHARING PERSONAL EXPERIENCES']

b9bhp0
Maria: ['SEEKING ADVICE']
Leann: ['SEEKING INFORMATION']

y9a6w
Maria: ['SEEKING EXPERIENCES', 'SHARING CAUSAL REASONING -

In [44]:
id_text_dict['g3930c']

'I’m so over it wow.'

In [45]:
complete_disagreements = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _maria_labels = _labeler_labels_dict['Maria']
    _roz_labels = _labeler_labels_dict['LeAnn']
    if not any(_label in _roz_labels for _label in _maria_labels):
        complete_disagreements.append(_id)
len(complete_disagreements)

22

In [46]:
for _id in complete_disagreements:
    # print(id_text_dict[_id])
    print(_id)
    print('Maria:', sorted(id_labeler_labels_dict[_id]['Maria']))
    print('LeAnn:', sorted(id_labeler_labels_dict[_id]['LeAnn']))
    print()

gx89cs
Maria: ['META DISCUSSION']
LeAnn: ['SHARING ADVICE']

jgarl8
Maria: ['NONE']
LeAnn: ['SHARING PERSONAL EXPERIENCES']

dql3vd
Maria: ['SHARING PERSONAL EXPERIENCES']
LeAnn: ['SEEKING EMOTIONAL SUPPORT']

esfea8
Maria: ['SHARING FUTURE PLANS']
LeAnn: ['SEEKING ADVICE', 'SHARING PERSONAL EXPERIENCES']

6ezqrm
Maria: ['SEEKING EXPERIENCES']
LeAnn: ['SEEKING ADVICE']

a3q72g
Maria: ['SEEKING INFORMATION']
LeAnn: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES']

b9bhp0
Maria: ['SEEKING ADVICE']
LeAnn: ['SEEKING INFORMATION']

5nhkhl
Maria: ['NONE']
LeAnn: ['SHARING PERSONAL EXPERIENCES']

ew0afd
Maria: ['SEEKING INFORMATION', 'SHARING FUTURE PLANS']
LeAnn: ['SHARING PERSONAL EXPERIENCES']

c0l1vc
Maria: ['NONE']
LeAnn: ['SHARING PERSONAL EXPERIENCES']

f8a6jp
Maria: ['SHARING FUTURE PLANS']
LeAnn: ['SHARING PERSONAL EXPERIENCES']

8vtjr6
Maria: ['SHARING PERSONAL BACKGROUND']
LeAnn: ['SHARING PERSONAL EXPERIENCES']

4sx3qr
Maria: ['SHARING CAUSAL REASONING 

In [47]:
id_majority_labels_dict = {}
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _label_count_dict = defaultdict(int)
    for _labeler, _labels in _labeler_labels_dict.items():
        for _label in _labels:
            _label_count_dict[_label] += 1
    _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
    if _majority_labels:
        id_majority_labels_dict[_id] = _majority_labels
len(id_majority_labels_dict)

117

In [48]:
for _id, _labels in random.sample(id_majority_labels_dict.items(), 10):
    print(_id)
    print(_labels)
    print(id_text_dict[_id])
    print('=============================================')

8efhj9
['SHARING OPINIONS AND PREFERENCES', 'SHARING PERSONAL EXPERIENCES']
I cannot wait to get it out even though it’s also done a lot of good for me.
8ex090
['SHARING PERSONAL EXPERIENCES']
I haven’t had even a bit of spotting except for I just went the loo and had (TMI) kind of a blob of discharge, the only way I can describe was like a circle of watery chocolate?
14hem2
['SHARING PERSONAL EXPERIENCES']
Since I've been so stressed out, I decided that since I had forgot a few pills, I would go off the pill for a month or two and let my body rest during the Christmas university break.
117t5i
['SEEKING EXPERIENCES', 'SHARING PERSONAL EXPERIENCES']
This only started when I got the IUD - has anyone else experienced this?
eizk8y
['SHARING PERSONAL EXPERIENCES']
But then started to go to my forehead and cheeks.
9gvsyi
['SHARING PERSONAL EXPERIENCES']
about 2weeks ago I randomly got a period but it was dead heavy and gave me real bad stomach pains and still 11 days on I still get those sto

In [49]:
final_dicts = []
for _id, _labels in id_majority_labels_dict.items():
    for _label in _labels:
        final_dicts.append({'ID': _id,
                            'Label': _label,
                            'Text': id_text_dict[_id],
                            'Source': 'reddit-posts',
                            'Decision': 'two or more'})
final_df = pd.DataFrame(final_dicts)
final_df.to_csv(data_directory_path + '/labeling/label-discourse/majority.reddit_posts.csv')

In [50]:
final_df.sample(3)

Unnamed: 0,ID,Label,Text,Source,Decision
62,jjsyfr,SHARING PERSONAL EXPERIENCES,So I've been on the combination pill for like ...,reddit-posts,two or more
32,6nhbkg,SHARING-DESCRIBING ADDITIONAL RESEARCH,"This lasted for MONTHS, so I read somewhere on...",reddit-posts,two or more
21,du35ct,SEEKING EMOTIONAL SUPPORT,so i’m kinda freaked out rn.,reddit-posts,two or more


In [51]:
maria_labels = target_df['Maria'].tolist()
leann_labels = target_df['LeAnn'].tolist()

maria_labels = ['_'.join(_label[0].split()) for _label in maria_labels]
leann_labels = ['_'.join(_label[0].split()) for _label in leann_labels]

maria_labels = ['_'.join(_label.split('-')) for _label in maria_labels]
leann_labels = ['_'.join(_label.split('-')) for _label in leann_labels]

maria_labels = ['_'.join(_label.split('(')) for _label in maria_labels]
leann_labels = ['_'.join(_label.split('(')) for _label in leann_labels]

maria_labels = ['_'.join(_label.split(')')) for _label in maria_labels]
leann_labels = ['_'.join(_label.split(')')) for _label in leann_labels]

# for _id, _maria_labels in maria_id_labels_dict.items():
#     _maria_labels = ['_'.join(_label.split()) for _label in _maria_labels]
#     _leann_labels = ['_'.join(_label.split()) for _label in leann_id_labels_dict[_id]]
#     maria_labels.append(' '.join(_maria_labels))
#     leann_labels.append(' '.join(_leann_labels))

vectorizer = CountVectorizer().fit(maria_labels)
maria_Y = vectorizer.transform(maria_labels)
leann_Y = vectorizer.transform(leann_labels)
label_names = vectorizer.get_feature_names()

print(maria_Y.shape, leann_Y.shape)

print(classification_report(maria_Y, leann_Y, target_names=label_names))

(139, 14) (139, 14)
                                          precision    recall  f1-score   support

                         meta_discussion       0.50      0.50      0.50         2
                                    none       0.00      0.00      0.00         4
                          seeking_advice       0.67      0.40      0.50         5
               seeking_emotional_support       0.50      0.80      0.62         5
                     seeking_experiences       0.75      0.75      0.75         8
                     seeking_information       0.82      0.90      0.86        20
                       seeking_normality       1.00      0.67      0.80         3
sharing_causal_reasoning___hypothesizing       1.00      0.14      0.25         7
  sharing_describing_additional_research       0.71      0.83      0.77         6
                    sharing_future_plans       1.00      0.47      0.64        15
                     sharing_information       1.00      1.00      1.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<br><br>

# Get majority labels for REDDIT COMMENTS

In [52]:
# target_df = combined_df[combined_df['Source'] == 'reddit-comments']
target_df = df_for_comparison[df_for_comparison['Source'] == 'reddit-comments']

In [53]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
id_data_dict = {}
for i, r in target_df.iterrows():
    id_labeler_labels_dict[r['ID']]['Maria'] = sorted(r['Maria'])
    id_labeler_labels_dict[r['ID']]['LeAnn'] = sorted(r['LeAnn'])
    id_data_dict[r['ID']] = r

len(id_labeler_labels_dict)

141

In [54]:
id_text_dict = {}
for i, r in target_df.iterrows():
    id_text_dict[r['ID']] = r['Text']

len(id_text_dict)

141

In [55]:
disagreements = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _maria_labels = _labeler_labels_dict['Maria']
    _leann_labels = _labeler_labels_dict['LeAnn']
    if sorted(_maria_labels) != sorted(_leann_labels):
        disagreements.append(_id)
len(disagreements)

43

In [56]:
for _id in random.sample(disagreements, 10):
    # print(id_text_dict[_id])
    print(_id)
    print('Maria:', sorted(id_labeler_labels_dict[_id]['Maria']))
    print('Leann:', sorted(id_labeler_labels_dict[_id]['LeAnn']))
    print()

d6r246m
Maria: ['SHARING NORMALITY']
Leann: ['SHARING INFORMATION']

d518hd9
Maria: ['SHARING PERSONAL BACKGROUND', 'SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING PERSONAL EXPERIENCES']

ck8djlp
Maria: ['SHARING OPINIONS AND PREFERENCES']
Leann: ['SHARING PERSONAL EXPERIENCES']

c69x6ig
Maria: ['SHARING OPINIONS AND PREFERENCES', 'SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING PERSONAL EXPERIENCES']

f1cb466
Maria: ['SHARING OPINIONS AND PREFERENCES']
Leann: ['META DISCUSSION', 'SHARING OPINIONS AND PREFERENCES']

dfq18p0
Maria: ['SHARING INFORMATION', 'SHARING NORMALITY']
Leann: ['SHARING NORMALITY']

d7g7sbu
Maria: ['SHARING CAUSAL REASONING - HYPOTHESIZING', 'SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING PERSONAL EXPERIENCES']

ee7olrd
Maria: ['SHARING FUTURE PLANS']
Leann: ['SHARING OPINIONS AND PREFERENCES']

fnvse92
Maria: ['SHARING OPINIONS AND PREFERENCES', 'SHARING PERSONAL EXPERIENCES']
Leann: ['SHARING PERSONAL EXPERIENCES']

dn2h5a1
Maria: ['SHARING OPINIONS AND PREFER

In [57]:
id_text_dict['f4xu0cl']

'and I’m glad I did it.'

In [58]:
complete_disagreements = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _maria_labels = _labeler_labels_dict['Maria']
    _roz_labels = _labeler_labels_dict['LeAnn']
    if not any(_label in _roz_labels for _label in _maria_labels):
        complete_disagreements.append(_id)
len(complete_disagreements)

23

In [59]:
for _id in complete_disagreements:
    # print(id_text_dict[_id])
    print(_id)
    print('Maria:', sorted(id_labeler_labels_dict[_id]['Maria']))
    print('LeAnn:', sorted(id_labeler_labels_dict[_id]['LeAnn']))
    print()

fjo8aa9
Maria: ['NONE']
LeAnn: ['SHARING OPINIONS AND PREFERENCES']

d6ebw1r
Maria: ['NONE']
LeAnn: ['SEEKING EXPERIENCES']

d6r246m
Maria: ['SHARING NORMALITY']
LeAnn: ['SHARING INFORMATION']

fgborgo
Maria: ['META DISCUSSION']
LeAnn: ['SHARING INFORMATION']

cib65wx
Maria: ['NONE']
LeAnn: ['SHARING INFORMATION']

eeye263
Maria: ['META DISCUSSION']
LeAnn: ['SHARING INFORMATION']

e6yndtc
Maria: ['SHARING SECONDHAND EXPERIENCES']
LeAnn: ['SHARING INFORMATION', 'SHARING PERSONAL EXPERIENCES']

f7jnr89
Maria: ['SHARING INFORMATION']
LeAnn: ['SHARING CAUSAL REASONING - HYPOTHESIZING']

e64pc86
Maria: ['META DISCUSSION']
LeAnn: ['SHARING INFORMATION']

ck8djlp
Maria: ['SHARING OPINIONS AND PREFERENCES']
LeAnn: ['SHARING PERSONAL EXPERIENCES']

d2ixthu
Maria: ['NONE']
LeAnn: ['SHARING-DESCRIBING ADDITIONAL RESEARCH']

dehb91h
Maria: ['NONE']
LeAnn: ['SEEKING INFORMATION']

ezfjqyh
Maria: ['SHARING OPINIONS AND PREFERENCES']
LeAnn: ['SHARING PERSONAL EXPERIENCES']

epdyehl
Maria: ['SHARING I

In [60]:
id_majority_labels_dict = {}
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _label_count_dict = defaultdict(int)
    for _labeler, _labels in _labeler_labels_dict.items():
        for _label in _labels:
            _label_count_dict[_label] += 1
    _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
    if _majority_labels:
        id_majority_labels_dict[_id] = _majority_labels
len(id_majority_labels_dict)

118

In [61]:
for _id, _labels in random.sample(id_majority_labels_dict.items(), 10):
    print(_id)
    print(_labels)
    print(id_text_dict[_id])
    print('=============================================')

egkkyy6
['SHARING INFORMATION', 'SHARING SECONDHAND EXPERIENCES']
People tend to like Nexplanon (can't beat that 99.95% effectiveness rating, quite literally)
dgvijd4
['SHARING OPINIONS AND PREFERENCES']
Wow that sounds like everything I hope to happen (except the boobs shrinking that's like the only really big upside to nexplanon for me but maybe it won't be too much)!
erxl2jk
['SHARING-DESCRIBING ADDITIONAL RESEARCH']
My doctor hasn't been concerned by it.
ecekvux
['SHARING EMOTIONAL SUPPORT']
I feel your pain.
flfknnj
['SHARING PERSONAL EXPERIENCES']
It was buried in my bicep muscle.
dkkul1w
['SHARING PERSONAL EXPERIENCES']
I have had my Paragard since 2013, and in my experience, I have a period that is usual (for me), but occasionally, if I am stressed or travelling, or my body just randomly decides to be weird, it does something different.
cgqejbn
['SHARING PERSONAL EXPERIENCES']
My parents health insurance covered 100% of 2 IUD insertion and one IUD removal.
f5pm41h
['SHARING INF

In [62]:
final_dicts = []
for _id, _labels in id_majority_labels_dict.items():
    for _label in _labels:
        final_dicts.append({'ID': _id,
                            'Label': _label,
                            'Text': id_text_dict[_id],
                            'Source': 'reddit-posts',
                            'Decision': 'two or more'})
final_df = pd.DataFrame(final_dicts)
final_df.sample(3)

Unnamed: 0,ID,Label,Text,Source,Decision
54,e05lo37,SHARING PERSONAL EXPERIENCES,The spotting happened to me!,reddit-posts,two or more
17,czi2621,SHARING INFORMATION,"Lastly, you *can* get sterilized under 30 and ...",reddit-posts,two or more
92,fgq2t3u,SHARING INFORMATION,They’re just trying to help you find the best ...,reddit-posts,two or more


In [63]:
final_df.to_csv(data_directory_path + '/labeling/label-discourse/majority.reddit_comments.csv')

In [64]:
maria_labels = target_df['Maria'].tolist()
leann_labels = target_df['LeAnn'].tolist()

maria_labels = ['_'.join(_label[0].split()) for _label in maria_labels]
leann_labels = ['_'.join(_label[0].split()) for _label in leann_labels]

maria_labels = ['_'.join(_label.split('-')) for _label in maria_labels]
leann_labels = ['_'.join(_label.split('-')) for _label in leann_labels]

maria_labels = ['_'.join(_label.split('(')) for _label in maria_labels]
leann_labels = ['_'.join(_label.split('(')) for _label in leann_labels]

maria_labels = ['_'.join(_label.split(')')) for _label in maria_labels]
leann_labels = ['_'.join(_label.split(')')) for _label in leann_labels]

# for _id, _maria_labels in maria_id_labels_dict.items():
#     _maria_labels = ['_'.join(_label.split()) for _label in _maria_labels]
#     _leann_labels = ['_'.join(_label.split()) for _label in leann_id_labels_dict[_id]]
#     maria_labels.append(' '.join(_maria_labels))
#     leann_labels.append(' '.join(_leann_labels))

vectorizer = CountVectorizer().fit(maria_labels)
maria_Y = vectorizer.transform(maria_labels)
leann_Y = vectorizer.transform(leann_labels)
label_names = vectorizer.get_feature_names()

print(maria_Y.shape, leann_Y.shape)

print(classification_report(maria_Y, leann_Y, target_names=label_names))

(141, 16) (141, 16)
                                          precision    recall  f1-score   support

                         meta_discussion       0.50      0.25      0.33         4
                                    none       0.00      0.00      0.00         8
               seeking_emotional_support       1.00      1.00      1.00         2
                     seeking_experiences       0.67      1.00      0.80         4
                     seeking_information       0.50      1.00      0.67         1
                          sharing_advice       0.76      0.81      0.79        16
sharing_causal_reasoning___hypothesizing       0.50      1.00      0.67         1
  sharing_describing_additional_research       0.60      1.00      0.75         3
               sharing_emotional_support       1.00      0.50      0.67         4
                    sharing_future_plans       1.00      0.60      0.75         5
                     sharing_information       0.72      0.87      0.79      

  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
# id_text_dict = {}
# for i, r in target_df.iterrows():
#     id_text_dict[r['ID']] = r['Text']
# len(id_text_dict)

In [66]:
# ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))
# len(ids_labeled_by_roz)

In [67]:
# id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
# for i, r in target_df.iterrows():
#     if r['ID'] in ids_labeled_by_roz:
#         id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
# len(id_labeler_labels_dict)

In [68]:
# id_majority_labels_dict = {}
# for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
#     _label_count_dict = defaultdict(int)
#     for _labeler, _labels in _labeler_labels_dict.items():
#         for _label in _labels:
#             _label_count_dict[_label] += 1
#     _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
#     if _majority_labels:
#         id_majority_labels_dict[_id] = _majority_labels
# len(id_majority_labels_dict)

In [69]:
# for _id, _labels in id_majority_labels_dict.items():
#     print(_id)
#     print(_labels)
#     print(id_text_dict[_id])
#     print('=============================================')

In [70]:
# final_dicts = []
# for _id, _labels in id_majority_labels_dict.items():
#     for _label in _labels:
#         final_dicts.append({'ID': _id,
#                             'Label': _label,
#                             'Text': id_text_dict[_id],
#                             'Source': 'reddit-comments',
#                             'Decision': 'two or more'})
# final_df = pd.DataFrame(final_dicts)
# final_df.to_csv(data_directory_path + '/labeling/majority.reddit_comments.csv')

<br><br>

# Get majority labels for TWITTER POSTS

In [71]:
target_df = combined_df[combined_df['Source'] == 'twitter-posts']

In [72]:
id_text_dict = {}
for i, r in target_df.iterrows():
    id_text_dict[r['ID']] = r['Text']
len(id_text_dict)

201

In [73]:
ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))
len(ids_labeled_by_roz)

0

In [298]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
len(id_labeler_labels_dict)

292

In [299]:
id_majority_labels_dict = {}
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _label_count_dict = defaultdict(int)
    for _labeler, _labels in _labeler_labels_dict.items():
        for _label in _labels:
            _label_count_dict[_label] += 1
    _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
    if _majority_labels:
        id_majority_labels_dict[_id] = _majority_labels
len(id_majority_labels_dict)

285

In [300]:
for _id, _labels in id_majority_labels_dict.items():
    print(_id)
    print(_labels)
    print(id_text_dict[_id])
    print('=============================================')

1266030580036259800
['SEEKING EMOTIONAL SUPPORT']
I want the IUD so bad but I’m scared 🥺🙄
1215895024107507700
['DISCOURSE']
i cannot believe there’s a non-hormonal oral contraceptive that’s been on the market for thirty YEARS but it’s only available in india. this world is a sick joke
1208819982840344600
['SEEKING EXPERIENCES']
Hey ladies, Do any of y’all have an IUD and use a menstrual cup?
155399192255283200
['DISCOURSE']
My cousin's wife just told me about a friend who was injured when an "IUD" exploded in Afghanistan.
392974329991790600
['DISCOURSE']
I said I have the birth control implant in my arm and Petty Officer Burdette is like "what's that have to do anything w/your vagina?!"
964617929894527000
['PROVIDING INFORMATIONAL SUPPORT']
Check out this interesting factoid from our friends at the Oral Contraception Over the Counter Coalition! #FreeThePill https://t.co/nncjs2Abe9
248648850909315070
['SEEKING INFORMATIONAL SUPPORT']
So has anyone heard about the Mirena IUD Recall???
49

In [301]:
final_dicts = []
for _id, _labels in id_majority_labels_dict.items():
    for _label in _labels:
        final_dicts.append({'ID': _id,
                            'Label': _label,
                            'Text': id_text_dict[_id],
                            'Source': 'twitter-posts',
                            'Decision': 'two or more'})
final_df = pd.DataFrame(final_dicts)
final_df.to_csv(data_directory_path + '/labeling/majority.twitter_posts.csv')

<br><br>

# Get majority labels for TWITTER COMMENTS

In [302]:
target_df = combined_df[combined_df['Source'] == 'twitter-replies']

In [303]:
id_text_dict = {}
for i, r in target_df.iterrows():
    id_text_dict[r['ID']] = r['Text']
len(id_text_dict)

502

In [304]:
ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))
len(ids_labeled_by_roz)

284

In [305]:
id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    if r['ID'] in ids_labeled_by_roz:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])
len(id_labeler_labels_dict)

284

In [306]:
id_majority_labels_dict = {}
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    _label_count_dict = defaultdict(int)
    for _labeler, _labels in _labeler_labels_dict.items():
        for _label in _labels:
            _label_count_dict[_label] += 1
    _majority_labels = [_label for _label, _count in _label_count_dict.items() if _count >= 2]
    if _majority_labels:
        id_majority_labels_dict[_id] = _majority_labels
len(id_majority_labels_dict)

283

In [307]:
for _id, _labels in id_majority_labels_dict.items():
    print(_id)
    print(_labels)
    print(id_text_dict[_id])
    print('=============================================')

1306801969118445600
['PROVIDING EXPERIENCES']
@roxiqt Had severe panic attacks after getting an IUD so I had to have it removed, was on hormonal bc for so long that I had swelling of the brain, and now I get a shot every 3 months so a man can cum in me… fuck being female
473233002987020300
['PROVIDING INFORMATIONAL SUPPORT']
@NinoDemayo plan b is hormones, usually a higher dosage than in birth control, if it's toxic after a while birth control pill wld b deadly
1160981232349528000
['PROVIDING EXPERIENCES']
@IvetteBrianna_ it’s soo convenient! i’ve been on it for a few months now. it’s called the depo shot. i didn’t want an IUD or implant and i’m too forgetful for the pills lol
664898788327206900
['DISCOURSE']
.@absmom1 I do know that. You cannot force every woman in the country to take a hormonal pill/get an IUD against her will or her family's
1223701281417695200
['PROVIDING EXPERIENCES']
Anyways, I’m coming up on my IUD’s third birthday next month and I’d like to report that I have s

In [308]:
final_dicts = []
for _id, _labels in id_majority_labels_dict.items():
    for _label in _labels:
        final_dicts.append({'ID': _id,
                            'Label': _label,
                            'Text': id_text_dict[_id],
                            'Source': 'twitter-replies',
                            'Decision': 'two or more'})
final_df = pd.DataFrame(final_dicts)
final_df.to_csv(data_directory_path + '/labeling/majority.twitter_replies.csv')

<br><br>

# Calculate agreement

In [74]:
import nltk
from nltk.metrics import agreement
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics import masi_distance

In [75]:
target_df = combined_df[combined_df['Source'] == 'reddit-posts']
target_df['Label'].value_counts()

SHARING PERSONAL EXPERIENCES                158
SEEKING INFORMATION                          58
SHARING OPINIONS AND PREFERENCES             30
NONE                                         27
SEEKING EXPERIENCES                          25
SHARING FUTURE PLANS                         24
SHARING CAUSAL REASONING - HYPOTHESIZING     22
SEEKING EMOTIONAL SUPPORT                    20
SHARING-DESCRIBING ADDITIONAL RESEARCH       19
SHARING PERSONAL BACKGROUND                  17
SEEKING ADVICE                               10
SEEKING NORMALITY                             7
SHARING INFORMATION                           5
SHARING SECONDHAND EXPERIENCES                5
META DISCUSSION                               4
SHARING NORMALITY                             1
SHARING ADVICE                                1
Name: Label, dtype: int64

In [87]:
# target_df = combined_df[combined_df['Source'] == 'reddit-posts']
target_labels = ['SHARING PERSONAL EXPERIENCES',
                 'SEEKING INFORMATION',
                 'SHARING OPINIONS AND PREFERENCES',
                 'SEEKING EXPERIENCES',
                 'SHARING FUTURE PLANS',
                 'SEEKING EMOTIONAL SUPPORT',
                 'SHARING-DESCRIBING ADDITIONAL RESEARCH',
                 'SHARING PERSONAL BACKGROUND',
                 'SHARING CAUSAL REASONING - HYPOTHESIZING ']

# ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in combined_df.iterrows():
    # if r['ID'] in ids_labeled_by_roz and r['Label'] in target_labels:
    if r['Label'] in target_labels:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.7429053628691862

In [76]:
target_df = combined_df[combined_df['Source'] == 'reddit-posts']

# ids_labeled_by_roz = list(set([r['ID'] for i, r in target_df.iterrows() if r['Labeler'] == 'Roz']))

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    # if r['ID'] in ids_labeled_by_roz and r['Label'] in target_labels:
    if r['Label'] in target_labels:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.6736417016717724

In [78]:
target_df = combined_df[combined_df['Source'] == 'reddit-comments']

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    # if r['ID'] in ids_labeled_by_roz and r['Label'] in target_labels:
    if r['Label'] in target_labels:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.7065899925681356

In [79]:
target_df = combined_df[combined_df['Source'] == 'twitter-posts']

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    # if r['ID'] in ids_labeled_by_roz and r['Label'] in target_labels:
    if r['Label'] in target_labels:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.8118498101080558

In [80]:
target_df = combined_df[combined_df['Source'] == 'twitter-replies']

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    # if r['ID'] in ids_labeled_by_roz and r['Label'] in target_labels:
    if r['Label'] in target_labels:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.7600820194589173

In [89]:
target_df = combined_df[combined_df['Source'] == 'webmd-reviews']

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    # if r['ID'] in ids_labeled_by_roz and r['Label'] in target_labels:
    if r['Label'] in target_labels:
        id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

task_data = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        task_data.append((_labeler, _id, frozenset(_labels)))

task = AnnotationTask(distance = masi_distance)
task.load_array(task_data)
task.alpha()

0.6642216279250455

In [82]:
import pingouin as pg

In [90]:
target_df = combined_df[combined_df['Source'] == 'twitter-replies']

id_labeler_labels_dict = defaultdict(lambda: defaultdict(list))
for i, r in target_df.iterrows():
    id_labeler_labels_dict[r['ID']][r['Labeler']].append(r['Label'])

labels_dicts = []
for _id, _labeler_labels_dict in id_labeler_labels_dict.items():
    for _labeler, _labels in _labeler_labels_dict.items():
        labels_dicts.append({'Labeler': _labeler,
                             'Item': _id,
                             'Labels': _labels})
labels_df = pd.DataFrame(labels_dicts)

icc = pg.intraclass_corr(data=labels_df, 
                         targets='Item', 
                         raters='Labeler',
                         ratings='Labels')

icc

  return self._try_aggregate_string_function(obj, f, *self.args, **self.kwargs)


AssertionError: Data must have at least 5 non-missing values.

In [93]:
labels_df.sample(5)

Unnamed: 0,Labeler,Item,Labels
137,Maria,1270081617768231000,[SHARING OPINIONS AND PREFERENCES]
55,Maria,697199627691425800,[SHARING PERSONAL EXPERIENCES]
114,Maria,1288319310071464000,[SHARING OPINIONS AND PREFERENCES]
100,LeAnn,1235942125847502800,[META DISCUSSION]
74,Maria,906919986790363100,[META DISCUSSION]
