In [1]:
import pandas as pd
import numpy as np
import krippendorff

In [2]:
PATH = '../data/data-labelling/annotated-data/'
andy_df = pd.read_csv(PATH+'Andy.csv')
preethi_df = pd.read_csv(PATH+'Preethi.csv')
ravihari_df = pd.read_csv(PATH+'Ravihari.csv')

masked_data = pd.read_csv('../data/data-labelling/masked-data-for-labelling.csv')
model_data = pd.read_csv('../data/data-labelling/masked-data-with-labels.csv')

In [3]:
andy_df.columns

Index(['id', 'created_at', 'conversation_id', 'tweet', 'username', 'Sentiment',
       'Unnamed: 6'],
      dtype='object')

In [4]:
masked_data['Sentiment(Andy)'] = andy_df['Sentiment']
masked_data['Sentiment(Preethi)'] = preethi_df['Sentiment']
masked_data['Sentiment(Ravihari)'] = ravihari_df['Sentiment']

In [5]:
masked_data.head()

Unnamed: 0,id,created_at,conversation_id,tweet,username,Sentiment(Andy),Sentiment(Preethi),Sentiment(Ravihari)
0,1443684606004318218,2021-09-30 21:10:07+00:00,1443684606004318218,#COVID19 cases are decreasing in the U.S. but ...,CDC,POSITIVE,POSITIVE,NEGATIVE
1,1438928262080696324,2021-09-17 18:10:06+00:00,1438928262080696324,Many hospitals are overwhelmed with #COVID19 p...,CDC,NEGATIVE,NEGATIVE,NEGATIVE
2,1433497474791006208,2021-09-02 18:30:05+00:00,1433497474791006208,In addition to known risk factors for severe #...,CDC,NEUTRAL,NEGATIVE,NEGATIVE
3,1428386373166374912,2021-08-19 16:00:24+00:00,1428386373166374912,Join CDC’s #COVID19 Partner Update Call Monday...,CDC,NEUTRAL,NEUTRAL,NEUTRAL
4,1422649478365040640,2021-08-03 20:04:01+00:00,1422649478365040640,Teachers &amp; therapists: #COVID19 materials ...,CDC,NEUTRAL,POSITIVE,POSITIVE


In [6]:
# Get the most frequent Sentiment label from three annotators
sentiment = masked_data[['Sentiment(Andy)','Sentiment(Preethi)','Sentiment(Ravihari)']].mode(axis=1)
masked_data['Sentiment'] = sentiment[0].values

masked_data.to_csv('../data/data-labelling/annotated-data.csv', index=False)

In [7]:
# Calculate inter-annotator agreement 

# Convert strings to labels
sentiments = {'POSITIVE':0, 'NEGATIVE':1, 'NEUTRAL':2}

masked_data['Sentiment1'] = masked_data['Sentiment(Andy)'].apply(lambda x:sentiments[x])
masked_data['Sentiment2'] = masked_data['Sentiment(Preethi)'].apply(lambda x:sentiments[x])
masked_data['Sentiment3'] = masked_data['Sentiment(Ravihari)'].apply(lambda x:sentiments[x])

sentiment_df = masked_data[['Sentiment1','Sentiment2','Sentiment3']]

# Use Kripendorff's alpha
k_alpha = krippendorff.alpha(reliability_data=sentiment_df)
print('Krippendorff\'s alpha:', k_alpha)

Krippendorff's alpha: 0.18431346501637658


In [8]:
# Get the tweets with no-conflict labels
no_conflict_df = pd.DataFrame(columns=masked_data.columns)

# Check the number of unique labels in three columns and select rows with less than equal to 2 unique labels
no_conflict_series = masked_data[['Sentiment(Andy)','Sentiment(Preethi)','Sentiment(Ravihari)']].nunique(axis=1)
no_conflict_indices = []
for index, value in no_conflict_series.items():
    if value <= 2:
        no_conflict_indices.append(index)

no_conflict_df = masked_data[masked_data.index.isin(no_conflict_indices)]

# Get the most frequent Sentiment label from three annotators
sentiment = no_conflict_df[['Sentiment(Andy)','Sentiment(Preethi)','Sentiment(Ravihari)']].mode(axis=1)
no_conflict_df['Sentiment'] = sentiment[0].values

no_conflict_df.to_csv('../data/data-labelling/no-conflict-data.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_conflict_df['Sentiment'] = sentiment[0].values


In [9]:
# Calculate inter-annotator agreement 

# Convert strings to labels
sentiments = {'POSITIVE':0, 'NEGATIVE':1, 'NEUTRAL':2}

no_conflict_df['Sentiment1'] = no_conflict_df['Sentiment(Andy)'].apply(lambda x:sentiments[x])
no_conflict_df['Sentiment2'] = no_conflict_df['Sentiment(Preethi)'].apply(lambda x:sentiments[x])
no_conflict_df['Sentiment3'] = no_conflict_df['Sentiment(Ravihari)'].apply(lambda x:sentiments[x])

sentiment_df = no_conflict_df[['Sentiment1','Sentiment2','Sentiment3']]

# Use Kripendorff's alpha
k_alpha = krippendorff.alpha(reliability_data=sentiment_df)
print('Krippendorff\'s alpha:', k_alpha)

Krippendorff's alpha: 0.1775086267476066


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_conflict_df['Sentiment1'] = no_conflict_df['Sentiment(Andy)'].apply(lambda x:sentiments[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_conflict_df['Sentiment2'] = no_conflict_df['Sentiment(Preethi)'].apply(lambda x:sentiments[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_conflict_

In [11]:
# Calculate similarity in human annotation & model labels
masked_data['sentiment_predicted'] = model_data['sentiment']
masked_data['sentiment_predicted'] = masked_data['sentiment_predicted'].replace({'positive':'POSITIVE', 'neutral':'NEUTRAL', 'negative':'NEGATIVE'})

correct_sentiment_label = 0

for index, row in masked_data.iterrows():
    if row['Sentiment'] == row['sentiment_predicted']:
        correct_sentiment_label += 1
        
print('Similarity in human & model labels:',correct_sentiment_label/3000)

0.6916666666666667
