In [1]:
import os
import pandas as pd
from pathlib import Path
from sklearn.metrics import cohen_kappa_score
from itertools import combinations

In [2]:
# Absolute locations of the nyt-topic (folder_1) and change.org-topic
# (folder_2) dataset folders; both live under the same base directory.
folder_1, folder_2 = (
    '/Users/nitheeshvarma/Desktop/Nitheesh_Varma_2/' + dataset_dir
    for dataset_dir in ('nyt_topic', 'change.org_topic')
)

In [3]:
for file_Name in os.listdir(folder_1):
    if file_Name not in ["lockdowns","masking_and_distancing","vaccination"]:
        file_Path = folder_1 + '/' + file_Name
        %run script.py --infile $file_Path  

In [4]:
for file_Name in os.listdir(folder_2):
        if file_Name not in ["lockdowns","masking_and_distancing","vaccination"]:
            file_Path = folder_2 + '/' + file_Name
            %run script.py --infile $file_Path 

In [5]:
def pre_processing(file):
    """Load an annotation CSV and normalise its annotator columns to strings.

    Every column after the first is converted to ``str``. Missing cells are
    replaced with the label "missing" *before* that conversion: converting
    first can turn NaN into the literal text "nan", which a later ``fillna``
    no longer recognises as missing.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame with no null cells; columns after the first hold
        string values, with "missing" standing in for absent annotations.
    """
    df = pd.read_csv(file)
    for column in df.columns[1:]:
        values = df[column]
        # Go through object dtype so the string placeholder can be inserted
        # regardless of the dtype pandas inferred for the column.
        df[column] = (
            values.astype(object).where(values.notna(), "missing").astype(str)
        )
    # The first column is not stringified, but its gaps are still filled.
    df = df.fillna("missing")
    return df

In [6]:
def Part_1(df, threshold=0.2):
    """Score annotators by mean pairwise Cohen's kappa and drop unreliable ones.

    Every column after the first is treated as one annotator. An annotator's
    score is the sum of their kappa with each other annotator divided by the
    number of other annotators.

    Args:
        df: DataFrame whose first column is kept as-is and whose remaining
            columns each hold one annotator's labels.
        threshold: Annotators whose average kappa is strictly below this
            value are removed. Defaults to 0.2.

    Returns:
        Tuple ``(df, kappa_scores, x)``:
            df: the frame without the removed annotator columns.
            kappa_scores: dict mapping every original annotator column
                (including removed ones) to its average kappa.
            x: list of the annotator columns that remain.
    """
    annotators = list(df.columns[1:])

    # Score each pair once and credit it to both members instead of
    # recomputing the same kappa from each annotator's point of view.
    totals = {name: 0 for name in annotators}
    for first, second in combinations(annotators, 2):
        kappa = cohen_kappa_score(df[first], df[second])
        totals[first] += kappa
        totals[second] += kappa

    peers = len(annotators) - 1
    if peers > 0:
        kappa_scores = {name: total / peers for name, total in totals.items()}
    else:
        # A lone annotator has nobody to agree with, so the score is
        # undefined. NaN compares False against the threshold, which keeps
        # the annotator instead of dividing by zero.
        kappa_scores = {name: float("nan") for name in annotators}

    unreliable = [name for name, score in kappa_scores.items() if score < threshold]
    df = df.drop(columns=unreliable)
    x = list(df.columns[1:])
    return df, kappa_scores, x

In [7]:
def Part_2(df, kappa_scores, x):
    """Collapse the annotator columns into one consensus ``label`` column.

    For each row the votes "True", "False" and "missing" are counted over
    the annotator columns in ``x``. The label with the most votes wins. Only
    labels tied for the highest count enter the tie-break, which picks the
    one whose voters have the higher average kappa; on equal averages the
    later label in the order True, False, missing is chosen. A row without
    any recognised vote (including the case of an empty ``x``) is labelled
    "missing".

    Args:
        df: DataFrame containing the annotator columns named in ``x``.
        kappa_scores: dict mapping each annotator column to its average kappa.
        x: list of annotator column names to aggregate.

    Returns:
        A new DataFrame without the columns in ``x`` and with a ``label``
        column holding the consensus label of every row. ``df`` itself is
        not modified.
    """
    candidates = ("True", "False", "missing")
    resolved = []

    for votes in df[x].to_numpy().tolist():
        counts = dict.fromkeys(candidates, 0)
        kappa_sums = dict.fromkeys(candidates, 0)
        for annotator, vote in zip(x, votes):
            if vote in counts:
                counts[vote] += 1
                kappa_sums[vote] += kappa_scores[annotator]

        top = max(counts.values())
        winner = "missing"  # fallback when nobody cast a recognised vote
        if top > 0:
            winner = None
            winner_kappa = None
            for candidate in candidates:
                # Labels below the top count never take part in a tie-break,
                # so a tie among minority labels cannot beat the majority.
                if counts[candidate] != top:
                    continue
                mean_kappa = kappa_sums[candidate] / counts[candidate]
                # The earlier label is kept only when strictly better.
                if winner is None or not (winner_kappa > mean_kappa):
                    winner, winner_kappa = candidate, mean_kappa
        resolved.append(winner)

    result = df.drop(x, axis=1)
    result["label"] = resolved
    return result

In [8]:
def process(file):
    """Run the full labelling pipeline on one CSV and write the result back.

    The file is loaded with pre_processing, filtered with Part_1, reduced to
    a consensus label with Part_2, and then saved to the same path, so the
    original contents of ``file`` are overwritten.

    Args:
        file: Path of the CSV to read and overwrite.
    """
    cleaned = pre_processing(file)
    filtered, scores, annotators = Part_1(cleaned)
    Part_2(filtered, scores, annotators).to_csv(file, index=False)

In [9]:
# Label, in place, every CSV in the nyt-topic folder whose name mentions one
# of the three topics.
for file in os.listdir(folder_1):
    if not file.endswith(".csv"):
        continue
    if not any(topic in file for topic in ("lockdowns", "masking_and_distancing", "vaccination")):
        continue
    full_path = "/".join([folder_1, file])
    process(full_path)

In [10]:
# Label, in place, every CSV in the change.org-topic folder whose name
# mentions one of the three topics.
for file in os.listdir(folder_2):
    if not file.endswith(".csv"):
        continue
    if not any(topic in file for topic in ("lockdowns", "masking_and_distancing", "vaccination")):
        continue
    full_path = "/".join([folder_2, file])
    process(full_path)

In [11]:
# Merge all lockdowns CSV files of the nyt-topic dataset into nyt_lockdowns.csv.
file_name = folder_1 + '/' + "nyt_lockdowns.csv"
dataframes = []
# The merged output lives in the scanned folder and its name also contains
# "lockdowns", so it is skipped explicitly; otherwise re-running this cell
# would append the previous result to itself. Sorting makes row order stable.
for file in sorted(os.listdir(folder_1)):
    if file.endswith(".csv") and "lockdowns" in file and file != "nyt_lockdowns.csv":
        full_path = folder_1 + '/' + file
        dataframes.append(pd.read_csv(full_path))

result = pd.concat(dataframes)
result.to_csv(file_name, index=False)

In [12]:
# Merge all masking_and_distancing CSV files of the nyt-topic dataset into
# nyt_masking_and_distancing.csv.
file_name = folder_1 + '/' + "nyt_masking_and_distancing.csv"
dataframes = []
# The merged output lives in the scanned folder and its name also contains
# "masking_and_distancing", so it is skipped explicitly; otherwise re-running
# this cell would append the previous result to itself. Sorting makes row
# order stable.
for file in sorted(os.listdir(folder_1)):
    if (
        file.endswith(".csv")
        and "masking_and_distancing" in file
        and file != "nyt_masking_and_distancing.csv"
    ):
        full_path = folder_1 + '/' + file
        dataframes.append(pd.read_csv(full_path))

result = pd.concat(dataframes)
result.to_csv(file_name, index=False)

In [13]:
# Merge all vaccination CSV files of the nyt-topic dataset into nyt_vaccination.csv.
file_name = folder_1 + '/' + "nyt_vaccination.csv"
dataframes = []
# The merged output lives in the scanned folder and its name also contains
# "vaccination", so it is skipped explicitly; otherwise re-running this cell
# would append the previous result to itself. Sorting makes row order stable.
for file in sorted(os.listdir(folder_1)):
    if file.endswith(".csv") and "vaccination" in file and file != "nyt_vaccination.csv":
        full_path = folder_1 + '/' + file
        dataframes.append(pd.read_csv(full_path))

result = pd.concat(dataframes)
result.to_csv(file_name, index=False)

In [14]:
# Merge all lockdowns CSV files of the change.org dataset into
# change.org_lockdowns.csv.
file_name = folder_2 + '/' + "change.org_lockdowns.csv"
dataframes = []
# The merged output lives in the scanned folder and its name also contains
# "lockdowns", so it is skipped explicitly; otherwise re-running this cell
# would append the previous result to itself. Sorting makes row order stable.
for file in sorted(os.listdir(folder_2)):
    if file.endswith(".csv") and "lockdowns" in file and file != "change.org_lockdowns.csv":
        full_path = folder_2 + '/' + file
        dataframes.append(pd.read_csv(full_path))

result = pd.concat(dataframes)
result.to_csv(file_name, index=False)

In [15]:
# Merge all masking_and_distancing CSV files of the change.org dataset into
# change.org_masking_and_distancing.csv.
file_name = folder_2 + '/' + "change.org_masking_and_distancing.csv"
dataframes = []
# The merged output lives in the scanned folder and its name also contains
# "masking_and_distancing", so it is skipped explicitly; otherwise re-running
# this cell would append the previous result to itself. Sorting makes row
# order stable.
for file in sorted(os.listdir(folder_2)):
    if (
        file.endswith(".csv")
        and "masking_and_distancing" in file
        and file != "change.org_masking_and_distancing.csv"
    ):
        full_path = folder_2 + '/' + file
        dataframes.append(pd.read_csv(full_path))

result = pd.concat(dataframes)
result.to_csv(file_name, index=False)

In [16]:
# Merge all vaccination CSV files of the change.org dataset into
# change.org_vaccination.csv.
file_name = folder_2 + '/' + "change.org_vaccination.csv"
dataframes = []
# The merged output lives in the scanned folder and its name also contains
# "vaccination", so it is skipped explicitly; otherwise re-running this cell
# would append the previous result to itself. Sorting makes row order stable.
for file in sorted(os.listdir(folder_2)):
    if file.endswith(".csv") and "vaccination" in file and file != "change.org_vaccination.csv":
        full_path = folder_2 + '/' + file
        dataframes.append(pd.read_csv(full_path))

result = pd.concat(dataframes)
result.to_csv(file_name, index=False)