In [7]:
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from statsmodels.stats import inter_rater as irr
from sklearn import preprocessing
import numpy as np

In [8]:
# Read the raw CMV annotation table from disk and display it.
df_cmv = pd.read_csv("../data/cmv_original.csv")
df_cmv

Unnamed: 0,thread_id,comment_id,sentence,a1,a2,a3,gold_label
0,t3_71l9yj,dnbz2sl,I don't know anyone who buys Apple products to...,Anecdote,Assumption,Assumption,Assumption
1,t3_71l9yj,dnbz2sl,At work I use a Dell desktop that probably cos...,Anecdote,Assumption,Continue,
2,t3_71l9yj,dnbz2sl,"At home I have a $1,500 MacBook Air with a sol...",Anecdote,Continue,Continue,Continue
3,t3_71l9yj,dnbz2sl,My MacBook is *always* faster and more reliabl...,Anecdote,,Assumption,
4,t3_71l9yj,dnbz2sl,I think both product lines probably have their...,Assumption,,,
...,...,...,...,...,...,...,...
6624,t3_5o7nm3,dchvjl7,"Fuck those people, shed them like a tear and ...",,,,
6625,t3_6ihcuk,dj6lpdi,"Basically, the fallacy is claiming that becaus...",,,,Assumption
6626,t3_6ihcuk,dj6lpdi,Classic examples are baldness and clouds.,,,,
6627,t3_6ihcuk,dj68d5b,"The good thing about choosing ""viability"" as t...",,,,Assumption


In [9]:
# Number of distinct comments in the table (missing ids are not counted).
df_cmv["comment_id"].nunique()

980

In [10]:
# Total number of rows (one row per sentence) in the table.
len(df_cmv)

6629

In [11]:
# Distribution of the original gold labels (missing labels are excluded by value_counts).
df_cmv.loc[:, "gold_label"].value_counts()

Assumption          2724
Continue            1661
None                1487
Anecdote             323
Statistics/Study      81
Definition            65
Other                 30
Testimony             28
Name: gold_label, dtype: int64

In [12]:
def update_items(df, row_name):
    """Resolve "Continue" entries of a column by carrying the previous label forward.

    Walks ``df[row_name]`` from top to bottom. Every value other than the
    string ``"Continue"`` (including missing values) becomes the current
    label; a ``"Continue"`` entry is replaced by the most recent such label.
    A ``"Continue"`` that appears before any other value has nothing to
    inherit and is therefore kept as ``"Continue"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column; it is modified in place.
    row_name : str
        Name of the column to process (despite the parameter name, this is a
        column label, not a row label).

    Returns
    -------
    None
        The result is written to a new column named ``f"{row_name}_updated"``.
    """
    # Starting from "Continue" reproduces the "leading Continue stays Continue"
    # behaviour without indexing label 0, so empty frames and frames whose
    # index does not contain 0 are handled as well.
    last_item = "Continue"
    updated_items = []
    # Iterate over the values directly: Series.iteritems() was removed in
    # pandas 2.0, while plain iteration works on every pandas version.
    for item in df[row_name]:
        if item != "Continue":
            last_item = item
        updated_items.append(last_item)

    df[f"{row_name}_updated"] = updated_items

# Add a carried-forward "<annotator>_updated" column for each of the three annotators.
for annotator_column in ("a1", "a2", "a3"):
    update_items(df_cmv, annotator_column)

In [13]:
# update gold label
def _resolve_gold_label(a1, a2, a3, original_gold):
    """Derive one label from three carried-forward annotations.

    Rules, in order:
    * all three annotations missing -> fall back to ``original_gold``, except
      that an original ``"Continue"`` becomes missing (NaN);
    * all three annotations different -> missing (no majority);
    * the majority "vote" is itself a missing annotation -> missing;
    * otherwise -> the label chosen by at least two annotators.

    Parameters
    ----------
    a1, a2, a3 : object
        Values of the ``a1_updated`` / ``a2_updated`` / ``a3_updated`` columns
        for one row (a label string or NaN).
    original_gold : object
        Value of the ``gold_label`` column for the same row.

    Returns
    -------
    object
        The resolved label string, or ``float("nan")`` when none can be chosen.
    """
    missing = float("nan")
    # Compare string forms so that a float NaN (rendered as "nan") can be
    # counted like any other vote.
    votes = [str(a1), str(a2), str(a3)]
    top_label = max(votes, key=votes.count)
    top_count = votes.count(top_label)

    if top_count == 3 and top_label == "nan":
        # No annotator data at all: trust the original gold label, but an
        # unresolved "Continue" carries no class information.
        return original_gold if original_gold != "Continue" else missing
    if top_count == 1:
        # Three different votes: no majority.
        return missing
    if top_label == "nan":
        # Two of three annotations are missing. Storing the *string* "nan"
        # here would look like a real class to every later NaN check, so the
        # row is marked as genuinely missing instead.
        return missing
    return top_label


# Build the whole column at once with object dtype. Creating a float NaN
# column first and writing strings into it cell by cell relies on silent dtype
# upcasting, which newer pandas versions deprecate.
df_cmv["gold_label_updated"] = pd.Series(
    [
        _resolve_gold_label(a1, a2, a3, gold)
        for a1, a2, a3, gold in zip(
            df_cmv["a1_updated"],
            df_cmv["a2_updated"],
            df_cmv["a3_updated"],
            df_cmv["gold_label"],
        )
    ],
    index=df_cmv.index,
    dtype=object,
)

In [14]:
# Inspect the last five rows: carried-forward annotations next to the original and the updated gold label.
df_cmv.loc[:, ["a1_updated", "a2_updated", "a3_updated", "gold_label", "gold_label_updated"]].tail(5)

Unnamed: 0,a1_updated,a2_updated,a3_updated,gold_label,gold_label_updated
6624,,,,,
6625,,,,Assumption,Assumption
6626,,,,,
6627,,,,Assumption,Assumption
6628,,,,Assumption,Assumption


In [36]:
# Count the rows for which none of the three annotators provided a label
df_cmv[["a1_updated", "a2_updated", "a3_updated"]].isna().all(axis="columns").sum()

5330

Calculating inter-annotator agreement only on samples that have annotator data (i.e. at least one annotator label is present)

In [38]:
# Boolean mask: True for every row where at least one of the three
# carried-forward annotator columns is not missing.
fully_annotated_idx = ~df_cmv[["a1_updated", "a2_updated", "a3_updated"]].isna().all(axis=1)

fully_annotated_idx.sum()

1299

In [40]:

le = preprocessing.LabelEncoder()

# Rows selected by the mask, i.e. rows where at least one carried-forward
# annotator column is present.
df_cmv_no_nan = df_cmv[fully_annotated_idx]

annotator_columns = ["a1", "a2", "a3"]

# Fit on the labels of all three annotators, flattened to 1-D. Fitting on
# "a1" alone makes transform() fail for any label that only a2 or a3 used, and
# passing a one-column DataFrame triggers sklearn's column-vector warning.
# NOTE(review): agreement is computed on the raw a1/a2/a3 columns (including
# "Continue"), not on the *_updated columns -- confirm that this is intended.
le.fit(df_cmv_no_nan[annotator_columns].to_numpy().ravel())

print(le.classes_)

# Integer codes per annotator, one 1-D array each.
a1 = le.transform(df_cmv_no_nan["a1"])
a2 = le.transform(df_cmv_no_nan["a2"])
a3 = le.transform(df_cmv_no_nan["a3"])

# Shape (n_rows, 3): one row per sentence, one column per annotator.
concat = np.vstack((a1, a2, a3)).T

# aggregate_raters returns a tuple; its first element is the table that
# fleiss_kappa consumes.
agg = irr.aggregate_raters(concat)

irr.fleiss_kappa(agg[0], method='fleiss')

['Anecdote' 'Assumption' 'Common ground' 'Continue' 'Definition' 'None'
 'Other' 'Statistics/Study' 'Testimony']


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.09191039437523739

In [15]:
def is_nan_or_none(x):
    """Tell whether ``x`` is a usable label.

    Note that the name is inverted relative to the result: the function
    returns ``True`` for values that are *not* NaN / "None", which makes it
    usable directly as a keep-mask.

    * ``str``   -> ``True`` unless the text is exactly ``"None"``
    * ``float`` -> ``True`` unless the value is NaN
    * anything else -> its truthiness (so ``None`` yields ``False``)
    """
    if isinstance(x, str):
        return x != "None"
    if isinstance(x, float):
        return not math.isnan(x)
    return bool(x)

# Keep only rows whose updated gold label is usable (neither NaN nor the "None" class).
df_cmv_filtered = df_cmv.loc[df_cmv["gold_label_updated"].map(is_nan_or_none)]

In [16]:
# Class distribution of the updated gold labels after filtering.
df_cmv_filtered.loc[:, "gold_label_updated"].value_counts()

Assumption          2861
Anecdote             370
Statistics/Study      82
Definition            66
Other                 38
Testimony             33
Common ground          1
Name: gold_label_updated, dtype: int64

In [9]:
# Original gold-label distribution again, shown right after the updated-label counts.
df_cmv.loc[:, "gold_label"].value_counts()

Assumption          2724
Continue            1661
None                1487
Anecdote             323
Statistics/Study      81
Definition            65
Other                 30
Testimony             28
Name: gold_label, dtype: int64

In [10]:
# Build the final modelling table in one chain so that every step works on a
# fresh object. Assigning columns on a frame obtained by boolean filtering
# triggers pandas' SettingWithCopyWarning.
df_cmv_distilled = (
    df_cmv_filtered
    .loc[
        # Drop the "Definition" and "Common ground" classes.
        ~df_cmv_filtered["gold_label_updated"].isin(["Definition", "Common ground"]),
        ['thread_id', 'comment_id', 'sentence', 'a1_updated', 'a2_updated', 'a3_updated', 'gold_label_updated'],
    ]
    .rename(
        columns={
            'a1_updated': 'a1',
            'a2_updated': 'a2',
            'a3_updated': 'a3',
            'gold_label_updated': 'label',
        }
    )
    .assign(
        # Lower-case the class names and shorten "statistics/study". The
        # replacement is a literal string, so regex matching is switched off
        # explicitly instead of relying on the version-dependent default.
        label=lambda frame: frame["label"]
        .str.lower()
        .str.replace("statistics/study", "statistics", regex=False)
    )
)

In [11]:
# Class distribution of the final, lower-cased labels.
df_cmv_distilled.loc[:, "label"].value_counts()

assumption    2861
anecdote       370
statistics      82
other           38
testimony       33
Name: label, dtype: int64

In [12]:
# Show all rows from index label 1290 onwards (label-based slice, all columns).
df_cmv_distilled.loc[1290:, :]

Unnamed: 0,thread_id,comment_id,sentence,a1,a2,a3,label
1293,t3_5w9qrn,de9iil1,Think about all the other things men and women...,Anecdote,Anecdote,Testimony,anecdote
1294,t3_5w9qrn,de9iil1,They used to go to different schools.,Anecdote,Anecdote,Testimony,anecdote
1298,t3_5w9qrn,de9iil1,That's how we'll one day see sex segregation i...,Assumption,Statistics/Study,Assumption,assumption
1299,t3_4x7vi8,d6dcbvj,The idea in general is whether there is an ent...,,,,assumption
1300,t3_4x7vi8,d6dcbvj,The best arguments against free will (B. F. Sk...,,,,assumption
...,...,...,...,...,...,...,...
6622,t3_5o7nm3,dchyd0p,"Last I checked, destroying infrastructure isn'...",,,,assumption
6623,t3_5o7nm3,dchvjl7,At that point you're going through a lot of an...,,,,assumption
6625,t3_6ihcuk,dj6lpdi,"Basically, the fallacy is claiming that becaus...",,,,assumption
6627,t3_6ihcuk,dj68d5b,"The good thing about choosing ""viability"" as t...",,,,assumption


In [14]:
# Hold out 40% of the rows as the test set; stratify on the label column and
# fix the seed so the split is reproducible.
train, test = train_test_split(
    df_cmv_distilled,
    test_size=0.4,
    random_state=42,
    stratify=df_cmv_distilled["label"],
)

In [15]:
# Class distribution of the training split.
train.loc[:, "label"].value_counts()

assumption    1716
anecdote       222
statistics      49
other           23
testimony       20
Name: label, dtype: int64

In [16]:
# Class distribution of the test split.
test.loc[:, "label"].value_counts()

assumption    1145
anecdote       148
statistics      33
other           15
testimony       13
Name: label, dtype: int64

In [17]:
# Write both splits to CSV without the DataFrame index, train first and then test.
for split_frame, csv_path in ((train, "../data/cmv_train.csv"), (test, "../data/cmv_test.csv")):
    split_frame.to_csv(csv_path, index=False)

In [26]:
# Draw up to two random rows per label and show them as (sentence, label) pairs.
# NOTE(review): where df_cmv_no_nan is built in this notebook (df_cmv[fully_annotated_idx])
# it has no "label" column, so on a fresh kernel this presumably raises KeyError --
# confirm which frame was intended here.
# NOTE(review): sample() is called without random_state, so the examples differ between runs.
df_cmv_no_nan.groupby('label', group_keys=False).apply(lambda x: x.sample(min(len(x), 2)))[["sentence", "label"]].values

array([['Solution: anyone should use the bathroom they are most comfortable using.',
        'anecdote'],
       ['&gt', 'anecdote'],
       ['Who chooses who is acceptable to live.', 'assumption'],
       ["Unisex toilets are not Men's rooms, so that'd be not an issue anymore.",
        'assumption'],
       ['Not a Trump fan, but to borrow his phrase "something is going on"   \n  \nThe Obama Administration has always been really good at being matter of fact and calling it what it is, except when it comes to Islamic Extremism.',
        'other'],
       ['I am as well, but there\'s a reason it\'s called "pro-choice."',
        'other'],
       ['There is no arguing with these people.', 'statistics'],
       ["It would (likely) become legally grey (if not illegal) if DNS services like Google, OpenDNS, ISP DNS, etc didn't recognize the domain after it was officially recognized by ICANN.",
        'statistics'],
       ['The fact is, as there is overwhelming evidence, the toughest, most 