In [1]:
import pandas as pd
import sklearn

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
pd.set_option("display.max_rows", None, "display.max_columns", None)
pd.options.display.max_colwidth = 10000

## Importing dataset

In [4]:
topics = ["abortion", "cloning", "death penalty", "gun control", "marijuana legalization", "minimum wage", "nuclear energy", "school uniforms"]
seeds = ["Seed1", "Seed2", "Seed3", "Seed4", "Seed5", "Seed6", "Seed7", "Seed8", "Seed9", "Seed10"]

In [5]:
def import_as_dataframe(seed, topic, size):
    if size == "large":
        input_file = f'reproduction_bertlarge/argument-classification/bert_output/{seed}/bert-{size}-topic-sentence/{topic}_test_topic/test_predictions.txt'
    elif size == "base":
        input_file = f'reproduction_bertbase/bert_output/bert_output/31May/ukp/{seed}/bert-{size}-topic-sentence/{topic}_test_topic/test_predictions.txt'
    df_current = pd.read_csv(input_file, delimiter = "\t", names=["True", "Pred", "Topic", "Claim"])
    return df_current

### Combining dataframes, bert large

#### seeds

In [6]:
seed_1_list = [import_as_dataframe('Seed1', t, 'large') for t in topics]
seed_2_list = [import_as_dataframe('Seed2', t, 'large') for t in topics]
seed_3_list = [import_as_dataframe('Seed3', t, 'large') for t in topics]
seed_4_list = [import_as_dataframe('Seed4', t, 'large') for t in topics]
seed_5_list = [import_as_dataframe('Seed5', t, 'large') for t in topics]
seed_6_list = [import_as_dataframe('Seed6', t, 'large') for t in topics]
seed_7_list = [import_as_dataframe('Seed7', t, 'large') for t in topics]
seed_8_list = [import_as_dataframe('Seed8', t, 'large') for t in topics]
seed_9_list = [import_as_dataframe('Seed9', t, 'large') for t in topics]
seed_10_list = [import_as_dataframe('Seed10', t, 'large') for t in topics]

In [7]:
df_all_seed1 = pd.concat(seed_1_list)
df_all_seed2 = pd.concat(seed_2_list)
df_all_seed3 = pd.concat(seed_3_list)
#df_all_seed4 = pd.concat(seed_4_list)
df_all_seed5 = pd.concat(seed_5_list)
df_all_seed6 = pd.concat(seed_6_list)
df_all_seed7 = pd.concat(seed_7_list)
df_all_seed8 = pd.concat(seed_8_list)
df_all_seed9 = pd.concat(seed_9_list)
df_all_seed10 = pd.concat(seed_10_list)

df_all_list = [df_all_seed1, df_all_seed2, df_all_seed3, df_all_seed5, df_all_seed6, df_all_seed7, df_all_seed8, df_all_seed9, df_all_seed10]

#### topics

In [8]:
abortion_list = [import_as_dataframe(s, 'abortion', 'large') for s in seeds]
cloning_list = [import_as_dataframe(s, 'cloning', 'large') for s in seeds]
deathpen_list  = [import_as_dataframe(s, 'death penalty', 'large') for s in seeds]
guncontrol_list  = [import_as_dataframe(s, 'gun control', 'large') for s in seeds]
marijuana_list  = [import_as_dataframe(s, 'marijuana legalization', 'large') for s in seeds]
minwage_list  = [import_as_dataframe(s, 'minimum wage', 'large') for s in seeds]
nuclear_list  = [import_as_dataframe(s, 'nuclear energy','large') for s in seeds]
schooluni_list  = [import_as_dataframe(s, 'school uniforms','large') for s in seeds]

In [9]:
df_all_abortion = pd.concat(abortion_list)
df_all_cloning = pd.concat(cloning_list)
df_all_deathpen = pd.concat(deathpen_list)
df_all_guncontrol = pd.concat(guncontrol_list)
df_all_marijuana = pd.concat(marijuana_list)
df_all_minwage = pd.concat(minwage_list)
df_all_nuclear = pd.concat(nuclear_list)

In [10]:
def add_numeric(df_list):
    for n in range(0, len(df_list)):
        df = df_list[n]
        df['True_num'] = df['True'].replace("NoArgument", 0).replace("Argument_for", 1).replace("Argument_against", 2)
        df['Pred_num'] = df['Pred'].replace("NoArgument", 0).replace("Argument_for", 1).replace("Argument_against", 2)
        df_list[n] = df
    return df_list

In [11]:
abortion_list_num = add_numeric(abortion_list)
guncontrol_list_num = add_numeric(guncontrol_list)
deathpen_list_num = add_numeric(deathpen_list)

## Analyses

In [12]:
#df['True_num'] = df['True'].replace("NoArgument", 0).replace("Argument_for", 1).replace("Argument_against", 2)

def check_frequency_error(topic_seedlist):
    print(f'number of seeds: {len(topic_seedlist)}')
    exam_score = topic_seedlist[0][['Claim', 'True', 'Pred']]
    exam_score["Error_for_against"] = 0
    exam_score["Error_for_noArg"] = 0
    
    exam_score["Error_against_for"] = 0
    exam_score["Error_against_noArg"] = 0
    
    exam_score["Error_noArg_for"] = 0
    exam_score["Error_noArg_against"] = 0
    
    ForAgainst = [1, 2]
    ForNoArg = [1, 0]
    
    AgainstFor = [2, 1]
    AgainstNoArg = [2, 0]
    
    NoArgFor = [0, 1]
    NoArgAgainst = [0, 2]
    
    for n in range(0, len(topic_seedlist)-1):
        n_examples = len(topic_seedlist[n])
        for ex in range(0, n_examples):
                        
            error_seed = list(topic_seedlist[n][["True_num", "Pred_num"]].iloc[ex])
                        
            if error_seed == ForAgainst:
                exam_score["Error_for_against"].iloc[ex] = int(exam_score["Error_for_against"].iloc[ex]) + 1
            elif error_seed == ForNoArg:
                exam_score["Error_for_noArg"].iloc[ex] = int(exam_score["Error_for_noArg"].iloc[ex]) + 1
            ###########################################################
            elif error_seed == AgainstFor:
                exam_score["Error_against_for"].iloc[ex] = int(exam_score["Error_against_for"].iloc[ex]) + 1
            elif error_seed == AgainstNoArg:
                exam_score["Error_against_noArg"].iloc[ex] = int(exam_score["Error_against_noArg"].iloc[ex]) + 1   
            #############################################################
            elif error_seed == NoArgFor:
                exam_score["Error_noArg_for"].iloc[ex] = int(exam_score["Error_noArg_for"].iloc[ex]) + 1
            elif error_seed == NoArgAgainst:
                exam_score["Error_noArg_against"].iloc[ex] = int(exam_score["Error_noArg_against"].iloc[ex]) + 1  
                         
    return exam_score
    

In [13]:
abortion_exam = check_frequency_error(abortion_list_num)
guncontrol_exam = check_frequency_error(guncontrol_list_num)
deathpen_exam = check_frequency_error(deathpen_list_num)

number of seeds: 10
number of seeds: 10
number of seeds: 10


### Find which errors occur in the test set

In [14]:
deathpen_exam[:5]

Unnamed: 0,Claim,True,Pred,Error_for_against,Error_for_noArg,Error_against_for,Error_against_noArg,Error_noArg_for,Error_noArg_against
0,"And I 've said , I would breathe a sigh of relief if either the Supreme Court or the states themselves began to eliminate the death penalty .",Argument_against,NoArgument,0,0,6,3,0,0
1,"10/31/2016 - We thank the teachers in 8,369 schools that have used our free resources in your instructional materials .",NoArgument,NoArgument,0,0,0,0,0,0
2,Our capital punishment system is unreliable .,Argument_against,Argument_against,0,0,0,0,0,0
3,"What is life like for Zacarias Moussaoui , the member of the September 11 hijacking teams who got caught a month before the attack ?",NoArgument,NoArgument,0,0,0,0,0,0
4,"Second of all , thousands of attorneys have made their personal crusade in life the stomping out of the death penalty .",Argument_against,NoArgument,0,0,0,1,0,0


### Find which are the most frequent errors over seeds per topic (10 = occurring in all seeds)

In [15]:
def find_frequent_errors(exam, error, top=10):
    exam = exam.sort_values(error,ascending = False)
    exam = exam[:top]
    #exam = exam[(exam['Error_for_against'] == max_exam)|(exam['Error_for_noArg'] == max_exam)|(exam['Error_against_for'] == max_exam)|(exam['Error_against_noArg'] == max_exam)|(exam['Error_noArg_for'] == max_exam)|(exam['Error_noArg_against'] == max_exam)]
    return exam

In [16]:
find_frequent_errors(guncontrol_exam, 'Error_for_against', top=5)

Unnamed: 0,Claim,True,Pred,Error_for_against,Error_for_noArg,Error_against_for,Error_against_noArg,Error_noArg_for,Error_noArg_against
177,Gun accidents due to improper storage or use of firearms claim the lives of hundreds of children a year .,Argument_for,Argument_against,9,0,0,0,0,0
628,"David H. Chipman , Senior Vice President of Public Safety for ShotSpotter and former Bureau of Alcohol , Tobacco , Firearms , and Explosives ( ATF ) agent , stated that a high-capacity magazine "" turns a killer into a killing machine . """,Argument_for,Argument_against,9,0,0,0,0,0
561,"Nelson Lund , JD , PhD , Professor at George Mason University School of Law , stated , "" The right to self-defense and to the means of defending oneself is a basic natural right that grows out of the right to life "" and "" many [ gun control laws ] interfere with the ability of law-abiding citizens to defend themselves against violent criminals . """,Argument_for,Argument_against,9,0,0,0,0,0
173,The group says laws requiring background checks have prevented the purchase of guns by nearly 2 million people who should not have had them .,Argument_for,Argument_against,9,0,0,0,0,0
345,"Eventually , the untreated illnesses of these individuals could cause them to break and start utilizing the weapons that they were able to obtain while pretending to be sane .",Argument_for,Argument_against,9,0,0,0,0,0


In [17]:
find_frequent_errors(abortion_exam, 'Error_against_for', top=5)

Unnamed: 0,Claim,True,Pred,Error_for_against,Error_for_noArg,Error_against_for,Error_against_noArg,Error_noArg_for,Error_noArg_against
646,"And with 1.5 million American families wanting to adopt a child , there is no such thing as an unwanted child .",Argument_against,Argument_for,0,0,8,1,0,0
640,"If abortions are unconditionally accepted by society and easily obtainable , people will use them as a regular form of birth control .",Argument_against,Argument_for,0,0,8,1,0,0
520,"I believe that every life is an individual gift from God , and that no life is disposable ... One proposal that brings Americans together is the Pain-Capable Unborn Child Protection Act which would protect unborn children beginning at 20 weeks , or five months of pregnancy , based on their ability to feel pain .",Argument_against,Argument_for,0,0,8,1,0,0
98,"Acceptance of an implantation-based definition of "" pregnancy "" would allow abortion providers to mischaracterize pills and technologies that work after conception but before implantation as "" contraception , "" making them potentially less subject to regulation and certainly more accept-able and attractive to consumers .",Argument_against,Argument_for,0,0,8,1,0,0
261,Data compiled by Polish government agencies shows a marked decrease in maternal deaths once abortion was made illegal .,Argument_against,Argument_for,0,0,8,1,0,0


In [23]:
find_frequent_errors(abortion_exam, 'Error_against_for', top=5)

Unnamed: 0,Claim,True,Pred,Error_for_against,Error_for_noArg,Error_against_for,Error_against_noArg,Error_noArg_for,Error_noArg_against
646,"And with 1.5 million American families wanting to adopt a child , there is no such thing as an unwanted child .",Argument_against,Argument_for,0,0,8,1,0,0
640,"If abortions are unconditionally accepted by society and easily obtainable , people will use them as a regular form of birth control .",Argument_against,Argument_for,0,0,8,1,0,0
520,"I believe that every life is an individual gift from God , and that no life is disposable ... One proposal that brings Americans together is the Pain-Capable Unborn Child Protection Act which would protect unborn children beginning at 20 weeks , or five months of pregnancy , based on their ability to feel pain .",Argument_against,Argument_for,0,0,8,1,0,0
98,"Acceptance of an implantation-based definition of "" pregnancy "" would allow abortion providers to mischaracterize pills and technologies that work after conception but before implantation as "" contraception , "" making them potentially less subject to regulation and certainly more accept-able and attractive to consumers .",Argument_against,Argument_for,0,0,8,1,0,0
261,Data compiled by Polish government agencies shows a marked decrease in maternal deaths once abortion was made illegal .,Argument_against,Argument_for,0,0,8,1,0,0


### Find how often the model makes at LEAST threshold = N of the same errors in a seed

In [19]:
def same_errors_in_seeds(exam, threshold=0):
    
    error_noArg_against = len(exam[exam["Error_noArg_against"] > threshold])
    error_noArg_for = len(exam[exam["Error_noArg_for"] > threshold])
    
    noArg_true = exam["True"].value_counts()["NoArgument"]
    percentage_noArg_against = error_noArg_against / (noArg_true / 100)
    percentage_noArg_for = error_noArg_for / (noArg_true / 100)
    
    #####################################################################
    
    error_against_noArg = len(exam[exam["Error_against_noArg"] > threshold])
    error_against_for = len(exam[exam["Error_against_for"] > threshold])
    
    arg_against_true = exam["True"].value_counts()["Argument_against"]
    percentage_against_noArg = error_against_noArg / (arg_against_true / 100)
    percentage_against_for = error_against_for / (arg_against_true / 100)
    
    #####################################################################
    
    error_for_noArg = len(exam[exam["Error_for_noArg"] > threshold])
    error_for_against = len(exam[exam["Error_for_against"] > threshold])
    
    arg_for_true = exam["True"].value_counts()["Argument_for"]
    percentage_for_noArg = error_for_noArg / (arg_for_true / 100)
    percentage_for_against = error_for_against / (arg_for_true / 100)
    
    print(f'{error_noArg_against} of the same noArg/Against errors are in at least {threshold}/10 seeds. This is {percentage_noArg_against} % of the {noArg_true} no argument test sentences.')
    print(f'{error_noArg_for} of the same noArg/For errors are in at least {threshold}/10 seeds. This is {percentage_noArg_for} % of the {noArg_true} no argument test sentences')
    print()
    print(f'{error_against_noArg } of the same against/noArg errors are in at least {threshold}/10 seeds. This is {percentage_against_noArg} % of the {arg_against_true} against test sentences')
    print(f'{error_against_for} of the same against/for errors are in at least {threshold}/10 seeds. This is {percentage_against_for} % of the {arg_against_true} against test sentences')
    print()
    print(f'{error_for_noArg} of the same for/noArg errors are in at least {threshold}/10 seeds. This is {percentage_for_noArg} % of the {arg_for_true} For test sentences')
    print(f'{error_for_against} of the same for/against errors are in at least {threshold}/10 seeds. This is {percentage_for_against} % of the {arg_for_true} For test sentences')
    

In [20]:
same_errors_in_seeds(abortion_exam, threshold=5)

22 of the same noArg/Against errors are in at least 5/10 seeds. This is 5.045871559633027 % of the 436 no argument test sentences.
15 of the same noArg/For errors are in at least 5/10 seeds. This is 3.440366972477064 % of the 436 no argument test sentences

54 of the same against/noArg errors are in at least 5/10 seeds. This is 36.486486486486484 % of the 148 against test sentences
15 of the same against/for errors are in at least 5/10 seeds. This is 10.135135135135135 % of the 148 against test sentences

35 of the same for/noArg errors are in at least 5/10 seeds. This is 27.559055118110237 % of the 127 For test sentences
34 of the same for/against errors are in at least 5/10 seeds. This is 26.771653543307085 % of the 127 For test sentences


In [21]:
same_errors_in_seeds(guncontrol_exam, threshold=8)

16 of the same noArg/Against errors are in at least 8/10 seeds. This is 4.2328042328042335 % of the 378 no argument test sentences.
0 of the same noArg/For errors are in at least 8/10 seeds. This is 0.0 % of the 378 no argument test sentences

19 of the same against/noArg errors are in at least 8/10 seeds. This is 14.285714285714285 % of the 133 against test sentences
0 of the same against/for errors are in at least 8/10 seeds. This is 0.0 % of the 133 against test sentences

18 of the same for/noArg errors are in at least 8/10 seeds. This is 11.39240506329114 % of the 158 For test sentences
44 of the same for/against errors are in at least 8/10 seeds. This is 27.848101265822784 % of the 158 For test sentences


In [22]:
same_errors_in_seeds(deathpen_exam, threshold=5)

38 of the same noArg/Against errors are in at least 5/10 seeds. This is 9.718670076726342 % of the 391 no argument test sentences.
33 of the same noArg/For errors are in at least 5/10 seeds. This is 8.439897698209718 % of the 391 no argument test sentences

40 of the same against/noArg errors are in at least 5/10 seeds. This is 17.54385964912281 % of the 228 against test sentences
44 of the same against/for errors are in at least 5/10 seeds. This is 19.29824561403509 % of the 228 against test sentences

13 of the same for/noArg errors are in at least 5/10 seeds. This is 12.62135922330097 % of the 103 For test sentences
21 of the same for/against errors are in at least 5/10 seeds. This is 20.388349514563107 % of the 103 For test sentences
