# Building ANCOR Dataset

In [None]:
from french_crs.ancor2dataset import dataset_builder

# Do-able with two nested loops
# for method in ["balanced", "representative", "window"]:
#     for subcorpus in [["INDIRECTE"],["DIRECTE"],["ANAPHORE"]]:

# Keyword arguments forwarded unchanged to dataset_builder(**dataset_params).
dataset_params = {
    # Three strategies are available: (1) "balanced", (2) "representative" and (3) "window"
    "strategy": "balanced",
    "ancor_corpus_path": "../DISTRIB_ANCOR/",
    # All possibilities are: ["corpus_OTG","corpus_UBS","corpus_ESLO" ,"corpus_ESLO_CO2"]
    "sub_corpus_filter": ["corpus_OTG", "corpus_UBS", "corpus_ESLO", "corpus_ESLO_CO2"],
    # All possibilities are: ["DIRECTE", "INDIRECTE", "ANAPHORE"]
    "coreference_type_filter": ["DIRECTE"],
    "dataset_output_folder": "../datasets/",
    # This will be used only in window strategy
    "window_size": 5,
}

# Build the per-file datasets first, then merge them; the original files are
# kept after the merge (delete_original_after_merge=False).
dataset = dataset_builder(**dataset_params)
dataset.build_dataset(file_analysis_alert=True)
dataset.merge_dataset(delete_original_after_merge=False)

# Dataset Splitting

In [None]:
# Two ways of splitting: (1) the commented-out code below, which splits by file
#                            using explicit rate parameters;
#                        (2) the active code, which splits according to the
#                            JSON config file passed explicitly.

from french_crs.model_training import dataset_splitter

# ds1=dataset_splitter("../datasets/balanced_Full/balanced_Full_Merged.xlsx",
#                     "../datasets/balanced_Full/balanced_Full_Train.xlsx",
#                     "../datasets/balanced_Full/balanced_Full__Test.xlsx",
#                       split_config_json="../datasets/split_config.json")

# ds1.dataset_splitter_by_file(lower_rate=0.20,upper_rate=0.21, files_num=30)


# The Merged, Train and Test workbook paths of the window/INDIRECTE dataset,
# in the positional order the splitter is called with.
split_paths = [
    "../datasets/window_INDIRECTE/window_INDIRECTE_" + suffix + ".xlsx"
    for suffix in ("Merged", "Train", "Test")
]

ds2 = dataset_splitter(
    *split_paths,
    split_config_json="../datasets/split_config.json",
)

# Perform the split as described by the JSON config and keep the result.
dict_files = ds2.dataset_splitter_by_json_config()


# Model Training

In [None]:
from french_crs.model_training import model_trainer

"""
12 possibilities for "train" and "test" variables

balanced_ANAPHORE
balanced_DIRECTE
balanced_Full
balanced_INDIRECTE
representative_ANAPHORE
representative_DIRECTE
representative_Full
representative_INDIRECTE
window_ANAPHORE
window_DIRECTE
window_Full
window_INDIRECTE
"""

# Dataset names used for training and for testing (see the list above).
train = "balanced_DIRECTE"
test = "balanced_DIRECTE"

# Positional arguments: train workbook, test workbook, prediction output
# workbook (written next to the *train* dataset), then "IS_CO_REF" twice
# (presumably the label column of each set -- confirm against model_trainer).
model = model_trainer(
    f"../datasets/{train}/{train}_Train.xlsx",
    f"../datasets/{test}/{test}_Test.xlsx",
    f"../datasets/{train}/{train}_Test_Pred.xlsx",
    "IS_CO_REF",
    "IS_CO_REF",
)

# Columns listed in the trainer's drop list.
model.columns_drop_list = [
    'm1_DEF',
    'm2_DEF',
    'ID_DEF',
    'DISTANCE_MENTION',
    'DISTANCE_WORD',
    'DISTANCE_CHAR',
]

model.convert_columns_to_numeric()

# Summary of the run that is about to start.
print("Model_Name :", f"Model_ANCOR_{train}.model\n")
print("Train Dataset :", f"{train}_Train.xlsx")
print("Test  Dataset :", f"{test}_Test.xlsx")

performance = model.train_model_random_forest(
    model_name=f"../pre-trained language models/Model_ANCOR_{train}.model",
    max_depth=10,
    random_state=0,
    n_estimators=250,
)
# Bare expression so the notebook displays the returned performance object.
performance

# SCORCH Chains Building

In [None]:
from french_crs.pairs2chains import chains_builder

# The prediction workbook written by the training cell; it is passed as both
# the gold file and the model file, with a different column read from each.
pred_path = f"../datasets/{train}/{train}_Test_Pred.xlsx"

model_chains = chains_builder(
    path_gold_file=pred_path,
    path_model_file=pred_path,
    gold_column="IS_CO_REF",
    model_column="Prediction",
    scorch_output_path="../",
    threshold=0.5,
)

# Emit the gold and predicted chain JSON files consumed by the scorch cell.
model_chains.generate_gold_model_json_output(mode="train")

# SCORCH Outcome

In [None]:
import os
# Run scorch on the gold/predicted chain files; the shell redirection writes
# its report to ../mm.txt.
bashCommand = "scorch ../coref_chains_gold.json ../coref_chains_pred.json > ../mm.txt"
os.system(bashCommand)

# Read the report back; the context manager guarantees the file handle is
# closed (the previous version never closed it).
with open("../mm.txt", 'r') as f:
    message = f.read()
print(message)

# Putting All Chains Together

In [None]:
import docx
import os
from french_crs.model_training import model_trainer
from french_crs.pairs2chains import chains_builder

# Word report collecting the results of every (train, test) run below,
# one page per run.
mydoc = docx.Document()

# Set the font of the document's 'Normal' style.
style = mydoc.styles['Normal']
font = style.font
font.name = 'MS Gothic'
font.size = docx.shared.Pt(10)


# Names of all twelve datasets. This list used to be assigned to
# train_test_list and was immediately overwritten by the pair list below
# (dead assignment); it is kept under its own name for the optional full
# cross-product loop shown in the comment further down.
dataset_names = [
    "balanced_ANAPHORE",
    "balanced_DIRECTE",
    "balanced_Full",
    "balanced_INDIRECTE",
    "representative_ANAPHORE",
    "representative_DIRECTE",
    "representative_Full",
    "representative_INDIRECTE",
    "window_ANAPHORE",
    "window_DIRECTE",
    "window_Full",
    "window_INDIRECTE",
]

# [train, test] pairs that are actually evaluated: every training dataset is
# tested against the "window" dataset of the same coreference type.
train_test_list = [
    ["balanced_ANAPHORE", "window_ANAPHORE"],
    ["balanced_DIRECTE", "window_DIRECTE"],
    ["balanced_Full", "window_Full"],
    ["balanced_INDIRECTE", "window_INDIRECTE"],
    ["representative_ANAPHORE", "window_ANAPHORE"],
    ["representative_DIRECTE", "window_DIRECTE"],
    ["representative_Full", "window_Full"],
    ["representative_INDIRECTE", "window_INDIRECTE"],
    ["window_ANAPHORE", "window_ANAPHORE"],
    ["window_DIRECTE", "window_DIRECTE"],
    ["window_Full", "window_Full"],
    ["window_INDIRECTE", "window_INDIRECTE"],
]

# Alternative: evaluate the full cross product instead of the pairs above.
# for train in dataset_names:
#     for test in dataset_names:

# counter is the 1-based index of the current run; it is printed as a
# progress indicator at the end of each iteration.
for counter, (train, test) in enumerate(train_test_list, start=1):

    # Predictions are written next to the *train* dataset and read back by
    # chains_builder below.
    pred_file = "../datasets/" + train + "/" + train + "_Test_Pred.xlsx"

    model = model_trainer(
        "../datasets/" + train + "/" + train + "_Train.xlsx",
        "../datasets/" + test + "/" + test + "_Test.xlsx",
        pred_file,
        "IS_CO_REF",
        "IS_CO_REF",
    )

    model.columns_drop_list = ['m1_DEF', 'm2_DEF', 'ID_DEF',
                               'DISTANCE_MENTION', 'DISTANCE_WORD',
                               'DISTANCE_CHAR']

    model.convert_columns_to_numeric()
    performance = model.train_model_random_forest(
        model_name="../pre-trained language models/Model_ANCOR_" + train + ".model",
        max_depth=10,
        random_state=0,
        n_estimators=250,
    )

    # The same prediction workbook serves as gold and model input; a
    # different column is read from it for each.
    model_chains = chains_builder(
        path_gold_file=pred_file,
        path_model_file=pred_file,
        gold_column="IS_CO_REF",
        model_column="Prediction",
        scorch_output_path="../",
        threshold=0.5,
    )

    model_chains.generate_gold_model_json_output(mode="train")

    # Run scorch on the generated chain files; the shell redirection stores
    # its report in ../mehdi.txt, which is then read back.
    bashCommand = "scorch ../coref_chains_gold.json ../coref_chains_pred.json > ../mehdi.txt"
    os.system(bashCommand)
    # Context manager closes the file even if reading raises.
    with open("../mehdi.txt", 'r') as f:
        message = f.read()

    # One report page per run: identification, training performance, then
    # the scorch report.
    mydoc.add_paragraph("Model_Name : " + "Model_ANCOR_" + train + ".model")
    mydoc.add_paragraph("Train Dataset : " + train + "_Train.xlsx")
    mydoc.add_paragraph("Test  Dataset : " + test + "_Test.xlsx")
    mydoc.add_paragraph("\n")
    mydoc.add_paragraph(str(performance))
    mydoc.add_paragraph("\n")
    mydoc.add_paragraph(message)
    mydoc.add_page_break()

    print(counter)


mydoc.save("../pre-trained language models/Performance Analysis.docx")

#  Model Testing

In [None]:
from model_training import model_tester

# Keyword arguments for model_tester: a saved model, the workbook to score,
# the workbook to write, the gold and outcome column names, and the threshold.
# NOTE(review): this cell imports from `model_training` and uses ./Models and
# ./Datasets paths, unlike the `french_crs.model_training` import and the
# ../datasets paths used by the other cells -- confirm the intended working
# directory before running.
model_parameter = dict(
    model_name="./Models/Random_Forest_(Normal)_OTG_Neg_90_Pos_10.model",
    input_file="./Datasets/corpus_ALL_Window_30_Test.xlsx",
    output_file="./Datasets/corpus_ALL_Window_30_Test_Called_Seperately.xlsx",
    column_gold="IS_CO_REF",
    column_outcome="Prediction",
    threshold=0.5,
)

# Apply the saved model to the input workbook.
model = model_tester(**model_parameter)
model.apply_model_to_dataset()