# Import

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support


In [2]:
import warnings
warnings.filterwarnings('ignore')

# Utils

In [3]:
def calculate_metrics(TN, FP, FN, TP, f_beta=1):
    Acc = (TP+TN)/(TN+FP+FN+TP)
    P = TP/(TP+FP)
    R = TP/(TP+FN)
    F = (1+pow(f_beta,2))*(P*R)/((pow(f_beta,2))*P+R)
    TNR = TN/(TN+FN)
    return Acc, P, R, F, TNR


def train_test_split_idxs(correct_col, test_ratio=0.25, balanced=True):
    label_0 = list(np.argwhere(correct_col==0)[:,0])
    label_1 = list(np.argwhere(correct_col==1)[:,0])

    if balanced:
        sample_size = min(len(label_0), len(label_1))
        label_0 = random.sample(label_0, sample_size)
        label_1 = random.sample(label_1, sample_size)
    
    test_idxs_0 = random.sample(label_0, int(test_ratio*len(label_0)))
    test_idxs_1 = random.sample(label_1, int(test_ratio*len(label_1)))
    train_idxs_0 = list(set(label_0)-set(test_idxs_0))
    train_idxs_1 = list(set(label_1)-set(test_idxs_1))

    test_idxs = test_idxs_0 + test_idxs_1
    train_idxs = train_idxs_0 + train_idxs_1

    random.shuffle(test_idxs)
    random.shuffle(train_idxs)

    # print(f"tr_0: {len(train_idxs_0)}", end=" ")
    # print(f"tr_1: {len(train_idxs_1)}", end=" ")
    # print(f"ts_0: {len(test_idxs_0)}", end=" ")
    # print(f"ts_1: {len(test_idxs_1)}", end=" ")
    # print()

    # train_idxs, test_idxs = train_test_split(labels_idxs, test_size=int(test_ratio*len(labels_idxs)))
    # test_idxs = random.sample(labels_idxs, int(test_ratio*len(labels_idxs)))
    # train_idxs = list(set(labels_idxs)-set(test_idxs))

    return train_idxs, test_idxs

# Define variables

In [4]:
model_names = ["Vilt", "Blip_large", "GiT_base", "GiT_large"] # skip , "Blip_base"
model_type = "VQA"
D_type = "1"

full_df_columns = ["feature", "model_name", "classifier", "TP", "TN", "FP", "FN", "Acc", "P", "R", "F", "TNR"]

numeric_cols = ["used_language", "specificity", "question_length", "complexity", "image_relatedness", "image_difficulty", "difficulty", \
                "no_tokens", "correct", "N", "Prob", "P_T_1", 'P_T_2_N']
numeric_cols_dtype = dict()
for c in numeric_cols: numeric_cols_dtype[c]=float
numeric_cols_dtype

{'used_language': float,
 'specificity': float,
 'question_length': float,
 'complexity': float,
 'image_relatedness': float,
 'image_difficulty': float,
 'difficulty': float,
 'no_tokens': float,
 'correct': float,
 'N': float,
 'Prob': float,
 'P_T_1': float,
 'P_T_2_N': float}

# Read full results df

In [5]:
full_results_df = pd.read_excel(f"./{model_type}_full_results_D_type_{D_type}.xlsx", sheet_name=f"{model_type}_D_type_{D_type}_results",
                                dtype=numeric_cols_dtype)

full_results_df.head()

Unnamed: 0,ID,image_name,example_question,question,answer_gt,used_language,specificity,question_length,complexity,image_relatedness,...,P_T_2_N,x_max_str,x_min_str,Prob_str,T_1_max_str,T_1_str,P_T_1_str,T_2_max_N_str,T_2_N_str,P_T_2_N_str
0,0,Gaussian_5_2,what is in the image?,what is in the image?,random+noise+nothing+t know+not sure+unknown+c...,0.0,1.0,0.0,0.0,0.0,...,0.035396,-2.3610375,-16.713715,0.07764137,1.7832804974941396,1.3263668739558916,0.033901606,1.738079,1.307533,0.035396315
1,1,Gaussian_5_2,what is the dominant color of the image?,what is the dominant color of the image?,color+gray+grey+nothing+t know+not sure+unknow...,0.0,1.0,0.0,0.0,0.0,...,0.16671,-0.49326575,-27.210875,0.20900321,3.319589136322892,1.966495473467872,0.08871711,1.738079,1.307533,0.16670989
2,2,Gaussian_5_2,what does the image represent?,what does the image represent?,random+noise+nothing+t know+not sure+unknown+c...,0.0,1.0,0.0,0.0,0.0,...,0.039332,-2.180196,-17.993324,0.08220834,1.964737514053651,1.4019739641890212,0.031976696,1.738079,1.307533,0.03933237
3,3,Gaussian_5_2,why is the image random?,why is the image random?,random+t know+not sure+unknown+can't tell+none...,0.0,1.0,0.0,0.0,0.0,...,0.051415,-2.5192337,-19.845095,0.12248334,2.152690347564782,1.4802876448186593,0.034029774,1.738079,1.307533,0.05141516
4,4,Gaussian_5_2,why aren't there any objects in the image?,why aren't there any objects in the image?,random+noise+t know+not sure+unknown+can't tel...,0.0,1.0,0.0,0.0,0.0,...,0.029139,-3.1362438,-18.810205,0.06400901,1.9474464197595112,1.394769341566463,0.024084808,1.738079,1.307533,0.02913933


# Add valid column

In [6]:
not_nan = np.array(~(full_results_df['clean_answer']).isna())
not_qst_mark = np.array(~(full_results_df['clean_answer'].str.contains('?', na=True, regex=False)))

valid = np.where(not_nan & not_qst_mark, 1, 0)

data = full_results_df.copy()
data["valid"] = valid

len(data)

14276

In [7]:
data["no_tokens"] = data["no_tokens"]/np.max(list(data["no_tokens"]))

In [8]:
np.sum(list(data.loc[data["valid"]==1]["correct"]))

3334.0

# Train Test indices

In [9]:
models_idxs = dict()

## Full set

In [10]:
correct_col = np.array(data.loc[(data["valid"]==1)]["correct"]).astype(np.int32)
train_idxs, test_idxs = train_test_split_idxs(correct_col, test_ratio=0.25, balanced=False)

models_idxs["all"] = [train_idxs, test_idxs]

## Models

In [11]:
for model_name in model_names:
    correct_col = np.array(data.loc[(data["valid"]==1) & (data["model_name"]==model_name)]["correct"]).astype(np.int32)
    train_idxs, test_idxs = train_test_split_idxs(correct_col, test_ratio=0.25, balanced=False)
    models_idxs[model_name] = [train_idxs, test_idxs]

# Classifiers

In [12]:
models_all_results = dict()

## Feature: Prob

In [13]:
features = ["Prob"]
models_results = dict()

In [14]:
for model_name in model_names:
    
    model_dict = dict()

    temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()


In [15]:
model_dict = dict()

model_name = "all"

temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

train_idxs, test_idxs = models_idxs[model_name]
X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

for m_name, model in models.items():
    model = model()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    model_dict[m_name]={"preds":predictions, "gt":y_test}

models_results[model_name] = model_dict.copy()

In [16]:
models_all_results["Prob"] = models_results.copy()

## Feature: P_T_1

In [17]:
features = ["P_T_1"]
models_results = dict()

In [18]:
for model_name in model_names:
    
    model_dict = dict()

    temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()


In [19]:
model_dict = dict()

model_name = "all"

temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

train_idxs, test_idxs = models_idxs[model_name]
X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

for m_name, model in models.items():
    model = model()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    model_dict[m_name]={"preds":predictions, "gt":y_test}

models_results[model_name] = model_dict.copy()

In [20]:
models_all_results["P_T_1"] = models_results.copy()

## Feature: P_T_2_N

In [21]:
features = ["P_T_2_N"]
models_results = dict()

In [22]:
for model_name in model_names:
    
    model_dict = dict()

    temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()


In [23]:
model_dict = dict()

model_name = "all"

temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

train_idxs, test_idxs = models_idxs[model_name]
X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

for m_name, model in models.items():
    model = model()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    model_dict[m_name]={"preds":predictions, "gt":y_test}

models_results[model_name] = model_dict.copy()

In [24]:
models_all_results["P_T_2_N"] = models_results.copy()

## Feature: diff

In [25]:
features = ["difficulty"]
models_results = dict()
for model_name in model_names:
    
    model_dict = dict()

    temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()

In [26]:
model_dict = dict()

model_name = "all"

temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

train_idxs, test_idxs = models_idxs[model_name]
X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

for m_name, model in models.items():
    model = model()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    model_dict[m_name]={"preds":predictions, "gt":y_test}

models_results[model_name] = model_dict.copy()

In [27]:
models_all_results["diff"] = models_results.copy()

## Feature: Prob, diff

In [28]:
features = ["Prob", "difficulty"]
models_results = dict()

In [29]:
for model_name in model_names:
    
    model_dict = dict()

    temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()


In [30]:
model_dict = dict()

model_name = "all"

temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

train_idxs, test_idxs = models_idxs[model_name]
X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

for m_name, model in models.items():
    model = model()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    model_dict[m_name]={"preds":predictions, "gt":y_test}

models_results[model_name] = model_dict.copy()

In [31]:
models_all_results["Prob_diff"] = models_results.copy()

## Feature: All diff

In [32]:
features = ["used_language", "specificity", "question_length", "complexity", "image_relatedness", "image_difficulty"]
models_results = dict()

In [33]:
for model_name in model_names:
    
    model_dict = dict()

    temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()


In [34]:
model_dict = dict()

model_name = "all"

temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

train_idxs, test_idxs = models_idxs[model_name]
X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

for m_name, model in models.items():
    model = model()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    model_dict[m_name]={"preds":predictions, "gt":y_test}

models_results[model_name] = model_dict.copy()

In [35]:
models_all_results["all_diff"] = models_results.copy()

## Feature: Prob, All diff

In [36]:
features = ["Prob", "used_language", "specificity", "question_length", "complexity", "image_relatedness", "image_difficulty"]
models_results = dict()

In [37]:
for model_name in model_names:
    
    model_dict = dict()

    temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()


In [38]:
model_dict = dict()

model_name = "all"

temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

train_idxs, test_idxs = models_idxs[model_name]
X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

for m_name, model in models.items():
    model = model()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    model_dict[m_name]={"preds":predictions, "gt":y_test}

models_results[model_name] = model_dict.copy()

In [39]:
models_all_results["Prob_all_diff"] = models_results.copy()

## Feature: with tokens

In [40]:
features = ["Prob", "used_language", "specificity", "question_length", "complexity", "image_relatedness", "image_difficulty", "no_tokens"]
models_results = dict()

In [41]:
for model_name in model_names:
    
    model_dict = dict()

    temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()


In [42]:
model_dict = dict()

model_name = "all"

temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

train_idxs, test_idxs = models_idxs[model_name]
X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

for m_name, model in models.items():
    model = model()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    model_dict[m_name]={"preds":predictions, "gt":y_test}

models_results[model_name] = model_dict.copy()

In [43]:
models_all_results["w_tokens"] = models_results.copy()

# Write results

In [44]:
models_all_results["w_tokens"]["Blip_large"]["MLP"].keys()

dict_keys(['preds', 'gt'])

In [45]:
all_rows = list()
for feature, models_results in models_all_results.items():
    for model_name in model_names+["all"]:
        model_results = models_results[model_name]
        for m_name, results in model_results.items():
            one_row = list()
            TN, FP, FN, TP = confusion_matrix(results["gt"], results["preds"]).ravel()
            Acc, P, R, F, TNR = calculate_metrics(TN, FP, FN, TP, f_beta=0.5)
            one_row = [feature, model_name, m_name, TP, TN, FP, FN, Acc, P, R, F, TNR]
            all_rows.append(one_row)

In [46]:
full_df = pd.DataFrame(np.array(all_rows), columns=full_df_columns)
full_df.head()

Unnamed: 0,feature,model_name,classifier,TP,TN,FP,FN,Acc,P,R,F,TNR
0,Prob,Vilt,LogReg,16,680,21,174,0.7811447811447811,0.4324324324324324,0.0842105263157894,0.2366863905325444,0.7962529274004684
1,Prob,Vilt,MLP,13,682,19,177,0.7800224466891134,0.40625,0.0684210526315789,0.2044025157232704,0.7939464493597206
2,Prob,Blip_large,LogReg,25,634,28,204,0.7396184062850729,0.4716981132075472,0.1091703056768559,0.2834467120181406,0.7565632458233891
3,Prob,Blip_large,MLP,30,632,30,199,0.7429854096520763,0.5,0.131004366812227,0.3198294243070362,0.7605294825511432
4,Prob,GiT_base,LogReg,50,513,34,183,0.7217948717948718,0.5952380952380952,0.2145922746781116,0.4393673110720562,0.7370689655172413


In [47]:
num_cols = ["TP", "TN", "FP", "FN", "Acc", "P", "R", "F", "TNR"]
full_df[num_cols] = full_df[num_cols].apply(pd.to_numeric)

In [48]:
full_df

Unnamed: 0,feature,model_name,classifier,TP,TN,FP,FN,Acc,P,R,F,TNR
0,Prob,Vilt,LogReg,16,680,21,174,0.781145,0.432432,0.084211,0.236686,0.796253
1,Prob,Vilt,MLP,13,682,19,177,0.780022,0.406250,0.068421,0.204403,0.793946
2,Prob,Blip_large,LogReg,25,634,28,204,0.739618,0.471698,0.109170,0.283447,0.756563
3,Prob,Blip_large,MLP,30,632,30,199,0.742985,0.500000,0.131004,0.319829,0.760529
4,Prob,GiT_base,LogReg,50,513,34,183,0.721795,0.595238,0.214592,0.439367,0.737069
...,...,...,...,...,...,...,...,...,...,...,...,...
75,w_tokens,GiT_base,MLP,136,523,24,97,0.844872,0.850000,0.583691,0.778923,0.843548
76,w_tokens,GiT_large,LogReg,94,300,38,86,0.760618,0.712121,0.522222,0.663842,0.777202
77,w_tokens,GiT_large,MLP,115,309,29,65,0.818533,0.798611,0.638889,0.760582,0.826203
78,w_tokens,all,LogReg,348,2091,159,485,0.791113,0.686391,0.417767,0.608179,0.811724


In [49]:
# # i = i+1
# i = 0
# print(i)
# with pd.ExcelWriter(f"./{model_type}_classifier_results_not_balanced.xlsx", engine="openpyxl", mode="w") as writer:
#     full_df.to_excel(writer, sheet_name=f"{model_type}_classifier_results_{i}", index_label='ID')

# Loop!!!

In [50]:
balanced = False

for i in tqdm(range(10)):
    # Train Test indices
    models_idxs = dict()
    ## Full set
    correct_col = np.array(data.loc[(data["valid"]==1)]["correct"]).astype(np.int32)
    train_idxs, test_idxs = train_test_split_idxs(correct_col, test_ratio=0.25, balanced=balanced)

    models_idxs["all"] = [train_idxs, test_idxs]
    ## Models
    for model_name in model_names:
        correct_col = np.array(data.loc[(data["valid"]==1) & (data["model_name"]==model_name)]["correct"]).astype(np.int32)
        train_idxs, test_idxs = train_test_split_idxs(correct_col, test_ratio=0.25, balanced=balanced)
        models_idxs[model_name] = [train_idxs, test_idxs]
    # Classifiers
    models_all_results = dict()
    ## Feature: Prob
    features = ["Prob"]
    models_results = dict()
    for model_name in model_names:
        
        model_dict = dict()

        temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

        train_idxs, test_idxs = models_idxs[model_name]
        X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
        y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

        models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

        for m_name, model in models.items():
            model = model()
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            model_dict[m_name]={"preds":predictions, "gt":y_test}

        models_results[model_name] = model_dict.copy()

    model_dict = dict()

    model_name = "all"

    temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()
    models_all_results["Prob"] = models_results.copy()
    ## Feature: P_T_1
    features = ["P_T_1"]
    models_results = dict()
    for model_name in model_names:
        
        model_dict = dict()

        temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

        train_idxs, test_idxs = models_idxs[model_name]
        X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
        y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

        models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

        for m_name, model in models.items():
            model = model()
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            model_dict[m_name]={"preds":predictions, "gt":y_test}

        models_results[model_name] = model_dict.copy()

    model_dict = dict()

    model_name = "all"

    temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()
    models_all_results["P_T_1"] = models_results.copy()
    ## Feature: P_T_2_N
    features = ["P_T_2_N"]
    models_results = dict()
    for model_name in model_names:
        
        model_dict = dict()

        temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

        train_idxs, test_idxs = models_idxs[model_name]
        X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
        y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

        models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

        for m_name, model in models.items():
            model = model()
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            model_dict[m_name]={"preds":predictions, "gt":y_test}

        models_results[model_name] = model_dict.copy()

    model_dict = dict()

    model_name = "all"

    temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()
    models_all_results["P_T_2_N"] = models_results.copy()
    ## Feature: diff
    features = ["difficulty"]
    models_results = dict()
    for model_name in model_names:
        
        model_dict = dict()

        temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

        train_idxs, test_idxs = models_idxs[model_name]
        X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
        y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

        models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

        for m_name, model in models.items():
            model = model()
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            model_dict[m_name]={"preds":predictions, "gt":y_test}

        models_results[model_name] = model_dict.copy()
    model_dict = dict()

    model_name = "all"

    temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()
    models_all_results["diff"] = models_results.copy()
    ## Feature: Prob, diff
    features = ["Prob", "difficulty"]
    models_results = dict()
    for model_name in model_names:
        
        model_dict = dict()

        temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

        train_idxs, test_idxs = models_idxs[model_name]
        X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
        y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

        models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

        for m_name, model in models.items():
            model = model()
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            model_dict[m_name]={"preds":predictions, "gt":y_test}

        models_results[model_name] = model_dict.copy()

    model_dict = dict()

    model_name = "all"

    temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()
    models_all_results["Prob_diff"] = models_results.copy()
    ## Feature: All diff
    features = ["used_language", "specificity", "question_length", "complexity", "image_relatedness", "image_difficulty"]
    models_results = dict()
    for model_name in model_names:
        
        model_dict = dict()

        temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

        train_idxs, test_idxs = models_idxs[model_name]
        X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
        y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

        models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

        for m_name, model in models.items():
            model = model()
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            model_dict[m_name]={"preds":predictions, "gt":y_test}

        models_results[model_name] = model_dict.copy()

    model_dict = dict()

    model_name = "all"

    temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()
    models_all_results["all_diff"] = models_results.copy()
    ## Feature: Prob, All diff
    features = ["Prob", "used_language", "specificity", "question_length", "complexity", "image_relatedness", "image_difficulty"]
    models_results = dict()
    for model_name in model_names:
        
        model_dict = dict()

        temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

        train_idxs, test_idxs = models_idxs[model_name]
        X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
        y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

        models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

        for m_name, model in models.items():
            model = model()
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            model_dict[m_name]={"preds":predictions, "gt":y_test}

        models_results[model_name] = model_dict.copy()

    model_dict = dict()

    model_name = "all"

    temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()
    models_all_results["Prob_all_diff"] = models_results.copy()
    ## Feature: with tokens
    features = ["Prob", "used_language", "specificity", "question_length", "complexity", "image_relatedness", "image_difficulty", "no_tokens"]
    models_results = dict()
    for model_name in model_names:
        
        model_dict = dict()

        temp = data.loc[(data["model_name"]==model_name) & (data["valid"]==1)] #  & (data['image_difficulty']!=2)

        train_idxs, test_idxs = models_idxs[model_name]
        X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
        y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

        models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

        for m_name, model in models.items():
            model = model()
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            model_dict[m_name]={"preds":predictions, "gt":y_test}

        models_results[model_name] = model_dict.copy()

    model_dict = dict()

    model_name = "all"

    temp = data.loc[(data["valid"]==1)] #  & (data['image_difficulty']!=2)

    train_idxs, test_idxs = models_idxs[model_name]
    X_train, X_test = np.array(temp[features])[train_idxs], np.array(temp[features])[test_idxs]
    y_train, y_test = np.array(temp["correct"])[train_idxs], np.array(temp["correct"])[test_idxs]

    models = {"LogReg": LogisticRegression, "MLP":MLPClassifier}

    for m_name, model in models.items():
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        model_dict[m_name]={"preds":predictions, "gt":y_test}

    models_results[model_name] = model_dict.copy()
    models_all_results["w_tokens"] = models_results.copy()
    # Write results
    all_rows = list()
    for feature, models_results in models_all_results.items():
        for model_name in model_names+["all"]:
            model_results = models_results[model_name]
            for m_name, results in model_results.items():
                one_row = list()
                TN, FP, FN, TP = confusion_matrix(results["gt"], results["preds"]).ravel()
                Acc, P, R, F, TNR = calculate_metrics(TN, FP, FN, TP, f_beta=0.5)
                one_row = [feature, model_name, m_name, TP, TN, FP, FN, Acc, P, R, F, TNR]
                all_rows.append(one_row)
    full_df = pd.DataFrame(np.array(all_rows), columns=full_df_columns)
    full_df.head()
    num_cols = ["TP", "TN", "FP", "FN", "Acc", "P", "R", "F", "TNR"]
    full_df[num_cols] = full_df[num_cols].apply(pd.to_numeric)
    full_df
    try:
        with pd.ExcelWriter(f"./{model_type}_classifier_results_not_balanced.xlsx", engine="openpyxl", mode="a") as writer:
            full_df.to_excel(writer, sheet_name=f"{model_type}_classifier_results_{i}", index_label='ID')
    except:
        with pd.ExcelWriter(f"./{model_type}_classifier_results_not_balanced.xlsx", engine="openpyxl", mode="w") as writer:
            full_df.to_excel(writer, sheet_name=f"{model_type}_classifier_results_{i}", index_label='ID')

    print(i)

  0%|          | 0/10 [00:00<?, ?it/s]

 10%|█         | 1/10 [01:50<16:37, 110.79s/it]

0


 20%|██        | 2/10 [04:08<16:51, 126.50s/it]

1


 30%|███       | 3/10 [06:40<16:08, 138.31s/it]

2


 40%|████      | 4/10 [08:42<13:11, 131.91s/it]

3


 50%|█████     | 5/10 [10:51<10:53, 130.79s/it]

4


 60%|██████    | 6/10 [12:58<08:38, 129.63s/it]

5


 70%|███████   | 7/10 [15:06<06:26, 128.98s/it]

6


 80%|████████  | 8/10 [17:49<04:39, 139.73s/it]

7


 90%|█████████ | 9/10 [20:14<02:21, 141.35s/it]

8


100%|██████████| 10/10 [22:44<00:00, 136.46s/it]

9



