In [None]:
!gdown --folder 1TJMndsdyWv5sviPoHcVghoqD4FK9Qpk6 >> log.txt

In [None]:
!pip install datasets >> log.txt

In [None]:
import numpy as np
import os
SEED = 12345

In [None]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Load the Mawqif stance-detection dataset from the Hugging Face Hub.
dataset = load_dataset("NoraAlt/Mawqif_Stance-Detection")

# Convert the "train" split to a pandas DataFrame, one column per dataset feature.
train_split = dataset["train"]
df = pd.DataFrame({column: train_split[column] for column in train_split.features})

# Rows without a stance label are treated as "Neutral".
# fillna covers both None and NaN, whichever pandas used for the missing entries.
df["stance"] = df["stance"].fillna("Neutral")

# Hold out 500 rows for testing. The notebook-wide SEED is used instead of a
# repeated literal so every source of randomness is controlled from one place.
# The parts are copied because later cells add columns to them; writing to the
# raw split results may raise pandas' SettingWithCopyWarning.
train_df, test_df = (
    part.copy()
    for part in train_test_split(df, test_size=500, random_state=SEED)
)

# Report the split sizes.
print(f"train length: {len(train_df)}")
print(f"test length: {len(test_df)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3502 [00:00<?, ? examples/s]

train length: 3002
test length: 500


In [None]:
# Encode the stance labels as integers in a new `stance_int` column.

# Integer encoding of the three stance labels, plus the inverse lookup that
# turns predicted integers back into label names.
stance_to_int = {
  "Against": 0,
  "Favor": 1,
  "Neutral": 2
}
int_to_stance = {value: key for key, value in stance_to_int.items()}

# Series.map silently produces NaN for any label missing from the mapping,
# which would only surface much later as a cryptic training error. Fail early.
for _split_name, _split_df in (("train_df", train_df), ("test_df", test_df)):
    _unknown = set(_split_df["stance"]) - set(stance_to_int)
    if _unknown:
        raise ValueError(f"{_split_name} contains unmapped stance labels: {sorted(map(str, _unknown))}")

# assign() builds new frames instead of writing into the split results, so the
# cell is idempotent and does not trigger SettingWithCopyWarning.
train_df = train_df.assign(stance_int=train_df["stance"].map(stance_to_int))
test_df = test_df.assign(stance_int=test_df["stance"].map(stance_to_int))

In [None]:
# Load every "<model>_train.npy" / "<model>_test.npy" pair from ./embeddings into
# embeddings[<model>] = {"train": array, "test": array}.
embeddings = {}
_TRAIN_SUFFIX = "_train.npy"
# sorted() makes the experiment order deterministic; os.listdir order is arbitrary.
for file_name in sorted(os.listdir("embeddings")):
    # Match on the exact suffix. A substring test for "train" also fires on names
    # such as "train_labels.npy" or "pretrained_test.npy" and derives a wrong path.
    if not file_name.endswith(_TRAIN_SUFFIX):
        continue
    # Strip only the trailing suffix, so model names containing "_train" survive.
    base_name = file_name[: -len(_TRAIN_SUFFIX)]
    train = f"{base_name}_train.npy"
    test = f"{base_name}_test.npy"
    # np.load raises FileNotFoundError if the matching test file is missing.
    embeddings[base_name] = {
        "train": np.load(os.path.join("embeddings", train)),
        "test": np.load(os.path.join("embeddings", test)),
    }

In [None]:
from sklearn.metrics import f1_score, classification_report
def evaluate(test_df, y_pred):
    """Score predictions per target and averaged over all targets.

    For every distinct value of ``test_df["target"]`` the per-class F1 scores of
    the rows belonging to that target are computed, then reduced to:

    * ``F1_score_2class``: mean F1 of classes 0 and 1.
    * ``F1_score_3class``: mean F1 of classes 0, 1 and 2.

    Args:
        test_df: DataFrame with a ``target`` column and an integer ``stance_int``
            column (labels 0, 1, 2).
        y_pred: Predicted integer labels, positionally aligned with the rows of
            ``test_df`` (row ``i`` of the frame corresponds to ``y_pred[i]``).

    Returns:
        Dict mapping each target name to its two scores, plus an ``"All Targets"``
        entry holding the unweighted mean of each score across targets.
    """
    class_labels = [0, 1, 2]
    # Extract the columns once; the row order defines the alignment with y_pred.
    targets = test_df["target"].to_numpy()
    y_true = test_df["stance_int"].to_numpy()
    y_pred = np.asarray(y_pred)

    results = {}
    sum_f2_final = 0
    sum_f3_final = 0
    n_targets = 0
    for target in test_df["target"].unique():
        target_mask = targets == target
        # Passing `labels` pins the output to one entry per class in a fixed order.
        # Without it, a class absent from this target's rows is dropped and
        # positions 0/1 would refer to the wrong classes (or not exist at all).
        f1_per_class = f1_score(
            y_true[target_mask], y_pred[target_mask], labels=class_labels, average=None
        )
        f1_2class = (f1_per_class[0] + f1_per_class[1]) / 2
        f1_3class = sum(f1_per_class) / 3
        sum_f2_final += f1_2class
        sum_f3_final += f1_3class
        n_targets += 1
        results[target] = {"F1_score_2class": f1_2class, "F1_score_3class": f1_3class}

    # Average over the targets actually present instead of assuming exactly three.
    divisor = n_targets if n_targets else 1
    results["All Targets"] = {
        "F1_score_2class": sum_f2_final / divisor,
        "F1_score_3class": sum_f3_final / divisor,
    }
    return results

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
# Define your neural network model
class MLP_zero_layer(nn.Module):
    """Classifier without hidden layers: one linear map from features to class scores."""

    def __init__(self, input_size, output_size):
        super().__init__()
        # The attribute stays `fc1` so the state-dict keys are unchanged.
        self.fc1 = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for the batch ``x``."""
        return self.fc1(x)

def pytorch_model_train_predict(model, X_train, y_train, X_test, num_epochs = 100, lr = 0.001, batch_size = 200, seed = None):
    """Train ``model`` with Adam and cross-entropy loss, then predict classes for ``X_test``.

    Training iterates over consecutive, unshuffled mini-batches. It stops early
    once the summed batch loss of an epoch has failed to improve by more than
    ``1e-4`` for 10 consecutive epochs.

    Args:
        model: ``nn.Module`` mapping a float32 feature batch to per-class scores.
            It is trained in place.
        X_train: 2-D array-like of training features.
        y_train: Integer class labels aligned with the rows of ``X_train``.
        X_test: 2-D array-like of features to predict.
        num_epochs: Maximum number of passes over the training data.
        lr: Adam learning rate.
        batch_size: Rows per mini-batch. The default of 200 matches the value
            that used to be hard-coded.
        seed: Seed applied to NumPy and PyTorch. ``None`` falls back to the
            notebook-level ``SEED``. ``model`` is constructed by the caller, so
            this seed does not influence its initial weights.

    Returns:
        Tuple ``(model, y_pred)``: the trained model and a NumPy array with the
        predicted class index for each row of ``X_test``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if seed is None:
        seed = SEED
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Convert data to PyTorch tensors. np.asarray lets plain nested lists through.
    X_train_tensor = torch.tensor(np.asarray(X_train), dtype=torch.float32)
    y_train_tensor = torch.tensor(np.asarray(y_train), dtype=torch.long)
    X_test_tensor = torch.tensor(np.asarray(X_test), dtype=torch.float32)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Early-stopping settings.
    n_iter_no_change = 10
    tol = 1e-4
    best_loss = float("inf")
    no_improvement = 0

    # Nothing switches the model to eval mode during training, so set it once.
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for start in range(0, len(X_train_tensor), batch_size):
            features_batch = X_train_tensor[start:start + batch_size]
            labels_batch = y_train_tensor[start:start + batch_size]
            optimizer.zero_grad()
            loss = criterion(model(features_batch), labels_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        # Early stopping on the summed training loss of the epoch.
        if epoch_loss < best_loss - tol:
            best_loss = epoch_loss
            no_improvement = 0
        else:
            no_improvement += 1
            if no_improvement >= n_iter_no_change:
                print(f'Early stopping at epoch {epoch} as there is no improvement in loss.')
                break

    # Evaluation: the predicted class is the index of the highest score.
    model.eval()
    with torch.no_grad():
        scores = model(X_test_tensor)
        _, predicted = torch.max(scores, 1)
        # .cpu() is a no-op on CPU tensors and keeps this working for GPU models.
        y_pred = predicted.cpu().numpy()
    return model, y_pred

def pytorch_zero_model_train_predict(X_train, y_train, X_test, num_epochs = 100, lr = 0.001, hidden_layer_sizes = None):
    """Train a hidden-layer-free PyTorch classifier and predict ``X_test``.

    Args:
        X_train: 2-D array-like of training features.
        y_train: Integer class labels (0-based) aligned with ``X_train``.
        X_test: 2-D array-like of features to predict.
        num_epochs: Maximum number of training epochs.
        lr: Adam learning rate.
        hidden_layer_sizes: Ignored; accepted so all trainer functions share
            one call signature.

    Returns:
        Tuple ``(model, y_pred)`` as returned by ``pytorch_model_train_predict``.
    """
    X_train = np.asarray(X_train)
    input_size = X_train.shape[1]
    # Size the output from the largest label rather than the number of distinct
    # labels: if a class is missing from this training subset, counting unique
    # values gives too few outputs and the loss fails on the larger label ids.
    output_size = int(np.max(y_train)) + 1
    # Seed before building the model. Otherwise the initial weights depend on
    # whatever state the global RNG happens to be in.
    torch.manual_seed(SEED)
    model = MLP_zero_layer(input_size, output_size)
    return pytorch_model_train_predict(model, X_train, y_train, X_test, num_epochs, lr)



import torch
import torch.nn as nn

class MLP(nn.Module):
    """Fully connected ReLU network with a configurable stack of hidden widths.

    ``hidden_layers_sizes`` lists the width of every hidden representation and
    must contain at least one entry. The network is
    ``input -> sizes[0] -> sizes[1] -> ... -> sizes[-1] -> output`` with a ReLU
    after every layer except the last.
    """

    def __init__(self, input_size, output_size, hidden_layers_sizes):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_layers_sizes = hidden_layers_sizes

        first_width = hidden_layers_sizes[0]
        last_width = hidden_layers_sizes[-1]

        # Layers are created in input -> hidden -> output order, so parameter
        # initialisation consumes the RNG in that same order.
        self.input_layer = nn.Linear(input_size, first_width)
        self.hidden_layers = nn.ModuleList()
        for width_in, width_out in zip(hidden_layers_sizes[:-1], hidden_layers_sizes[1:]):
            self.hidden_layers.append(nn.Linear(width_in, width_out))
        self.output_layer = nn.Linear(last_width, output_size)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        activations = torch.relu(self.input_layer(x))
        for layer in self.hidden_layers:
            activations = torch.relu(layer(activations))
        # No activation on the output: callers receive unnormalised scores.
        return self.output_layer(activations)

# # Example usage:
# input_size = 10
# output_size = 1
# hidden_layers_sizes = [20, 30, 20]

# mlp = MLP(input_size, output_size, hidden_layers_sizes)
# print(mlp)

def pytorch_hidden_layers_model_train_predict(X_train, y_train, X_test, num_epochs = 100, lr = 0.001, hidden_layer_sizes = [100]):
    """Train a PyTorch MLP with hidden layers and predict ``X_test``.

    Args:
        X_train: 2-D array-like of training features.
        y_train: Integer class labels (0-based) aligned with ``X_train``.
        X_test: 2-D array-like of features to predict.
        num_epochs: Maximum number of training epochs.
        lr: Adam learning rate.
        hidden_layer_sizes: Widths of the hidden layers. ``None`` is treated as
            the default ``[100]``.

    Returns:
        Tuple ``(model, y_pred)`` as returned by ``pytorch_model_train_predict``.
    """
    # train_on_different_tagets forwards hidden_layer_sizes=None when the caller
    # did not set it; indexing None inside MLP would raise a TypeError.
    if hidden_layer_sizes is None:
        hidden_layer_sizes = [100]
    else:
        # Copy so the model never aliases the shared default list or caller data.
        hidden_layer_sizes = list(hidden_layer_sizes)

    X_train = np.asarray(X_train)
    input_size = X_train.shape[1]
    # Size the output from the largest label rather than the number of distinct
    # labels: if a class is missing from this training subset, counting unique
    # values gives too few outputs and the loss fails on the larger label ids.
    output_size = int(np.max(y_train)) + 1
    # Seed before building the model. Otherwise the initial weights depend on
    # whatever state the global RNG happens to be in.
    torch.manual_seed(SEED)
    model = MLP(input_size, output_size, hidden_layer_sizes)
    return pytorch_model_train_predict(model, X_train, y_train, X_test, num_epochs, lr)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

def sklearn_logreg_model_train_predict(X_train, y_train, X_test, num_epochs = 100, lr=None, hidden_layer_sizes = None):
    """Fit a scikit-learn logistic regression and predict labels for ``X_test``.

    ``num_epochs`` is passed as the solver's ``max_iter``. ``lr`` and
    ``hidden_layer_sizes`` are accepted for signature compatibility with the
    other trainer functions and are not used.

    Returns:
        Tuple ``(clf, y_pred)``: the fitted estimator and its predictions.
    """
    classifier = LogisticRegression(max_iter=num_epochs, random_state=SEED)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return classifier, predictions

def sklearn_MLP_zero_model_train_predict(X_train, y_train, X_test, num_epochs = 100, lr = 0.001, hidden_layer_sizes = None):
    """Fit a scikit-learn ``MLPClassifier`` with no hidden layers and predict ``X_test``.

    ``num_epochs`` becomes ``max_iter`` and ``lr`` becomes ``learning_rate_init``.
    ``hidden_layer_sizes`` is accepted for signature compatibility and ignored;
    the network is always built with an empty hidden-layer tuple.

    Returns:
        Tuple ``(clf, y_pred)``: the fitted estimator and its predictions.
    """
    settings = dict(
        hidden_layer_sizes=(),
        max_iter=num_epochs,
        random_state=SEED,
        learning_rate_init=lr,
    )
    network = MLPClassifier(**settings)
    network.fit(X_train, y_train)
    return network, network.predict(X_test)


def sklearn_MLP_hidden_layers_model_train_predict(X_train, y_train, X_test, num_epochs = 100, lr = 0.001, hidden_layer_sizes = [100]):
    """Fit a scikit-learn ``MLPClassifier`` with hidden layers and predict ``X_test``.

    Args:
        X_train: Training features.
        y_train: Class labels aligned with ``X_train``.
        X_test: Features to predict.
        num_epochs: Passed to the estimator as ``max_iter``.
        lr: Passed to the estimator as ``learning_rate_init``.
        hidden_layer_sizes: Widths of the hidden layers. ``None`` is treated as
            the default ``[100]``.

    Returns:
        Tuple ``(clf, y_pred)``: the fitted estimator and its predictions.
    """
    # train_on_different_tagets forwards hidden_layer_sizes=None when the caller
    # did not set it; fall back to this function's own default in that case.
    if hidden_layer_sizes is None:
        hidden_layer_sizes = [100]

    clf = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        max_iter=num_epochs,
        random_state=SEED,
        learning_rate_init = lr
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return clf, y_pred

In [None]:
!pip install imbalanced-learn



In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Re-index both splits to 0..n-1 (the previous index is kept as a column) so the
# index of each per-target subset can be used as row positions elsewhere.
train_df_no_index = train_df.reset_index()
train_df_cov = train_df_no_index.loc[lambda frame: frame["target"] == "Covid Vaccine"]
train_df_dig = train_df_no_index.loc[lambda frame: frame["target"] == "Digital Transformation"]
train_df_women = train_df_no_index.loc[lambda frame: frame["target"] == "Women empowerment"]

# Same three per-target subsets for the held-out split.
test_df_no_index = test_df.reset_index()
test_df_cov = test_df_no_index.loc[lambda frame: frame["target"] == "Covid Vaccine"]
test_df_dig = test_df_no_index.loc[lambda frame: frame["target"] == "Digital Transformation"]
test_df_women = test_df_no_index.loc[lambda frame: frame["target"] == "Women empowerment"]

def train_on_different_tagets(embed_model_name, name, func_name, X_train, X_test, num_epochs = 100, lr = 0.001, hidden_layer_sizes = None, do_smote = False):
    """Train one classifier per target and record the combined evaluation.

    For each of the three targets, the rows of ``X_train`` / ``X_test`` belonging
    to that target are selected through the index of the matching per-target
    frame (``train_df_cov`` / ``test_df_cov`` and so on), a model is trained with
    ``func_name``, and its predictions are written into the matching positions of
    one prediction vector. That vector is scored with ``evaluate`` against the
    notebook-level ``test_df``.

    Args:
        embed_model_name: Key of the embedding model in ``experiments``.
        name: Key of this experiment under ``experiments[embed_model_name]``.
        func_name: Trainer called as
            ``func_name(X_tr, y_tr, X_te, num_epochs, lr, hidden_layer_sizes)``
            and returning ``(model, y_pred)``.
        X_train: Training embeddings, row-aligned with ``train_df_no_index``.
        X_test: Test embeddings, row-aligned with ``test_df_no_index``.
        num_epochs: Forwarded to ``func_name``.
        lr: Forwarded to ``func_name``.
        hidden_layer_sizes: Forwarded to ``func_name``.
        do_smote: When true, oversample each target's training rows with SMOTE.

    Returns:
        None. The outcome is stored in the notebook-level ``experiments`` dict as
        ``{"models": {target: model}, "results": ..., "score": ...}``.
    """
    # Use the arrays passed in. The previous body ignored these arguments and
    # read the globals `train_embed` / `test_embed` instead.
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)

    # (target name, training subset, test subset), processed in this order.
    per_target_frames = (
        ("Covid Vaccine", train_df_cov, test_df_cov),
        ("Digital Transformation", train_df_dig, test_df_dig),
        ("Women empowerment", train_df_women, test_df_women),
    )

    y_pred = np.zeros((len(X_test)))
    models = {}
    for target_name, train_part, test_part in per_target_frames:
        # The subsets come from reset_index() frames, so their index values are
        # row positions into the embedding arrays.
        train_rows = train_part.index.to_numpy()
        test_rows = test_part.index.to_numpy()

        y_train = train_part["stance_int"].tolist()
        train_embed_target = X_train[train_rows]
        test_embed_target = X_test[test_rows]
        if do_smote:
            # Seeded so the synthetic samples are the same on every run.
            train_embed_target, y_train = SMOTE(random_state=SEED).fit_resample(train_embed_target, y_train)

        model, y_pred_target = func_name(train_embed_target, y_train, test_embed_target, num_epochs, lr, hidden_layer_sizes)
        y_pred[test_rows] = y_pred_target
        models[target_name] = model

    results = evaluate(test_df, y_pred)
    if embed_model_name not in experiments:
        experiments[embed_model_name] = {}
    experiments[embed_model_name][name] = {"models": models, "results": results, "score": results["All Targets"]["F1_score_2class"]}


In [None]:
# (rows, columns) of the three per-target training subsets.
tuple(subset.shape for subset in (train_df_cov, train_df_dig, train_df_women))

((976, 16), (974, 16), (1052, 16))

In [None]:
# Results registry, filled by the training cells:
# experiments[embedding model name][experiment name] -> {"model(s)", "results", "score"}.
experiments = dict()

# !gdown "1Ve19rpQp6KLLc_w-GiJ-RPdZVY2IUOZ1"
# import pickle
# with open('experiments.pkl', 'rb') as f:
#     experiments = pickle.load(f)


In [None]:
# from tqdm.notebook import tqdm

# for embed_model_name in tqdm(embeddings.keys()):
#     train_embed = embeddings[embed_model_name]["train"]
#     test_embed = embeddings[embed_model_name]["test"]

#     train_on_different_tagets(embed_model_name, "pytorch_zero_hidden_epochs_100_lr_0.001_per_target", pytorch_zero_model_train_predict, train_embed, test_embed, num_epochs = 100)
#     train_on_different_tagets(embed_model_name, "sklearn_logreg_epochs_100_lr_0.001_per_target", sklearn_logreg_model_train_predict, train_embed, test_embed, num_epochs = 100)
#     train_on_different_tagets(embed_model_name, "sklearn_MLP_zero_hidden_epochs_100_lr_0.001_per_target", sklearn_MLP_zero_model_train_predict, train_embed, test_embed, num_epochs = 100)

from tqdm.notebook import tqdm


# Labels for the models trained on all targets together. They do not depend on
# the embedding model, so compute them once.
y_train = train_df["stance_int"].tolist()

for embed_model_name in tqdm(embeddings.keys()):
    # These two names are kept at notebook level on purpose: other cells read them.
    train_embed = embeddings[embed_model_name]["train"]
    test_embed = embeddings[embed_model_name]["test"]
    for num_epochs in [100, 1000]:
        for hidden_layer_sizes in [[100], [300], [300, 100], [300, 100, 10]]:
            hidden_tag = '_'.join([str(h) for h in hidden_layer_sizes])
            # Experiment keys are unchanged: the all-targets runs already contain
            # "per_target" in their name; the per-target runs add a second
            # "_per_target" suffix.
            pytorch_name = f"pytorch_with_hidden_epochs_{num_epochs}_lr_0.001_per_target_hidden_{hidden_tag}"
            sklearn_name = f"sklearn_MLP_with_hidden_epochs_{num_epochs}_lr_0.001_per_target_hidden_{hidden_tag}"

            # One model per target. num_epochs is now forwarded; it used to be
            # fixed at 100 here, so the "epochs_1000" per-target entries were in
            # fact trained for 100 epochs.
            train_on_different_tagets(embed_model_name, f"{pytorch_name}_per_target", pytorch_hidden_layers_model_train_predict, train_embed, test_embed, num_epochs = num_epochs, hidden_layer_sizes = hidden_layer_sizes)
            train_on_different_tagets(embed_model_name, f"{sklearn_name}_per_target", sklearn_MLP_hidden_layers_model_train_predict, train_embed, test_embed, num_epochs = num_epochs, hidden_layer_sizes = hidden_layer_sizes)

            # One model trained on all targets together (PyTorch).
            model, y_pred = pytorch_hidden_layers_model_train_predict(train_embed, y_train, test_embed, num_epochs = num_epochs, hidden_layer_sizes=hidden_layer_sizes)
            results = evaluate(test_df, y_pred)
            experiments.setdefault(embed_model_name, {})[pytorch_name] = {"model": model, "results": results, "score": results["All Targets"]["F1_score_2class"]}

            # One model trained on all targets together (scikit-learn).
            model, y_pred = sklearn_MLP_hidden_layers_model_train_predict(train_embed, y_train, test_embed, num_epochs = num_epochs, hidden_layer_sizes=hidden_layer_sizes)
            results = evaluate(test_df, y_pred)
            experiments.setdefault(embed_model_name, {})[sklearn_name] = {"model": model, "results": results, "score": results["All Targets"]["F1_score_2class"]}

In [None]:
# Print one tab-separated line per experiment and keep the best score seen.
# The running best starts at 0.
_max = 0
for embed in experiments:
    for experiment, results in experiments[embed].items():
        print(f'{embed}\t{experiment}\t{results["score"]}')
        if results["score"] > _max:
            _max = results["score"]
_max

paraphrase-multilingual-mpnet-base-v2	pytorch_with_hidden_epochs_100_lr_0.001_per_target_hidden_100	0.7071888367908361
paraphrase-multilingual-mpnet-base-v2	sklearn_MLP_with_hidden_epochs_100_lr_0.001_per_target_hidden_100	0.7085657144065359
paraphrase-multilingual-mpnet-base-v2	pytorch_with_hidden_epochs_100_lr_0.001_per_target_hidden_100_per_target	0.6808949602882461
paraphrase-multilingual-mpnet-base-v2	sklearn_MLP_with_hidden_epochs_100_lr_0.001_per_target_hidden_100_per_target	0.708503598468126
paraphrase-multilingual-mpnet-base-v2	pytorch_with_hidden_epochs_100_lr_0.001_per_target_hidden_300_per_target	0.684792455839366
paraphrase-multilingual-mpnet-base-v2	sklearn_MLP_with_hidden_epochs_100_lr_0.001_per_target_hidden_300_per_target	0.6953989062634034
paraphrase-multilingual-mpnet-base-v2	pytorch_with_hidden_epochs_100_lr_0.001_per_target_hidden_300	0.6783698333079603
paraphrase-multilingual-mpnet-base-v2	sklearn_MLP_with_hidden_epochs_100_lr_0.001_per_target_hidden_300	0.6798945

0.7499120107123547

In [None]:
# Save the experiments dict to a file with pickle.
import pickle

# Serialise the whole results registry (models included) to disk.
serialized_experiments = pickle.dumps(experiments)
with open('experiments.pkl', 'wb') as pickle_file:
    pickle_file.write(serialized_experiments)
