In [None]:
import os

import pandas as pd
import torch
from tqdm import tqdm
from transformers import pipeline

from utils.preprocess_dataset import preprocess_dataframe

In [None]:
# Folder holding one CSV per evaluation match/period batch.
EVAL_DATA_DIR = "cleaned_data/eval_data"

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# eval_data identical across runs and machines. Filtering first also makes the
# progress bar count only the CSV files that are actually processed.
eval_csv_files = sorted(
    filename for filename in os.listdir(EVAL_DATA_DIR) if filename.endswith(".csv")
)

eval_dataframes = [
    preprocess_dataframe(f"{EVAL_DATA_DIR}/{filename}", "eval")
    for filename in tqdm(eval_csv_files)
]

# Stack every preprocessed file into a single frame with a fresh 0..n-1 index.
eval_data = pd.concat(eval_dataframes, ignore_index=True)

In [None]:
# Quick sanity check: show the first five rows of the concatenated eval set.
eval_data.head(n=5)

## First Approach

In this approach, we use our fine-tuned DistilBERT model to classify each tweet in a period, and then ensemble the per-tweet predictions (by averaging their probabilities) to obtain a single prediction for the period.

In [None]:
# Prefer Apple's Metal (MPS) backend when it is available and fall back to the
# CPU otherwise, so the notebook also runs on machines without MPS support.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Text-classification pipeline backed by the fine-tuned checkpoint.
classifier = pipeline('sentiment-analysis', model='model_output/checkpoint-288718', device=device)

In [None]:
def predict_event(text):
    """Classify one period by averaging the per-tweet probability of ``LABEL_1``.

    Parameters
    ----------
    text : str or list of str
        The tweet(s) of a single period; passed unchanged to the module-level
        ``classifier`` pipeline.

    Returns
    -------
    dict
        ``{"label": 1, "score": p}`` when the mean ``LABEL_1`` probability ``p``
        is strictly greater than 0.5, otherwise ``{"label": 0, "score": 1 - p}``.
        When there is nothing to average (empty list of tweets or no
        predictions) the tie result ``{"label": 0, "score": 0.5}`` is returned
        instead of raising ``ZeroDivisionError``.
    """
    # An empty period has nothing to classify, so skip the model call entirely.
    if isinstance(text, (list, tuple)) and len(text) == 0:
        predictions = []
    else:
        predictions = classifier(text)

    if not predictions:
        return {"label": 0, "score": 0.5}

    # Every prediction reports a single label and its score; any label other
    # than LABEL_1 is treated as the complement (binary classifier assumed).
    label_1_scores = [
        prediction['score'] if prediction['label'] == 'LABEL_1' else 1 - prediction['score']
        for prediction in predictions
    ]
    label_1_prediction = sum(label_1_scores) / len(label_1_scores)

    if label_1_prediction > 0.5:
        return {"label": 1, "score": label_1_prediction}
    return {"label": 0, "score": 1 - label_1_prediction}

In [None]:
# Enable the progress_apply variants on pandas objects.
tqdm.pandas()

# Run the per-period ensemble over each row's tweets; every cell of the new
# column holds the dict returned by predict_event.
tweets_per_period = eval_data["Tweet"]
eval_data["Result_app_1"] = tweets_per_period.progress_apply(predict_event)

In [None]:
# Persist the per-period predictions at the location the "Second Approach"
# section reads them back from (model_output/submissions/sub_1/eval_data.csv).
# Writing to model_output/eval_data.csv left that later read pointing at a file
# this notebook never produced.
sub_1_eval_path = "model_output/submissions/sub_1/eval_data.csv"
os.makedirs(os.path.dirname(sub_1_eval_path), exist_ok=True)  # to_csv does not create folders
eval_data.to_csv(sub_1_eval_path, index=False)

In [None]:
# Keep only the predicted class from each result dict (the score is dropped).
eval_data["Net_result_app_1"] = eval_data["Result_app_1"].map(lambda result: result["label"])

In [None]:
# Class balance of the approach-1 predictions.
predicted_labels = eval_data["Net_result_app_1"]
predicted_labels.value_counts()

In [None]:
# List the available columns before selecting the ones needed for the submission.
eval_data.columns

In [None]:
# The submission only needs the period identifier and the predicted class.
submission_columns = ["ID", "Net_result_app_1"]
submission_app_1 = eval_data[submission_columns]

In [None]:
# Build a new frame instead of mutating in place: submission_app_1 was sliced
# from eval_data, so rename(inplace=True) followed by a column assignment
# triggers pandas' SettingWithCopyWarning.
submission_app_1 = (
    submission_app_1
    .rename(columns={"Net_result_app_1": "EventType"})
    .astype({"EventType": float})  # labels are written as floats (0.0 / 1.0)
)
submission_app_1

In [None]:
# Write the approach-1 submission file (no index column).
submission_path = "model_output/submissions/sub_1/submission.csv"
submission_app_1.to_csv(submission_path, index=False)

## Second Approach

Here we use the fine-tuned model to create an embedding of each tweet, then we ensemble the embeddings of all the tweets in a period to create a "period embedding", and lastly we feed that embedding to a classifier, which will be one of:

- A. The Classifier Head of the finetuned distilbert
- B. A simple Logistic Classifier
- C. A simple SVM
- D. An XGBoost model


### A. Applying an ensemble embedding to the classifier head of the finetuned distilbert

In [None]:
import pandas as pd
from tqdm import tqdm
import ast

# Reload the evaluation data saved for the first submission.
sub_1_eval_csv = "model_output/submissions/sub_1/eval_data.csv"
eval_data = pd.read_csv(sub_1_eval_csv)

# The CSV stores the "Tweet" column as text; literal_eval turns each cell back
# into the Python literal it was written from.
eval_data['Tweet'] = eval_data['Tweet'].map(ast.literal_eval)

In [None]:
from utils.finetuned_embedding import get_pre_classifier_output

# Enable the progress_apply variants on pandas objects.
tqdm.pandas()

# Run get_pre_classifier_output on each row's tweets and store the result.
period_tweets = eval_data["Tweet"]
eval_data["Embeddings_app_2"] = period_tweets.progress_apply(get_pre_classifier_output)

In [None]:
# Save the frame, including the new embeddings column, for the second submission.
# NOTE(review): CSV stores each embedding via its string form — confirm that
# the values can be parsed back without loss when this file is reloaded.
sub_2_eval_csv = "model_output/submissions/sub_2/eval_data.csv"
eval_data.to_csv(sub_2_eval_csv, index=False)