### Importing data

In [2]:
import pandas as pd
import numpy as np

In [6]:
# Locations of the pickled train / test DataFrames (relative paths).
train_pickle_path = "pickle_train.pickle"
test_pickle_path = "pickle_test.pickle"

# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
df_train = pd.read_pickle(train_pickle_path)
df_test = pd.read_pickle(test_pickle_path)

In [4]:
# Integer class ids used for the sentiment labels in both splits.
sentiment_to_id = {"negative": 2, "neutral": 0, "positive": 1}

# Series.map yields NaN for any value that is not a key of the mapping, so this
# cell must only be run once on the freshly loaded (string-labelled) frames.
for frame in (df_train, df_test):
    frame["sentiment"] = frame["sentiment"].map(sentiment_to_id)

### Preprocessing

In [104]:
import spacy

# Preprocessing: the spaCy pipeline is loaded once and shared by clean_text.
nlp = spacy.load('de_core_news_lg')


def clean_text(text):
    """Return a normalised version of ``text``.

    The text is run through the spaCy pipeline ``nlp``. Tokens flagged as
    stop words or punctuation are dropped; every remaining token is replaced
    by its lower-cased lemma.

    Args:
        text: Raw input string.

    Returns:
        The kept lemmas joined by single spaces (an empty string when no
        token passes the filter).
    """
    doc = nlp(text)
    kept_lemmas = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(kept_lemmas)


# Add a cleaned copy of the raw 'text' column to both splits.
for frame in (df_train, df_test):
    frame['text_clean'] = frame['text'].apply(clean_text)

### Feature Extraction

In [105]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer: unigram + bigram counts, limited to 2000 features.
# min_df / max_df prune terms by document frequency (see the scikit-learn docs
# for the exact int-vs-float semantics of these thresholds).
count_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.5,
    max_features=2000,
)

# The vocabulary is fitted on the training split only and reused for the test split.
train_corpus, test_corpus = df_train['text_clean'], df_test['text_clean']
X_train_count = count_vectorizer.fit_transform(train_corpus)
X_test_count = count_vectorizer.transform(test_corpus)

In [106]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer: same n-gram range and pruning thresholds as the count
# features, but with TF-IDF weighting.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.5,
    max_features=2000,
)

# The vocabulary and IDF weights are fitted on the training split only.
train_corpus, test_corpus = df_train['text_clean'], df_test['text_clean']
X_train_tfidf = tfidf_vectorizer.fit_transform(train_corpus)
X_test_tfidf = tfidf_vectorizer.transform(test_corpus)

In [None]:
import torch
from transformers import BertModel, BertTokenizer

# Load the pre-trained BERT model and its tokenizer from the same checkpoint.
bert_checkpoint = "bert-base-german-cased"
model = BertModel.from_pretrained(bert_checkpoint)
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

def get_sentence_embeddings(sentence):
    """Embed ``sentence`` with the notebook-level BERT ``model``.

    The sentence is encoded with ``tokenizer`` (truncated to at most 512
    token ids) and passed through ``model`` without gradient tracking. When
    CUDA is available, both the input and the model are moved to the GPU
    first.

    Args:
        sentence: Text to embed.

    Returns:
        NumPy array holding the hidden state at the first position of the
        first (only) sequence in the batch -- presumably the [CLS] token,
        assuming the tokenizer prepends its special tokens.
    """
    token_ids = tokenizer.encode(sentence, max_length=512, truncation=True)
    # Wrap in a list so the tensor has a leading batch dimension of size 1.
    input_ids = torch.tensor([token_ids])

    if torch.cuda.is_available():
        input_ids = input_ids.to('cuda')
        model.to('cuda')

    with torch.no_grad():
        # Element 0 of the model output is the last hidden state.
        hidden_states = model(input_ids)[0]

    hidden_states = hidden_states.cpu().numpy()
    first_sequence = hidden_states[0]
    return first_sequence[0]

# The embeddings are read from the "embedding" column of the loaded frames instead
# of being recomputed here, because get_sentence_embeddings needs to be run on a GPU.
# Assumes the pickled frames already contain that column -- verify before running.
X_train_emb, X_test_emb = (
    np.array(frame["embedding"].tolist()) for frame in (df_train, df_test)
)


### Training a Classifier

In [167]:
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Candidate classifiers, all with library-default hyperparameters.
xgb_model = xgb.XGBClassifier()

# MLP training is stochastic (random weight initialisation and batch shuffling).
# A fixed random_state makes the reported F1 scores reproducible across re-runs.
mlp_model = MLPClassifier(random_state=42)

svm_model = SVC()

In [None]:
from sklearn.metrics import f1_score
# Every classifier below is trained and scored on every feature representation.
list_of_models = [svm_model, xgb_model, mlp_model]
list_of_features = [(X_train_count, X_test_count), (X_train_tfidf, X_test_tfidf), (X_train_emb, X_test_emb)]

name_of_models = ["SVM", "XGBoost", "MLP"]
name_of_features = ["CountVectorizer", "TfidfVectorizer", "BERT"]

# Train and evaluate models.
# The loop variable is deliberately NOT called `model`: that name is bound to the
# BERT encoder read by get_sentence_embeddings and must not be overwritten here.
result_rows = []
for clf, name_of_model in zip(list_of_models, name_of_models):
    for (X_train, X_test), name_of_feature in zip(list_of_features, name_of_features):
        # fit() retrains the estimator from scratch for each feature set.
        clf.fit(X_train, df_train["sentiment"])
        y_pred = clf.predict(X_test)
        f1 = f1_score(df_test["sentiment"], y_pred, average="weighted")
        result_rows.append({"model": name_of_model, "features": name_of_feature, "f1_score": f1})

# Build the results table once from the collected rows. DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
df_results = pd.DataFrame(result_rows, columns=["model", "features", "f1_score"])

### Rule-Based Sentiment for comparison

In [169]:
from textblob_de import TextBlobDE
def _polarity_to_label(polarity):
    """Convert a sentiment score into the label ids used for the 'sentiment' column.

    Returns 0 for a score of exactly zero, 1 for a positive score and 2 otherwise.
    """
    if polarity == 0:
        return 0
    if polarity > 0:
        return 1
    return 2


# The first component of TextBlobDE(...).sentiment is used as the score of each test text.
textblob_score = df_test["text"].apply(lambda text: TextBlobDE(text).sentiment[0])
df_test["textblob"] = textblob_score.apply(_polarity_to_label)

In [None]:
# Add the rule-based TextBlob baseline to the results table.
# pd.concat is used because DataFrame.append was removed in pandas 2.0. Any TextBlob
# row left over from an earlier run of this cell is dropped first, so re-running the
# cell does not duplicate the baseline.
textblob_row = pd.DataFrame([{
    "model": "TextBlob",
    "features": "Rulebased",
    "f1_score": f1_score(df_test["sentiment"], df_test["textblob"], average="weighted"),
}])
df_results = pd.concat(
    [df_results[df_results["model"] != "TextBlob"], textblob_row],
    ignore_index=True,
)

### Visualize Results

In [179]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One bar-chart panel per model: (model name as stored in df_results, x-axis caption).
panels = [
    ("SVM", "Support Vector Machine"),
    ("XGBoost", "Gradient Boosting"),
    ("MLP", "Multi Layer Perceptron"),
    ("TextBlob", "Text-Blob"),
]

fig = make_subplots(rows=1, cols=4, specs=[[{"type": "bar"} for _ in panels]])

# Each panel shows the F1 score per feature representation for one model.
for col, (model_name, _) in enumerate(panels, start=1):
    model_rows = df_results[df_results["model"] == model_name]
    fig.add_trace(go.Bar(x=model_rows["features"], y=model_rows["f1_score"], name=model_name), 1, col)

# Make the scale the same for all subplots so the bars are directly comparable.
for col in range(1, len(panels) + 1):
    fig.update_yaxes(range=[0, 0.8], row=1, col=col)

# Overall figure title, plus a caption under each subplot.
fig.update_layout(title_text="F1-Scores", title_x=0.5)
for col, (_, caption) in enumerate(panels, start=1):
    fig.update_xaxes(title_text=caption, row=1, col=col)


# Render inline and also save a standalone HTML copy.
fig.show()
fig.write_html("f1_score.html")