<div class="alert alert-block alert-success">

# **1.** **Setup**

<div>

In [1]:
from utils import *

In [2]:
# Set random seeds for reproducibility
tf.random.set_seed(221)
random.seed(221)
np.random.seed(221)
tf.random.set_seed(221)

## **1.1** Datasets

In [3]:
# Load the train/val split data
with open('train_val_split.pkl', 'rb') as f:
    data = pickle.load(f)

# Store the data in variables
x_train = data['x_train']
x_val = data['x_val']
y_train = data['y_train']
y_val = data['y_val']

In [4]:
# For EXTRA

# Load the train/val split data without preprocessing
with open('train_val_split_no_preproc.pkl', 'rb') as f:
    data_no_preproc = pickle.load(f)

# Convert DataFrames to list
train_texts = data_no_preproc['x_train'].tolist()
val_texts = data_no_preproc['x_val'].tolist()

# Convert Series to list
train_labels = data_no_preproc['y_train'].tolist()
val_labels = data_no_preproc['y_val'].tolist()

In [5]:
with open("X_train_te3s_embeddings.pkl", "rb") as f:
    X_train_te3s = pickle.load(f)

with open("X_val_te3s_embeddings.pkl", "rb") as f:
    X_val_te3s = pickle.load(f)

In [6]:
with open("X_train_roberta_embeddings.pkl", "rb") as f:
    X_train_roberta = pickle.load(f)

with open("X_val_roberta_embeddings.pkl", "rb") as f:
    X_val_roberta = pickle.load(f)

## **1.3** General

In [6]:
corpus = x_train['text']

#get list with lenghts of sentences
train_len = []
for i in corpus:
    train_len.append(len(i))

vector_size = max(train_len)

metrics_df = []

In [7]:
tokenized_train = [word_tokenize(tweet.lower()) for tweet in x_train['text']]
max_seq_len = max(len(tokens) for tokens in tokenized_train)

## **1.4** Models

### Glove

In [8]:
model_name = 'glove-twitter'
glove_model = gensim.downloader.load(f'{model_name}-{emb_size}')

### Text Embeddings 3 Small

In [9]:
# EXTRA

# Load variables from .env into environment
load_dotenv()

# Print environment variable
print("AZURE_OPENAI_ENDPOINT:", os.getenv("AZURE_OPENAI_ENDPOINT"))

# Initialize Azure OpenAI client
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version="2024-02-01",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Define embedding model
model = "text-embedding-3-small"

AZURE_OPENAI_ENDPOINT: https://novaimsplayground.openai.azure.com/


In [10]:
# Define embedding model text embedding
model_te3s = "text-embedding-3-small"

### Roberta

In [11]:
# Load the pre-trained model and tokenizer for roberta
model_name = "cardiffnlp/twitter-roberta-base"
tokenizer_roberta = AutoTokenizer.from_pretrained(model_name)
model_roberta = AutoModel.from_pretrained(model_name)

### LSTM

In [12]:
input_ = Input(shape=(max_seq_len, vector_size))

x = Masking(mask_value=0.0)(input_)
x = Bidirectional(LSTM(units=units, return_sequences=False, dropout=dropout, recurrent_dropout=dropout))(x)
x = Dropout(dropout)(x)
output = Dense(num_class, activation='softmax')(x)

<div class="alert alert-block alert-success">

# **2.** **Hyperparameter Tuning**

<div>

## **2.1** LR with RoBERTa

In [7]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit

In [8]:
X_combined = np.concatenate([X_train_roberta, X_val_roberta])
y_combined = np.concatenate([y_train, y_val])
test_fold = [-1]*len(X_train_roberta) + [0]*len(X_val_roberta)
ps = PredefinedSplit(test_fold)


In [9]:
param_grid_lr_roberta = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l2"], 
    "solver": ["lbfgs", "saga"],
    "class_weight": ["balanced"],
    "max_iter": [ 200, 500], 
    "multi_class": ["multinomial"]
}


In [10]:
grid_lr_roberta = GridSearchCV(
    LogisticRegression(),
    param_grid=param_grid_lr_roberta,
    scoring='f1_macro',
    cv=ps,
    verbose=1,
    n_jobs=-1
)

In [11]:
print(len(X_train_roberta), len(y_train))
print(len(X_val_roberta), len(y_val))

7634 7630
1909 1909


In [12]:
grid_lr_roberta.fit(X_combined, y_combined)

ValueError: Found input variables with inconsistent numbers of samples: [9543, 9539]

## **2.2** LR with Text Embedding

In [None]:
param_grid_lr = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["elasticnet"], 
    "solver": ["saga"],
    "class_weight": ["balanced"],
    "max_iter": [200, 500]
}


## **2.3** XGBoost with Text Embedding 3 small

In [14]:
param_grid_xgb_text3 = {
    "n_estimators": [300, 500,1000],
    "learning_rate": [0.05, 0.1, 0.5, 0.7],
    "max_depth": [3, 5, 7],
    "min_child_weight": [1, 3],
    "gamma": [0, 0.2, 0.5],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.7, 0.8, 1.0],
    "reg_alpha": [0, 0.5],
    "reg_lambda": [1.0, 2.0]
}


## **2.4** Random Forest Bow Bigrams

<div class="alert alert-block alert-success">

# **3.** **Final Predictions**

<div>

In [None]:
# Load the test dataset
test_data = pd.read_csv("../data/test.csv")
test_texts = test_data["text"].tolist()

# Get embeddings for test set
X_test_roberta = np.array(get_roberta_embeddings(train_texts, "X_test_roberta_embeddings.pkl", batch_size=32, force_reload=False))

In [None]:
# Predict labels with the trained classifier
X_test_te3s_pred = clf_roberta_lr.predict(X_test_te3s)

# Create submission DataFrame
submission_te3s = pd.DataFrame({
    "id": test_data["id"],
    "label": X_test_roberta_pred
})

# Save submission to CSV
submission_te3s.to_csv("roberta_lr_pred_25.csv", index=False)
print("Submission file saved as roberta_lr_pred_25.csv")

In [None]:
# Load the predictions from the saved CSV file
pred_25 = pd.read_csv("roberta_lr_pred_25.csv")
pred_25.head()