In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import warnings

warnings.filterwarnings('ignore')



mapping = {"NEG":0, "NEU":1, "POS":2}

In [2]:
def parse_embedding(embedding_str):
        embedding_str = embedding_str.strip("[]")
        return np.array(embedding_str.split(), dtype=np.float32)

In [3]:
train_df = pd.read_csv("../data/train_paraphrased.csv")
val_df = pd.read_csv("../data/val_paraphrased.csv")
test_df = pd.read_csv("../data/test_paraphrased.csv")

### Using embeddings for the model to train

In [4]:
train_df["embeddings_ingles"] = train_df["embeddings_ingles"].apply(lambda x: parse_embedding(x))
val_df["embeddings_ingles"] = val_df["embeddings_ingles"].apply(lambda x: parse_embedding(x))
test_df["embeddings_ingles"] = test_df["embeddings_ingles"].apply(lambda x: parse_embedding(x))

In [5]:
X_train = np.vstack(train_df["embeddings_ingles"].values)
Y_train = train_df["label"].values


### RandomForest

In [7]:
model = RandomForestClassifier(random_state=42)

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400],           # Number of trees
    'max_depth': [10, 20, 30],           # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],           # Minimum number of samples to split an internal node
    'min_samples_leaf': [1, 2, 4],             # Minimum number of samples at a leaf node
}

# Setup the grid search
grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           scoring='f1_macro',
                           verbose=1)

# Fit the model to your training data
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [8]:
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters found: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best cross-validation score: 0.3393918858874368


In [9]:
best_model = grid_search.best_estimator_

In [10]:
y_pred_train = best_model.predict(X_train)

In [11]:
print(classification_report(Y_train, y_pred_train, target_names=mapping.keys()))

              precision    recall  f1-score   support

         NEG       1.00      1.00      1.00      1371
         NEU       0.98      1.00      0.99      2220
         POS       1.00      0.90      0.95       443

    accuracy                           0.99      4034
   macro avg       0.99      0.97      0.98      4034
weighted avg       0.99      0.99      0.99      4034



#### Val Results

In [18]:
X_val = np.vstack(val_df["embeddings_ingles"].values)
Y_val = val_df["label"].values

In [20]:
y_pred = best_model.predict(X_val)
print(classification_report(Y_val, y_pred, target_names=mapping.keys()))

              precision    recall  f1-score   support

         NEG       0.62      0.20      0.30       196
         NEU       0.58      0.93      0.71       318
         POS       0.00      0.00      0.00        63

    accuracy                           0.58       577
   macro avg       0.40      0.38      0.34       577
weighted avg       0.53      0.58      0.50       577



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Test Results

In [21]:
X_test = np.vstack(test_df["embeddings_ingles"].values)
Y_test = test_df["label"].values

In [22]:
y_pred = best_model.predict(X_test)
print(classification_report(Y_test, y_pred, target_names=mapping.keys()))

              precision    recall  f1-score   support

         NEG       0.63      0.18      0.27       392
         NEU       0.58      0.95      0.72       634
         POS       0.00      0.00      0.00       127

    accuracy                           0.58      1153
   macro avg       0.40      0.37      0.33      1153
weighted avg       0.53      0.58      0.49      1153



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### SVM

In [28]:
model = SVC(random_state=42)

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],                     # Regularization parameter
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],  # Kernel coefficient for ‘rbf’, ‘poly’, and ‘sigmoid’
    'kernel': ['linear', 'rbf', 'poly']         # Specifies the kernel type
}

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           cv=5,
                           n_jobs=-1,
                           scoring='accuracy',
                           verbose=1)

# Fit the model to your training data
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [29]:
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters found: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best cross-validation score: 0.5984155389719606


In [30]:
best_model = grid_search.best_estimator_

#### Val Results

In [31]:
X_val = np.vstack(val_df["embeddings_ingles"].values)
Y_val = val_df["label"].values

In [32]:
y_pred = best_model.predict(X_val)
print(classification_report(Y_val, y_pred, target_names=mapping.keys()))

              precision    recall  f1-score   support

         NEG       0.57      0.48      0.52       196
         NEU       0.63      0.81      0.71       318
         POS       0.00      0.00      0.00        63

    accuracy                           0.61       577
   macro avg       0.40      0.43      0.41       577
weighted avg       0.54      0.61      0.57       577



#### Test Results

In [33]:
X_test = np.vstack(test_df["embeddings_ingles"].values)
Y_test = test_df["label"].values

In [34]:
y_pred = best_model.predict(X_test)
print(classification_report(Y_test, y_pred, target_names=mapping.keys()))

              precision    recall  f1-score   support

         NEG       0.54      0.42      0.47       392
         NEU       0.62      0.82      0.70       634
         POS       0.00      0.00      0.00       127

    accuracy                           0.59      1153
   macro avg       0.38      0.41      0.39      1153
weighted avg       0.52      0.59      0.55      1153

