# Baseline Model and Feature Selection

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline, FeatureUnion
from itertools import combinations

In [11]:
# Load the cleaned emotion data
df_emosi = pd.read_csv(r'src/cleaned_emotion_2.csv')

In [12]:
df_emosi.dropna(inplace=True)

In [13]:
# Define features and labels
X = df_emosi['tweet']
y = df_emosi['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
# Define the feature extraction methods
vectorizers = {
    'bow': CountVectorizer(),
    'tfidf': TfidfVectorizer(),
    'ngram': CountVectorizer(ngram_range=(1, 2))
}

In [15]:
# Define the classifiers
models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

In [16]:
# Create a dictionary to store the results
results = []

In [17]:
# Define a function to train and evaluate the models with combinations of features
def evaluate_model(name, model, vectorizer_combinations):
    for vectorizer_names in vectorizer_combinations:
        features = [vectorizers[name] for name in vectorizer_names]
        combined_features = FeatureUnion([(name, vectorizer) for name, vectorizer in zip(vectorizer_names, features)])
        pipeline = Pipeline([
            ("features", combined_features),
            ("classifier", model)
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        results.append({
            'Model': name,
            'Features': '+'.join(vectorizer_names),
            'Accuracy': report['accuracy'],
            'Precision': report['macro avg']['precision'],
            'Recall': report['macro avg']['recall'],
            'F1-Score': report['macro avg']['f1-score']
        })


In [18]:
# Evaluate models with different feature combinations
for model_name, model in models.items():
    for r in range(1, len(vectorizers) + 1):
        for vectorizer_combinations in combinations(vectorizers.keys(), r):
            evaluate_model(model_name, model, [vectorizer_combinations])

In [19]:
# Convert results to a DataFrame
results_df = pd.DataFrame(results)


In [20]:
# Display the results
results_df.sort_values(by='F1-Score', ascending=False).reset_index(drop=True)

Unnamed: 0,Model,Features,Accuracy,Precision,Recall,F1-Score
0,Random Forest,tfidf,0.730932,0.744781,0.738857,0.738726
1,Random Forest,bow+tfidf,0.721045,0.72957,0.737514,0.72973
2,Random Forest,tfidf+ngram,0.719633,0.734323,0.729049,0.727664
3,SVM,tfidf+ngram,0.716808,0.732096,0.727495,0.726025
4,Random Forest,bow+tfidf+ngram,0.717514,0.727438,0.733696,0.725697
5,SVM,bow+tfidf+ngram,0.714689,0.731998,0.724362,0.724653
6,SVM,ngram,0.713983,0.730136,0.725553,0.724053
7,SVM,bow+ngram,0.711864,0.729606,0.72194,0.722193
8,SVM,tfidf,0.713983,0.743755,0.712341,0.721433
9,SVM,bow+tfidf,0.708333,0.735829,0.712238,0.719273


# Hyperparameter tuning in Random Forest

In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid, cross_val_score
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from tqdm.notebook import tqdm
import warnings

# Define the feature extraction steps
vectorizer_bow = CountVectorizer()
vectorizer_tfidf = TfidfVectorizer()

# Combine the features using FeatureUnion
combined_features = FeatureUnion([
    ("bow", vectorizer_bow),
    ("tfidf", vectorizer_tfidf)
])

# Create a pipeline that first transforms the data and then applies the model
pipeline = Pipeline([
    ("features", combined_features),
    ("clf", RandomForestClassifier())
])

param_grid = {
     "clf__n_estimators": [100, 200, 500],
    "clf__max_depth": [None, 10, 20],
    "clf__max_features": ['sqrt', 'log2', 0.2, 0.5, 0.8],
    # "clf__min_samples_split": [2, 5, 10],
    # "clf__min_samples_leaf": [1, 2, 4]
}

# Create parameter grid
param_grid = list(ParameterGrid(param_grid))

# Create the progress bar
progress_bar = tqdm(total=len(param_grid))

# Variable to keep track of the best score and best parameters
best_score = -np.inf
best_params = None

# Manually iterate over the parameter grid
for params in param_grid:
    pipeline.set_params(**params)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        scores = cross_val_score(pipeline, X_train, y_train, cv=3, n_jobs=-1)
    score = np.mean(scores)
    if score > best_score:
        best_score = score
        best_params = params
    progress_bar.update(1)

progress_bar.close()

print(f"Best Score: {best_score}")
print(f"Best Parameters: {best_params}")

# Set the best parameters and train the final model
pipeline.set_params(**best_params)
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print("RandomForestClassifier with Combined Features")
print(classification_report(y_test, y_pred))

  0%|          | 0/45 [00:00<?, ?it/s]

Best Score: 0.6998763469351705
Best Parameters: {'clf__max_depth': None, 'clf__max_features': 'sqrt', 'clf__n_estimators': 100}
RandomForestClassifier with Combined Features
              precision    recall  f1-score   support

       Anger       0.72      0.74      0.73       233
        Fear       0.78      0.82      0.80       198
         Joy       0.79      0.69      0.74       240
        Love       0.66      0.86      0.75       161
     Neutral       0.68      0.65      0.67       394
         Sad       0.75      0.67      0.71       190

    accuracy                           0.72      1416
   macro avg       0.73      0.74      0.73      1416
weighted avg       0.73      0.72      0.72      1416

