In [61]:
from itertools import combinations

import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC

In [62]:
# Read the already-cleaned emotion dataset from its CSV file into a DataFrame
df_emosi = pd.read_csv(
    filepath_or_buffer=r'src/cleaned_emotion.csv',
)

In [63]:
# Inputs are the 'tweet' column; targets are the 'label' column
X, y = df_emosi['tweet'], df_emosi['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [64]:
# Registry of text feature extractors, keyed by the short name used in the results:
#   bow   - CountVectorizer with its default settings
#   tfidf - TfidfVectorizer with its default settings
#   ngram - CountVectorizer with ngram_range=(1, 2), i.e. unigrams plus bigrams
vectorizers = dict(
    bow=CountVectorizer(),
    tfidf=TfidfVectorizer(),
    ngram=CountVectorizer(ngram_range=(1, 2)),
)

In [65]:
# Define the classifiers, keyed by the display name used in the results table.
# Random Forest draws random bootstrap samples and feature subsets, so it is given a
# fixed seed; without it the reported metrics change on every run of the notebook.
models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC()
}

In [66]:
# Accumulator for the evaluation results: a list that receives one dict (row) per
# model / feature-combination run
results: list = []

In [67]:
# Train and evaluate one classifier on each requested combination of feature extractors
def evaluate_model(name, model, vectorizer_combinations):
    """Fit and score ``model`` once per combination of vectorizers.

    For every combination, the named vectorizers are looked up in the
    module-level ``vectorizers`` dict and joined with a ``FeatureUnion``; the
    union and the classifier form a ``Pipeline`` that is fitted on
    ``X_train`` / ``y_train`` and scored on ``X_test`` / ``y_test``.

    Parameters
    ----------
    name : str
        Label stored in the ``'Model'`` field of each result row.
    model : estimator
        Classifier used as the final pipeline step. It is cloned before
        fitting, so the object passed in is left unfitted and unchanged.
    vectorizer_combinations : iterable of iterables of str
        Each inner iterable lists keys of ``vectorizers`` to combine,
        e.g. ``[('bow',), ('bow', 'tfidf')]``.

    Returns
    -------
    list of dict
        The rows produced by this call, in evaluation order. Each row holds
        ``'Model'``, ``'Features'`` (names joined with ``'+'``), ``'Accuracy'``
        and the macro-averaged ``'Precision'``, ``'Recall'`` and ``'F1-Score'``.

    Side effects
    ------------
    Every row is also appended to the module-level ``results`` list.
    """
    new_rows = []
    for vectorizer_names in vectorizer_combinations:
        # Clone each estimator so that fitting this pipeline never mutates the shared
        # objects stored in `vectorizers` / `models` (they are reused by other runs).
        combined_features = FeatureUnion([
            (vec_name, clone(vectorizers[vec_name]))
            for vec_name in vectorizer_names
        ])
        pipeline = Pipeline([
            ("features", combined_features),
            ("classifier", clone(model))
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        row = {
            'Model': name,
            'Features': '+'.join(vectorizer_names),
            'Accuracy': report['accuracy'],
            'Precision': report['macro avg']['precision'],
            'Recall': report['macro avg']['recall'],
            'F1-Score': report['macro avg']['f1-score']
        }
        results.append(row)
        new_rows.append(row)
    return new_rows


In [68]:
# Evaluate every model on every non-empty combination of feature extractors.
# Empty the accumulator first (in place, so evaluate_model keeps appending to the same
# list); otherwise re-running this cell would add duplicate rows to the results.
results.clear()

# The combinations do not depend on the model, so enumerate them once:
# all subsets of size 1, then size 2, ... up to all vectorizers together.
feature_combinations = [
    combo
    for size in range(1, len(vectorizers) + 1)
    for combo in combinations(vectorizers.keys(), size)
]

for model_name, model in models.items():
    evaluate_model(model_name, model, feature_combinations)

In [69]:
# Turn the accumulated list of result rows into a table (one row per evaluation run)
results_df = pd.DataFrame(data=results)


In [70]:
# Show the runs ranked from best to worst macro F1-Score, renumbered from 0
results_df.sort_values(
    by='F1-Score',
    ascending=False,
    ignore_index=True,
)

Unnamed: 0,Model,Features,Accuracy,Precision,Recall,F1-Score
0,Naive Bayes,ngram,0.674234,0.70383,0.667995,0.680426
1,Naive Bayes,bow+tfidf,0.675369,0.700821,0.668567,0.679687
2,Naive Bayes,bow+ngram,0.673099,0.698683,0.66891,0.679439
3,Naive Bayes,tfidf+ngram,0.671964,0.703486,0.66531,0.678579
4,Naive Bayes,bow,0.673099,0.684548,0.672454,0.676479
5,Naive Bayes,bow+tfidf+ngram,0.670829,0.697043,0.664868,0.675957
6,SVM,tfidf,0.652667,0.696266,0.643091,0.658787
7,Random Forest,bow+tfidf,0.629966,0.670739,0.641313,0.645127
8,Random Forest,tfidf+ngram,0.625426,0.666431,0.637684,0.638388
9,Random Forest,ngram,0.627696,0.677261,0.636306,0.637349
