In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline, FeatureUnion
from itertools import combinations

In [62]:
# Load the cleaned emotion data
df_emosi = pd.read_csv(r'src\cleaned_pemilu_labeled.csv')

In [63]:
df_emosi.dropna(subset=['label'], inplace=True)

In [64]:
df_emosi['label'].value_counts()

netral    297
senang    265
marah     152
takut      35
Name: label, dtype: int64

In [65]:
df_emosi = df_emosi[df_emosi['label'] != 'takut']
# df_emosi = df_emosi[df_emosi['label'] != 'netral']

In [66]:
# Define features and labels
X = df_emosi['full_text']
y = df_emosi['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [67]:
len(X_train), len(X_test)   

(571, 143)

In [68]:
# Define the feature extraction methods
vectorizers = {
    'bow': CountVectorizer(),
    'tfidf': TfidfVectorizer(),
    'ngram': CountVectorizer(ngram_range=(1, 2))
}

In [69]:
# Define the classifiers
models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

In [70]:
# Create a dictionary to store the results
results = []

In [71]:
# Define a function to train and evaluate the models with combinations of features
def evaluate_model(name, model, vectorizer_combinations):
    for vectorizer_names in vectorizer_combinations:
        features = [vectorizers[name] for name in vectorizer_names]
        combined_features = FeatureUnion([(name, vectorizer) for name, vectorizer in zip(vectorizer_names, features)])
        pipeline = Pipeline([
            ("features", combined_features),
            ("classifier", model)
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        results.append({
            'Model': name,
            'Features': '+'.join(vectorizer_names),
            'Accuracy': report['accuracy'],
            'Precision': report['macro avg']['precision'],
            'Recall': report['macro avg']['recall'],
            'F1-Score': report['macro avg']['f1-score']
        })


In [72]:
# Evaluate models with different feature combinations
for model_name, model in models.items():
    for r in range(1, len(vectorizers) + 1):
        for vectorizer_combinations in combinations(vectorizers.keys(), r):
            evaluate_model(model_name, model, [vectorizer_combinations])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [73]:
# Convert results to a DataFrame
results_df = pd.DataFrame(results)


In [74]:
# Display the results
results_df.sort_values(by='F1-Score', ascending=False).reset_index(drop=True)

Unnamed: 0,Model,Features,Accuracy,Precision,Recall,F1-Score
0,Naive Bayes,bow+ngram,0.475524,0.533911,0.41213,0.385775
1,Naive Bayes,ngram,0.468531,0.63846,0.406922,0.380787
2,Naive Bayes,bow,0.454545,0.417519,0.399978,0.373129
3,Random Forest,tfidf,0.468531,0.521849,0.393033,0.367878
4,Naive Bayes,bow+tfidf,0.468531,0.424564,0.40485,0.367647
5,Naive Bayes,tfidf+ngram,0.468531,0.640625,0.401378,0.364646
6,Random Forest,bow+tfidf,0.475524,0.422591,0.397905,0.363643
7,Naive Bayes,tfidf,0.48951,0.323177,0.411458,0.361918
8,Naive Bayes,bow+tfidf+ngram,0.461538,0.471795,0.396169,0.360446
9,SVM,ngram,0.503497,0.350508,0.399306,0.340308
