In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Load the combined brand dataset
df = pd.read_csv('Combined_Brands_Dataset_TB.csv')

# Separate English and Indonesian rows.
# `.copy()` turns each filtered subset into an independent DataFrame, so columns
# can be added to it later without pandas raising SettingWithCopyWarning.
english_df = df[df['Language'] == 'en'].copy()
indonesian_df = df[df['Language'] == 'id'].copy()

# Split each language into training and testing sets
# (20% held out; fixed random_state so the split is reproducible)
X_train_en, X_test_en, y_train_en, y_test_en = train_test_split(
    english_df['normalized_text'],
    english_df['Classification'],
    test_size=0.2,
    random_state=42,
)
X_train_id, X_test_id, y_train_id, y_test_id = train_test_split(
    indonesian_df['normalized_text'],
    indonesian_df['Classification'],
    test_size=0.2,
    random_state=42,
)



In [9]:
# Text-classification pipeline: bag-of-words counts feeding a multinomial Naive Bayes model
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])

# Hyper-parameter grid explored by GridSearchCV
# NOTE(review): the 'english' stop-word list is also tried on the Indonesian data —
# confirm that this is intended.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__stop_words': [None, 'english'],
}


def fit_grid_search(texts, labels):
    """Grid-search `pipeline` over `parameters` with 5-fold CV and return the fitted search.

    Parameters
    ----------
    texts
        Training documents handed to the pipeline's vectorizer.
    labels
        Class labels aligned with `texts`.

    Returns
    -------
    GridSearchCV
        The fitted search object; the selected pipeline is available as
        `best_estimator_`.
    """
    search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
    search.fit(texts, labels)
    return search


# Search and keep the best Naive Bayes model for the English dataset
grid_search_en = fit_grid_search(X_train_en, y_train_en)
best_nb_en = grid_search_en.best_estimator_

# Search and keep the best Naive Bayes model for the Indonesian dataset
grid_search_id = fit_grid_search(X_train_id, y_train_id)
best_nb_id = grid_search_id.best_estimator_

# Attach a prediction to every row of each language frame.
# `assign` returns a new DataFrame, so nothing is written onto a filtered slice
# (which is what triggers SettingWithCopyWarning).
# Note: these predictions cover the rows used for fitting as well as the test rows.
english_df = english_df.assign(
    NaiveBayesSentiment=best_nb_en.predict(english_df['normalized_text'])
)
indonesian_df = indonesian_df.assign(
    NaiveBayesSentiment=best_nb_id.predict(indonesian_df['normalized_text'])
)

# Concatenate the English and Indonesian DataFrames
result_df = pd.concat([english_df, indonesian_df])

# Save the result DataFrame to a new CSV file
result_df.to_csv('Combined_Brands_Dataset_NB.csv', index=False)


In [10]:
# Full-dataset reports.
# 'NaiveBayesSentiment' was predicted for every row, including the 80% of rows the
# models were fitted on, so these figures are optimistic and should not be read as
# generalisation performance.
print("Classification Report for English Dataset:")
print(classification_report(english_df['Classification'], english_df['NaiveBayesSentiment']))

print("Classification Report for Indonesian Dataset:")
print(classification_report(indonesian_df['Classification'], indonesian_df['NaiveBayesSentiment']))

# Held-out reports: score only the test split, which was not used for fitting
# or for the cross-validated grid search.
print("Held-out Test Classification Report for English Dataset:")
print(classification_report(y_test_en, best_nb_en.predict(X_test_en)))

print("Held-out Test Classification Report for Indonesian Dataset:")
print(classification_report(y_test_id, best_nb_id.predict(X_test_id)))

Classification Report for English Dataset:
              precision    recall  f1-score   support

    Negative       0.70      0.72      0.71      9421
     Neutral       0.89      0.63      0.74     13831
    Positive       0.83      0.93      0.88     33025

    accuracy                           0.82     56277
   macro avg       0.81      0.76      0.78     56277
weighted avg       0.82      0.82      0.81     56277

Classification Report for Indonesian Dataset:
              precision    recall  f1-score   support

    Negative       0.99      0.72      0.84       216
     Neutral       0.95      0.98      0.97      2060
    Positive       0.94      0.92      0.93       801

    accuracy                           0.95      3077
   macro avg       0.96      0.88      0.91      3077
weighted avg       0.95      0.95      0.95      3077



In [6]:
# `confusion_matrix` is imported with the other sklearn.metrics names in the
# notebook's import cell.
# Arguments are passed as (true labels, predicted labels).

# Confusion matrix for English dataset (all rows, training rows included)
conf_matrix_en = confusion_matrix(english_df['Classification'], english_df['NaiveBayesSentiment'])
print("Confusion Matrix for English Dataset:")
print(conf_matrix_en)

# Confusion matrix for Indonesian dataset (all rows, training rows included)
conf_matrix_id = confusion_matrix(indonesian_df['Classification'], indonesian_df['NaiveBayesSentiment'])
print("\nConfusion Matrix for Indonesian Dataset:")
print(conf_matrix_id)

# Held-out confusion matrices: only the test split, which the models were not fitted on
conf_matrix_en_test = confusion_matrix(y_test_en, best_nb_en.predict(X_test_en))
print("\nHeld-out Test Confusion Matrix for English Dataset:")
print(conf_matrix_en_test)

conf_matrix_id_test = confusion_matrix(y_test_id, best_nb_id.predict(X_test_id))
print("\nHeld-out Test Confusion Matrix for Indonesian Dataset:")
print(conf_matrix_id_test)



Confusion Matrix for English Dataset:
[[ 6787   468  2166]
 [ 1031  8695  4105]
 [ 1815   564 30646]]

Confusion Matrix for Indonesian Dataset:
[[ 156   41   19]
 [   1 2028   31]
 [   0   61  740]]


In [7]:
# English: tally how many rows received each predicted label
sentiment_counts_en = english_df.loc[:, 'NaiveBayesSentiment'].value_counts()
print(
    "\nValue Counts for NaiveBayesSentiment in English Dataset:",
    sentiment_counts_en,
    sep="\n",
)

# Indonesian: same tally for the Indonesian predictions
sentiment_counts_id = indonesian_df.loc[:, 'NaiveBayesSentiment'].value_counts()
print(
    "\nValue Counts for NaiveBayesSentiment in Indonesian Dataset:",
    sentiment_counts_id,
    sep="\n",
)



Value Counts for NaiveBayesSentiment in English Dataset:
NaiveBayesSentiment
Positive    36917
Neutral      9727
Negative     9633
Name: count, dtype: int64

Value Counts for NaiveBayesSentiment in Indonesian Dataset:
NaiveBayesSentiment
Neutral     2130
Positive     790
Negative     157
Name: count, dtype: int64
