# Sentiment Analysis With Multinomial Naive Bayes



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Step 1: import required libs

In [2]:
import nltk
import string
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score

# Step 2: load the dataset

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/naive bayes/DATASET.csv')
df

Unnamed: 0,tweet,Sentimen
0,tahun ngecas mobil listrik menit cepat isi bensin,POSITIF
1,emang formula kenal karena dijadiin kompetisi ...,POSITIF
2,yaa allah paringana mobil listrik,POSITIF
3,ganti motor mobil listrik saja hemat bbm jg ku...,POSITIF
4,dukung eco green gibran mobil dinas pemkot sol...,POSITIF
...,...,...
5238,mosok seh suka orange pakai produk hasil salah...,POSITIF
5239,mending usul kurang keluar apbn mobil jabat de...,POSITIF
5240,ridwan kamil ketua asosiasi daerah hasil migas...,POSITIF
5241,tony hawk konversi mobil listrik pakai corvett...,POSITIF


# Step 3: drop the "NAN" values from dataset

In [4]:
df.dropna(inplace=True)

# Step 4:  drop "Neutral" labeled data if there exists

In [5]:
df = df[df.Sentimen !="NETRAL"]

# Step 5:  inspect the size of the "Positive" and "Negative" labeled data

In [6]:
df.groupby("Sentimen").nunique()

Unnamed: 0_level_0,tweet
Sentimen,Unnamed: 1_level_1
NEGATIF,2057
POSITIF,898


# Step 6:  to avoid overfitting, take equal size of samples from both classes

In [7]:
#  take 637 samples each of classes
df_pos = df[df.Sentimen =="POSITIF"].head(637)
df_neg = df[df.Sentimen =="NEGATIF"].head(637)
df_equ = df_pos.append(df_neg)
len(df_equ)

1274

# Step 7:  shuffle the new dataset

In [8]:
df = shuffle(df_equ)
df.head()

Unnamed: 0,tweet,Sentimen
2423,remuk loe tum ngomongin mobil listrik sumber p...,NEGATIF
349,serius garap mobil listrik saham indika indy d...,POSITIF
29,emang benar nabung electric vehicle motor ato ...,POSITIF
4082,bbm bebas bbm wuling airev mobil listrik nice,POSITIF
2809,instruksi pertalite masyarakat jabat negara pa...,NEGATIF


# Step 8:  drop stopwords, punctuations and lower the sentences

In [9]:
def drop_stopwords(raw_text):
    clean_data = [] 
    for text in raw_text:
        tokens = nltk.word_tokenize(text)
        tokens = [w.lower() for w in tokens]

        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]


        stop_words = set(stops)
        words = [w for w in words if not w in stop_words]
        clean_data.append(" ".join([w for w in words]))
    
    return clean_data

In [11]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [18]:
with open('/content/drive/MyDrive/Colab Notebooks/naive bayes/stopwordsID.txt', 'rb') as sw:
        stops = [s.strip() for s in sw.readlines()]

In [19]:
## to avoid SettingWithCopyWarning, copy the original df as dfa and return the df
dfa = df.copy()
dfa["stemmed"] = drop_stopwords(dfa.tweet)
df = dfa.copy()
df.head()

Unnamed: 0,tweet,Sentimen,stemmed
2423,remuk loe tum ngomongin mobil listrik sumber p...,NEGATIF,remuk loe tum ngomongin mobil listrik sumber p...
349,serius garap mobil listrik saham indika indy d...,POSITIF,serius garap mobil listrik saham indika indy d...
29,emang benar nabung electric vehicle motor ato ...,POSITIF,emang benar nabung electric vehicle motor ato ...
4082,bbm bebas bbm wuling airev mobil listrik nice,POSITIF,bbm bebas bbm wuling airev mobil listrik nice
2809,instruksi pertalite masyarakat jabat negara pa...,NEGATIF,instruksi pertalite masyarakat jabat negara pa...


# Step 9:  drop unnecessary columns in the dataset

In [20]:
df = df.drop(['tweet'], axis=1)
df.head()

Unnamed: 0,Sentimen,stemmed
2423,NEGATIF,remuk loe tum ngomongin mobil listrik sumber p...
349,POSITIF,serius garap mobil listrik saham indika indy d...
29,POSITIF,emang benar nabung electric vehicle motor ato ...
4082,POSITIF,bbm bebas bbm wuling airev mobil listrik nice
2809,NEGATIF,instruksi pertalite masyarakat jabat negara pa...


# Step 10:  split train and test data

In [21]:
X_train, X_test, y_train, y_test = train_test_split(df.stemmed, df.Sentimen, test_size=0.20, random_state = 42)
## shape of the training data
X_train.shape

(1019,)

# Step 11:  vectorize the dataset using "TfidfVectorizer"

In [22]:
vectorizer = TfidfVectorizer(stop_words=stops, ngram_range=(1, 2))
vectorizer.fit(X_train)
#  get the trainig and test features by transforming vectorizer
training_features = vectorizer.transform(X_train)    
test_features = vectorizer.transform(X_test)
## shape of the vectorized training data
training_features.shape

(1019, 12139)

# Step 12:  apply "GridSearchCV" method for "MultinomialNB" classifier 

In [23]:
#  create an instance of "MultinomialNB" classifier 
mnb = MultinomialNB()
# give some tuned_parameters in order to find the best alpha hyperparameter
tuned_parameters = {
    'alpha': [1, 1e-1, 1e-2]
}
# create a scorer to compare the parameters
acc_scorer = make_scorer(accuracy_score)
# create an instance of "GridSearchCV" class and give parameters
grid_obj = GridSearchCV(mnb, tuned_parameters, cv=10, scoring=acc_scorer)
grid_obj = grid_obj.fit(training_features, y_train)
# set the model to the best combination of parameters
model = grid_obj.best_estimator_
# fit the best model for the dataset 
model.fit(training_features, y_train)

MultinomialNB(alpha=1)

# Step 13:  predict the test data

In [24]:
y_pred = model.predict(test_features)
y_pred.shape

(255,)

# Step 14:  observe the accuracy score

In [25]:
print("Accuracy_score on the dataset:{:.2f}".format(accuracy_score(y_test, y_pred)))

Accuracy_score on the dataset:0.82


# Step 15:  create a classification_report

In [26]:
target_names = df.Sentimen.unique()
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

     NEGATIF       0.83      0.81      0.82       131
     POSITIF       0.80      0.82      0.81       124

    accuracy                           0.82       255
   macro avg       0.82      0.82      0.82       255
weighted avg       0.82      0.82      0.82       255



# Step 16:  observe the confusion matrix

In [27]:
confusion_matrix(y_test, y_pred)

array([[106,  25],
       [ 22, 102]])

# Step 17:  observe the prediction probabilities of the wrong predictions

In [28]:
pred_prob = model.predict_proba(test_features)
data = {'neg_ratio': pred_prob[:,0], 'pos_ratio': pred_prob[:,1],'pred': y_pred, 'real': y_test,'stemmed': X_test}
df_pred_prob = pd.DataFrame(data=data)
df_pred_prob = df_pred_prob[df_pred_prob.pred != df_pred_prob.real]
df_pred_prob.head()

Unnamed: 0,neg_ratio,pos_ratio,pred,real,stemmed
4221,0.745137,0.254863,NEGATIF,POSITIF,kendara dinas pakai mobil listrik penting
2747,0.263536,0.736464,POSITIF,NEGATIF,wkwkwk mobil listrik mba yaya trade mark stay ...
2788,0.387385,0.612615,POSITIF,NEGATIF,emng mobil listrik hemat harga mobil mahal bat...
297,0.509663,0.490337,NEGATIF,POSITIF,rugi gw beli harga segitu mending mobil listrik
4201,0.50111,0.49889,NEGATIF,POSITIF,lebih kurang mobil listrik hemat kantong harga...


# Step 18:  apply ten-fold cross validation

In [29]:
#  convert targets to numbers since cross_validate works with numbers
df["target_binary"] = df.Sentimen.replace("POSITIF",1).replace("NEGATIF",0)
scoring_list = ["f1_macro","precision_macro","recall_macro","accuracy"]
scores = cross_validate(model,vectorizer.transform(df.stemmed), df.target_binary, cv=10, scoring=scoring_list)
for scr in scoring_list:
    print(scr+":"+"{:.2f}".format(scores["test_"+scr].mean()))

f1_macro:0.81
precision_macro:0.82
recall_macro:0.81
accuracy:0.81
