# **Sentiment Analysis**
## Melvin Kent Jonathan / 13521052
## IF5153 Natural Language Processing

In [135]:
pip install nltk scikit-learn pandas PySastrawi



In [136]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.metrics import classification_report

## NLP Tools Initialization

In [137]:
# Create the Sastrawi stemmer used by the preprocessing step
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

# Download NLTK data for tokenization and stopword removal.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK
# releases make word_tokenize load its tokenizer data from that package;
# without it a fresh environment fails with a LookupError.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load Indonesian stopwords from NLTK (a set gives fast membership checks)
stop_words = set(stopwords.words('indonesian'))

# Initialize the three Bag of Words vectorizers compared in this notebook
binary_vectorizer = CountVectorizer(binary=True) # binary as value
tf_vectorizer = CountVectorizer() # count as value
tfidf_vectorizer = TfidfVectorizer() # tfidf as value

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing
1. lowercase
2. tokenization
3. removing stop words and punctuation
4. stemming

In [138]:
# Define a function to preprocess the text
def preprocess_text(text):
  """Turn a raw text into a space-separated string of stemmed tokens.

  Steps: lowercase the text, tokenize it with ``word_tokenize``, drop tokens
  that are in ``stop_words`` or that consist solely of punctuation
  characters, stem every remaining token with ``stemmer`` and join the
  results with single spaces.

  Args:
    text: The raw text to preprocess (a ``str``).

  Returns:
    The preprocessed text as a single string.
  """
  punctuation_chars = set(string.punctuation)

  # 1. Convert text to lowercase
  text = text.lower()

  # 2. Tokenize the text
  tokens = word_tokenize(text)

  # 3. Remove stopwords and punctuation.
  #    The punctuation test is done per character: `word in string.punctuation`
  #    is a substring test, so multi-character tokens such as "..." or "--"
  #    would not be recognised as punctuation and would slip through.
  tokens = [
      word for word in tokens
      if word not in stop_words
      and not all(char in punctuation_chars for char in word)
  ]

  # 4. Stem the tokens
  tokens = [stemmer.stem(word) for word in tokens]

  # Rejoin the tokens into a single string
  processed_text = ' '.join(tokens)

  return processed_text

## Metrics Function

In [149]:
# Define a function to display the metrics
def metrics(y_test, y_pred, target_names=('Negative', 'Neutral', 'Positive')):
  """Print evaluation metrics for a set of predictions.

  Prints accuracy plus macro-averaged precision, recall and F1, followed by
  the scikit-learn classification report and a cross-tabulated confusion
  matrix (with row/column totals).

  Args:
    y_test: The true labels.
    y_pred: The predicted labels, aligned position-by-position with y_test.
    target_names: Display names used for the classes in the classification
      report, in the order the report lists the labels. Defaults to the
      three sentiment classes used in this notebook.

  Returns:
    None. All results are printed.
  """
  # Calculate metrics (macro average: every class weighs the same)
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, average='macro')
  recall = recall_score(y_test, y_pred, average='macro')
  f1_score_value = f1_score(y_test, y_pred, average='macro')

  print(f'Accuracy  : {accuracy:.4f}')
  print(f'Precision : {precision:.4f}')
  print(f'Recall    : {recall:.4f}')
  print(f'F1-score  : {f1_score_value:.4f}')

  # Generate and display classification report
  report = classification_report(y_test, y_pred, target_names=list(target_names))
  print(report)

  # Generate confusion matrix (margins=True adds the "All" totals)
  confusion_table = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)
  print("Confusion Matrix")
  print(confusion_table)

## Import Data for Training and Testing

In [140]:
# Column names are supplied explicitly when reading the TSV files
headers = ['text', 'label']

# Read the train and test splits (tab-separated)
train = pd.read_csv("nlp_data/train_preprocess.tsv", sep='\t', names=headers)
test = pd.read_csv("nlp_data/test_preprocess.tsv", sep='\t', names=headers)

# Add the preprocessed version of every text to both splits
for split in (train, test):
  split['preprocessed_text'] = split['text'].apply(preprocess_text)

In [141]:
# Preview raw vs. preprocessed text for the first rows of the training set
train.loc[:, ['text', 'preprocessed_text']].head()

Unnamed: 0,text,preprocessed_text
0,warung ini dimiliki oleh pengusaha pabrik tahu...,warung milik usaha pabrik puluh kenal putih ba...
1,mohon ulama lurus dan k212 mmbri hujjah partai...,mohon ulama lurus k212 mmbri hujjah partai diw...
2,lokasi strategis di jalan sumatera bandung . t...,lokasi strategis jalan sumatera bandung nya ny...
3,betapa bahagia nya diri ini saat unboxing pake...,betapa bahagia nya unboxing paket barang nya b...
4,duh . jadi mahasiswa jangan sombong dong . kas...,duh mahasiswa sombong kasih kartu kuning ajar ...


In [142]:
# Preview raw vs. preprocessed text for the first rows of the test set
test.loc[:, ['text', 'preprocessed_text']].head()

Unnamed: 0,text,preprocessed_text
0,kemarin gue datang ke tempat makan baru yang a...,kemarin gue makan dago gue makan nya enak harg...
1,kayak nya sih gue tidak akan mau balik lagi ke...,kayak nya sih gue gila ya gue ngerti nya biar ...
2,"kalau dipikir-pikir , sebenarnya tidak ada yan...",pikir bangga jokowi nepatin janji kerja nya ci...
3,ini pertama kalinya gua ke bank buat ngurusin ...,kali gua bank ngurusin buat rekening nama nya ...
4,waktu sampai dengan gue pernah disuruh ibu lat...,gue suruh latih karate nya biar gue lawan jaha...


# Bag of Words using Binary, TF, TFxIDF

In [150]:
# Model inputs (preprocessed text) and targets (sentiment labels) per split
train_preprocessed_texts, y_train = train['preprocessed_text'], train['label']
test_preprocessed_texts, y_test = test['preprocessed_text'], test['label']

# Build the Bag of Words matrices. Each vectorizer learns its vocabulary from
# the training texts only (fit_transform) and is then applied to the test
# texts (transform).

## binary
X_train_binary, X_test_binary = (
    binary_vectorizer.fit_transform(train_preprocessed_texts),
    binary_vectorizer.transform(test_preprocessed_texts),
)

## TF
X_train_tf, X_test_tf = (
    tf_vectorizer.fit_transform(train_preprocessed_texts),
    tf_vectorizer.transform(test_preprocessed_texts),
)

## TFxIDF
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(train_preprocessed_texts),
    tfidf_vectorizer.transform(test_preprocessed_texts),
)

## Train and Evaluate Function

In [151]:
def train_and_evaluate(classifier, X_train_binary, X_train_tf, X_train_tfidf, y_train, X_test_binary, X_test_tf, X_test_tfidf, y_test):
  """Fit `classifier` on each feature representation and print its test metrics.

  The same classifier object is fitted three times, in the order Binary, TF,
  TFxIDF. After each fit it predicts on the matching test matrix, and a
  banner followed by the output of `metrics` is printed.

  Args:
    classifier: Estimator exposing `fit(X, y)` and `predict(X)`.
    X_train_binary, X_train_tf, X_train_tfidf: Training feature matrices.
    y_train: Training labels.
    X_test_binary, X_test_tf, X_test_tfidf: Test feature matrices.
    y_test: Test labels.

  Returns:
    None. All results are printed.
  """
  # (banner, training features, test features) for every representation
  runs = (
      ("========================= Binary =========================", X_train_binary, X_test_binary),
      ("========================= TF =========================", X_train_tf, X_test_tf),
      ("========================= TFxIDF =========================", X_train_tfidf, X_test_tfidf),
  )

  for banner, features_train, features_test in runs:
    # Train and make predictions
    classifier.fit(features_train, y_train)
    predictions = classifier.predict(features_test)

    print(banner)
    metrics(y_test, predictions)

# Logistic Regression

In [152]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression classifier configured with max_iter=500
classifier = LogisticRegression(max_iter=500)

# Fit on every feature representation and print the test-set metrics
train_and_evaluate(
    classifier=classifier,
    X_train_binary=X_train_binary,
    X_train_tf=X_train_tf,
    X_train_tfidf=X_train_tfidf,
    y_train=y_train,
    X_test_binary=X_test_binary,
    X_test_tf=X_test_tf,
    X_test_tfidf=X_test_tfidf,
    y_test=y_test,
)

Accuracy  : 0.7560
Precision : 0.7597
Recall    : 0.7032
F1-score  : 0.7181
              precision    recall  f1-score   support

    Negative       0.73      0.88      0.80       204
     Neutral       0.76      0.48      0.59        88
    Positive       0.79      0.75      0.77       208

    accuracy                           0.76       500
   macro avg       0.76      0.70      0.72       500
weighted avg       0.76      0.76      0.75       500

Confusion Matrix
Predicted  negative  neutral  positive  All
Actual                                     
negative        179        4        21  204
neutral          24       42        22   88
positive         42        9       157  208
All             245       55       200  500
Accuracy  : 0.7440
Precision : 0.7325
Recall    : 0.6934
F1-score  : 0.7048
              precision    recall  f1-score   support

    Negative       0.72      0.85      0.78       204
     Neutral       0.69      0.48      0.56        88
    Positive       0.79

# Support Vector Machines (SVM)

In [153]:
from sklearn.svm import SVC

# Support Vector Machine classifier with a linear kernel
classifier = SVC(kernel='linear')

# Fit on every feature representation and print the test-set metrics
train_and_evaluate(
    classifier=classifier,
    X_train_binary=X_train_binary,
    X_train_tf=X_train_tf,
    X_train_tfidf=X_train_tfidf,
    y_train=y_train,
    X_test_binary=X_test_binary,
    X_test_tf=X_test_tf,
    X_test_tfidf=X_test_tfidf,
    y_test=y_test,
)

Accuracy  : 0.7060
Precision : 0.6927
Recall    : 0.6669
F1-score  : 0.6755
              precision    recall  f1-score   support

    Negative       0.69      0.79      0.74       204
     Neutral       0.64      0.50      0.56        88
    Positive       0.75      0.71      0.73       208

    accuracy                           0.71       500
   macro avg       0.69      0.67      0.68       500
weighted avg       0.71      0.71      0.70       500

Confusion Matrix
Predicted  negative  neutral  positive  All
Actual                                     
negative        162       15        27  204
neutral          23       44        21   88
positive         51       10       147  208
All             236       69       195  500
Accuracy  : 0.7100
Precision : 0.6877
Recall    : 0.6723
F1-score  : 0.6776
              precision    recall  f1-score   support

    Negative       0.69      0.78      0.73       204
     Neutral       0.59      0.51      0.55        88
    Positive       0.78

# k-Nearest Neighbors (k-NN)

In [154]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN classifier using the 3 nearest neighbours
classifier = KNeighborsClassifier(n_neighbors=3)

# Fit on every feature representation and print the test-set metrics
train_and_evaluate(
    classifier=classifier,
    X_train_binary=X_train_binary,
    X_train_tf=X_train_tf,
    X_train_tfidf=X_train_tfidf,
    y_train=y_train,
    X_test_binary=X_test_binary,
    X_test_tf=X_test_tf,
    X_test_tfidf=X_test_tfidf,
    y_test=y_test,
)

Accuracy  : 0.5420
Precision : 0.5837
Recall    : 0.4523
F1-score  : 0.4329
              precision    recall  f1-score   support

    Negative       0.47      0.76      0.59       204
     Neutral       0.60      0.07      0.12        88
    Positive       0.68      0.52      0.59       208

    accuracy                           0.54       500
   macro avg       0.58      0.45      0.43       500
weighted avg       0.58      0.54      0.51       500

Confusion Matrix
Predicted  negative  neutral  positive  All
Actual                                     
negative        156        3        45  204
neutral          75        6         7   88
positive         98        1       109  208
All             329       10       161  500
Accuracy  : 0.5380
Precision : 0.5712
Recall    : 0.4689
F1-score  : 0.4654
              precision    recall  f1-score   support

    Negative       0.48      0.79      0.60       204
     Neutral       0.56      0.17      0.26        88
    Positive       0.68

# Naive Bayes

In [155]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier with its default settings
classifier = MultinomialNB()

# Fit on every feature representation and print the test-set metrics
train_and_evaluate(
    classifier=classifier,
    X_train_binary=X_train_binary,
    X_train_tf=X_train_tf,
    X_train_tfidf=X_train_tfidf,
    y_train=y_train,
    X_test_binary=X_test_binary,
    X_test_tf=X_test_tf,
    X_test_tfidf=X_test_tfidf,
    y_test=y_test,
)

Accuracy  : 0.6520
Precision : 0.6950
Recall    : 0.5851
F1-score  : 0.5917
              precision    recall  f1-score   support

    Negative       0.61      0.91      0.73       204
     Neutral       0.76      0.30      0.43        88
    Positive       0.71      0.55      0.62       208

    accuracy                           0.65       500
   macro avg       0.70      0.59      0.59       500
weighted avg       0.68      0.65      0.63       500

Confusion Matrix
Predicted  negative  neutral  positive  All
Actual                                     
negative        186        0        18  204
neutral          34       26        28   88
positive         86        8       114  208
All             306       34       160  500
Accuracy  : 0.6400
Precision : 0.6554
Recall    : 0.5842
F1-score  : 0.5874
              precision    recall  f1-score   support

    Negative       0.60      0.91      0.72       204
     Neutral       0.64      0.34      0.44        88
    Positive       0.73