# LIBRARY

In [1]:
import pandas as pd
import numpy as np
import re, joblib
import collections
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

# DATASET

In [2]:
# Load the labelled tweet dataset from CSV and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# previously saved index — consider reading with index_col=0 if it is unused.
DATASET_PATH = 'dataset.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,label,tweet
0,anger,soal jalan jatibaru polisi tidak bisa gertak g...
1,anger,sesama cewek lho kayaknya harusnya bisa leb...
2,happy,kepingin gudeg mbarek bu hj amad foto dari go...
3,anger,jln jatibaru bagian dari wilayah tanah abang p...
4,happy,sharing pengalaman saja kemarin jam batalin...


# TWEET DAN LABEL DATASET

In [3]:
# Inputs: the tweet column as a pandas Series.
# Targets: the label column converted to a NumPy array.
text, Y = df['tweet'], df['label'].values

# VEKTORISASI

## 1. TF-IDF

In [4]:
# TF-IDF vectoriser with default constructor arguments.
# NOTE(review): it is fitted on every tweet in `text`, i.e. before any
# train/test split is made — confirm this is intended.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(raw_documents=text)

## 2. BIGRAM

In [5]:
# Count-based vectoriser restricted to word bigrams (n-gram range 2..2).
BIGRAM_RANGE = (2, 2)
bigram = CountVectorizer(ngram_range=BIGRAM_RANGE)
X_bigram = bigram.fit_transform(raw_documents=text)

# OVERSAMPLING

## 1. SMOTE TF-IDF

In [6]:
# Class distribution of the original labels.
counter = Counter(Y)
print('Sebelum SMOTE :', counter)

# Oversample with SMOTE using a fixed seed so the resampling is repeatable.
# NOTE(review): X_tfidf covers the full dataset, so this runs before the
# train/test split rather than on a training subset only — confirm intended.
smt = SMOTE(random_state=0)
resampled_tfidf = smt.fit_resample(X_tfidf, Y)
X_tfidf_sm, Y_tfidf_sm = resampled_tfidf

# Class distribution after resampling.
counter = Counter(Y_tfidf_sm)
print('Setelah SMOTE :', counter)

Sebelum SMOTE : Counter({'anger': 1101, 'happy': 1017, 'sadness': 997, 'fear': 649, 'love': 637})
Setelah SMOTE : Counter({'anger': 1101, 'happy': 1101, 'sadness': 1101, 'love': 1101, 'fear': 1101})


## 2. SMOTE BIGRAM

In [7]:
# Class distribution of the original labels.
counter = Counter(Y)
print('Sebelum SMOTE :', counter)

# Oversample the bigram count matrix with SMOTE using a fixed seed.
# NOTE(review): X_bigram covers the full dataset, so this runs before the
# train/test split rather than on a training subset only — confirm intended.
smt = SMOTE(random_state=0)
X_bigram_sm, Y_bigram_sm = smt.fit_resample(X_bigram, Y)

# Class distribution after resampling. The label previously read "Sebelum"
# (before) even though this line reports the post-SMOTE counts.
counter = Counter(Y_bigram_sm)
print('Setelah SMOTE :', counter)

Sebelum SMOTE : Counter({'anger': 1101, 'happy': 1017, 'sadness': 997, 'fear': 649, 'love': 637})
Sebelum SMOTE : Counter({'anger': 1101, 'happy': 1101, 'sadness': 1101, 'love': 1101, 'fear': 1101})


# PEMBAGIAN DATASET

## 1. TRAINING DAN TESTING (SMOTE TF-IDF)

In [8]:
# Stratified 90/10 split of the resampled TF-IDF data with a fixed seed.
X_train_tfidf, X_test_tfidf, Y_train_tfidf, Y_test_tfidf = train_test_split(
    X_tfidf_sm,
    Y_tfidf_sm,
    test_size=0.1,
    random_state=0,
    stratify=Y_tfidf_sm,
)

# Print the shape of each partition: X train, X test, Y train, Y test.
for part in (X_train_tfidf, X_test_tfidf, Y_train_tfidf, Y_test_tfidf):
    print(part.shape)

(4954, 18037)
(551, 18037)
(4954,)
(551,)


## 2. TRAINING DAN TESTING (SMOTE BIGRAM)

In [9]:
# Stratified 90/10 split of the resampled bigram data with a fixed seed.
X_train_bigram, X_test_bigram, Y_train_bigram, Y_test_bigram = train_test_split(
    X_bigram_sm,
    Y_bigram_sm,
    test_size=0.1,
    random_state=0,
    stratify=Y_bigram_sm,
)

# Print the shape of each partition: X train, X test, Y train, Y test.
for part in (X_train_bigram, X_test_bigram, Y_train_bigram, Y_test_bigram):
    print(part.shape)

(4954, 86121)
(551, 86121)
(4954,)
(551,)


# GRID-SEARCH TIAP SKENARIO
### Algoritma : KNN dan SVM Kernel RBF

## 1. KNN (SMOTE + TF-IDF)
#### Hyperparameter berupa nilai N_Neighbor dengan range (1 hingga 44)

In [10]:
# Unfitted k-nearest-neighbours classifier created with default constructor arguments.
model1 = KNeighborsClassifier()

In [11]:
#inisialiasi kombinasi nilai hyperparameter untuk tuning
# Candidate n_neighbors values: 1 through 44 (range() excludes its upper bound).
params_search1 = {'n_neighbors': list(range(1, 45))}

# Grid search with cv=10 over the candidates, fitted on the TF-IDF training split.
tune_params1 = GridSearchCV(
    estimator=model1,
    param_grid=params_search1,
    cv=10,
    n_jobs=-1,
)
tune_params1.fit(X_train_tfidf, Y_train_tfidf)

# Show the best hyperparameter value found.
print('Hasil tuning hyperprameter :', tune_params1.best_params_)

Hasil tuning hyperprameter : {'n_neighbors': 1}


## 2. SVM-RBF (SMOTE + TF-IDF)

In [12]:
# Unfitted support vector classifier using the RBF kernel; other arguments are left at their defaults.
model2 = SVC(kernel='rbf')

In [13]:
#inisialiasi kombinasi nilai hyperparameter untuk tuning
# Hyperparameter grid: every combination of the listed gamma and C values.
params_search2 = [
    {
        'gamma': [0.005, 0.05, 0.1, 0.5, 0.75],
        'C': [0.5, 0.75, 1, 10, 100],
    }
]

# Grid search with cv=10 over the grid, fitted on the TF-IDF training split.
tune_params2 = GridSearchCV(
    estimator=model2,
    param_grid=params_search2,
    cv=10,
    n_jobs=-1,
)
tune_params2.fit(X_train_tfidf, Y_train_tfidf)

# Show the best hyperparameter combination found.
print('Hasil tuning hyperprameter :', tune_params2.best_params_)

Hasil tuning hyperprameter : {'C': 10, 'gamma': 0.1}


## 3. KNN (SMOTE + BIGRAM)

In [14]:
# Unfitted k-nearest-neighbours classifier created with default constructor arguments.
model3 = KNeighborsClassifier()

In [15]:
#inisialiasi kombinasi nilai hyperparameter untuk tuning
# Candidate n_neighbors values: 1 through 44 (range() excludes its upper bound).
params_search3 = {'n_neighbors': list(range(1, 45))}

# Grid search with cv=10 over the candidates, fitted on the bigram training split.
tune_params3 = GridSearchCV(
    estimator=model3,
    param_grid=params_search3,
    cv=10,
    n_jobs=-1,
)
tune_params3.fit(X_train_bigram, Y_train_bigram)

# Show the best hyperparameter value found.
print('Hasil tuning hyperprameter :', tune_params3.best_params_)

Hasil tuning hyperprameter : {'n_neighbors': 1}


## 4. SVM-RBF (SMOTE + BIGRAM)

In [16]:
# Unfitted support vector classifier using the RBF kernel; other arguments are left at their defaults.
model4 = SVC(kernel='rbf')

In [17]:
#inisialiasi kombinasi nilai hyperparameter untuk tuning
# Hyperparameter grid: every combination of the listed gamma and C values.
params_search4 = [
    {
        'gamma': [0.005, 0.05, 0.1, 0.5, 0.75],
        'C': [0.5, 0.75, 1, 10, 100],
    }
]

# Grid search with cv=10 over the grid, fitted on the bigram training split.
tune_params4 = GridSearchCV(
    estimator=model4,
    param_grid=params_search4,
    cv=10,
    n_jobs=-1,
)
tune_params4.fit(X_train_bigram, Y_train_bigram)

# Show the best hyperparameter combination found.
print('Hasil tuning hyperprameter :', tune_params4.best_params_)

Hasil tuning hyperprameter : {'C': 10, 'gamma': 0.005}
