# PENGUMPULAN DATA

In [None]:
pip install PySastrawi

In [None]:
import pandas as pd
import numpy as np
import joblib
import pickle
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm

import nltk
import string
import re
from sklearn.metrics.pairwise import cosine_similarity
from nltk.util import ngrams

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
#Koneksi ke Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Read the dataset
df = pd.read_excel('/content/drive/MyDrive/Machine Learning/Analisis Sentimen IKN/Dataset/Sentimen Pengguna Twitter Pada Topik IKN.xlsx')
df['Tweet']

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
df

In [None]:
#Visualisasi Data
plt.hist(df.Sentimen)

# PRE-PROCESSING TEXT

In [None]:
#Cleansing Data
def cleansing(text):
    """Strip noise from a raw tweet and return the cleaned text.

    Removes, in this order: square-bracketed spans, URLs, hashtags,
    punctuation, tokens containing a digit and stand-alone single ASCII
    letters. Whitespace runs are then collapsed to one space and the result
    is trimmed.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned string (possibly empty).
    """
    # Remove anything enclosed in square brackets.
    text = re.sub(r'\[.*?\]', '', text)
    # Remove links and hashtags while their markers ('://', '#') are still
    # present; once punctuation is stripped the hashtag pattern can no
    # longer match anything.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\B#\S+', '', text)
    # Remove punctuation.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove tokens that contain a digit.
    text = re.sub(r'\w*\d\w*', '', text)
    # Remove stand-alone single letters. Lookarounds leave the surrounding
    # whitespace in place so the neighbouring words are not glued together.
    text = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', '', text)
    # Collapse whitespace runs and trim both ends.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
df['CLEANSING']= df['Tweet'].apply(lambda x: cleansing(x))

In [None]:
df.head()

In [None]:
df.to_excel("/content/drive/MyDrive/Machine Learning/Analisis Sentimen IKN/Hasil QE/Cleansing.xlsx")

In [None]:
#CaseFolding
def case_folding(text):
    """Return ``text`` converted to lower case."""
    return text.lower()
df['CASEFOLDING']= df['CLEANSING'].apply(lambda x: case_folding(x))
df.head()

In [None]:
df

In [None]:
#Tokenization
def tokenization(text):
    """Split ``text`` into word tokens.

    The text is split on runs of non-word characters. Empty strings that
    ``re.split`` yields for leading/trailing separators (or for an empty
    input) are dropped so they never reach the later pipeline stages.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty tokens, in order of appearance.
    """
    return [token for token in re.split(r'\W+', text) if token]
df['TOKENIZATION']= df['CASEFOLDING'].apply(lambda x: tokenization(x))
df.head()

In [None]:
# Read the stop-word dictionary (an Excel sheet without a header row) into a
# single-column DataFrame whose column is named "stopwords".
txt_stopword = pd.read_excel("/content/drive/MyDrive/Machine Learning/Analisis Sentimen IKN/Kamus/kamus_stopword.xlsx", names= ["stopwords"], header = None)

# Extract the column as a plain Python list.
list_stopwords = txt_stopword['stopwords'].values.tolist()

# ---------------------------------------------------------------------------------------

# Convert the list to a set so the membership test below is a hash lookup.
list_stopwords = set(list_stopwords)

# Remove stop words from a list of tokens.
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``."""
    return [word for word in words if word not in list_stopwords]

df['STOP REMOVAL'] = df['TOKENIZATION'].apply(stopwords_removal)

print(df['STOP REMOVAL'].head())

In [None]:
df.to_excel("/content/drive/MyDrive/Machine Learning/Analisis Sentimen IKN/Hasil QE/StopwordRemoval.xlsx")

In [None]:
#StopwordRemoval
final = pd.DataFrame(df, columns= ['Sentimen', 'STOP REMOVAL'])
final

In [None]:
pip install swifter

In [None]:
# Stemming
# NOTE(review): swifter is imported here but not used anywhere in this cell.
import swifter
# Create the Sastrawi stemmer.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Thin wrapper around the stemmer.
def stemmed_wrapper(term):
    """Return the stem of a single term."""
    return stemmer.stem(term)

# Maps every distinct token to its stem, so each token is stemmed once no
# matter how often it occurs in the corpus.
term_dict = {}

# First pass: collect the distinct tokens, using ' ' as a placeholder value.
for document in final['STOP REMOVAL']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '

print(len(term_dict))
print("------------------------")

# Second pass: replace each placeholder with the actual stem and print it.
for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])

print(term_dict)
print("------------------------")

# Apply the precomputed stems to a tokenized document.
def get_stemmed_term(document):
    """Return the stems of ``document``'s tokens, looked up in ``term_dict``.

    Raises KeyError for a token that is not a key of ``term_dict``.
    """
    return [term_dict[term] for term in document]

final['STEMMING'] = final['STOP REMOVAL'].apply(get_stemmed_term)

In [None]:
final.head()

In [None]:
kbba = pd.read_excel('/content/drive/MyDrive/Machine Learning/Analisis Sentimen IKN/Kamus/kamus_normalisasi.xlsx')

In [None]:
kbba

In [None]:
#Normalisasi

# Lookup table built from `kbba`: value in the first column -> value in the
# second column.
normalizad_word_dict = {}

# Columns are addressed by position with .iloc; integer keys on a
# label-indexed row (row[0]) are deprecated in pandas. setdefault keeps the
# first mapping when a word appears more than once.
for raw_word, normalized_word in zip(kbba.iloc[:, 0], kbba.iloc[:, 1]):
    normalizad_word_dict.setdefault(raw_word, normalized_word)

def normalized_term(document):
    """Replace each token found in ``normalizad_word_dict`` by its mapping.

    Args:
        document: An iterable of tokens.

    Returns:
        A list of the same length; tokens without a mapping are kept as-is.
    """
    return [normalizad_word_dict.get(term, term) for term in document]

final['NORMALISASI'] = final['STEMMING'].apply(normalized_term)

final['NORMALISASI'].head(10)

In [None]:
def fit_normalisasi(text):
    """Join a sequence of tokens into one space-separated string."""
    tokens = np.array(text)
    return ' '.join(tokens)
final['NORMALISASI'] = final['NORMALISASI'].apply(lambda x: fit_normalisasi(x))
final.head()

In [None]:
final.to_excel("/content/drive/MyDrive/Machine Learning/Analisis Sentimen IKN/Hasil QE/Normalisasi.xlsx")

In [None]:
#NORMALISASI NEGASI
# A negation word followed by one space. The leading \b keeps the pattern
# from firing inside longer words such as "otak" (tak) or "ganti" (anti).
_NEGATION_PATTERN = re.compile(
    r'\b(tidak|jangan|belum|bukan|tanpa|bukanlah|tak|anti) '
)


def convert_negasi(text):
    """Attach each negation word to the word that follows it.

    The single space after a stand-alone negation word is removed, so
    "tidak setuju" becomes "tidaksetuju" and the pair is treated as one
    token downstream.

    Args:
        text: A space-separated string of tokens.

    Returns:
        The string with negation words merged into their successors.
    """
    return _NEGATION_PATTERN.sub(r'\1', text)
final['NORMALISASI']= final['NORMALISASI'].apply(lambda x: convert_negasi(x))
final

In [None]:
from wordcloud import WordCloud

# All normalized tweets labelled 'Negatif', concatenated into one string.
negative_words = ' '.join(final.loc[final['Sentimen'] == 'Negatif', 'NORMALISASI'])

# Render the word cloud for the negative class.
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=0,
    max_font_size=110,
).generate(negative_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('The Negative Words')
plt.show()

In [None]:
from wordcloud import WordCloud

# All normalized tweets labelled 'Positif', concatenated into one string.
positif_words = ' '.join(final.loc[final['Sentimen'] == 'Positif', 'NORMALISASI'])

# Render the word cloud for the positive class.
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=0,
    max_font_size=110,
).generate(positif_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('The Positif Words')
plt.show()

In [None]:
#Merubah Sentimen ke Polarity
#Konversi Sentimen Ke Polaritas
def convert(polarity):
    """Map a sentiment label to a polarity: 1 for 'Positif', otherwise -1."""
    return 1 if polarity == 'Positif' else -1

In [None]:
final['Polarity'] = final['Sentimen'].apply(convert)
final

In [None]:
final.to_excel("/content/drive/MyDrive/Machine Learning/Analisis Sentimen IKN/Hasil QE/Setelah Pre-Processing.xlsx")

In [None]:
#Split Data
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(final['NORMALISASI'],final['Polarity'], test_size=0.1, random_state=30)
#print(X_Test)

In [None]:
# Learn the label -> integer mapping from the training labels only, then
# reuse that same mapping for the test labels. Re-fitting on the test split
# could yield a different mapping if a class were missing from it.
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

# PEMBOBOTAN TF-IDF

In [None]:
# Fit the vocabulary and IDF weights on the training split only; fitting on
# the whole corpus would leak information from the test tweets into the
# features the model is trained on.
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(Train_X)

# Project both splits into the TF-IDF space learned from the training data.
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [None]:
Train_X_Tfidf.shape

In [None]:
Train_Y.shape

In [None]:
print(Train_X_Tfidf)

In [None]:
print(Test_X_Tfidf)

# QUERY EXPANSION

In [None]:
import expansion
import xlrd

# "id,topic" strings, one per distinct topic, in first-seen order.
queryList = []
# uniqueQuery keeps the distinct topics in first-seen order; seen_topics
# mirrors it as a set so the duplicate check is a constant-time lookup
# instead of a scan of the list for every row.
uniqueQuery = []
seen_topics = set()

workbook = xlrd.open_workbook('data/m.xls')
sheet_names = workbook.sheet_names()
xl_sheet = workbook.sheet_by_name(sheet_names[0])

for row_idx in range(xl_sheet.nrows):    # Iterate through rows
    cell_id = str(xl_sheet.cell(row_idx, 0).value)  # Column 0: id
    cell_topic = str(xl_sheet.cell(row_idx, 2).value)  # Column 2: topic
    if cell_topic in seen_topics:
        continue
    seen_topics.add(cell_topic)
    uniqueQuery.append(cell_topic)
    queryList.append(cell_id + "," + cell_topic)

# Expand the collected queries.
expansion.run(queryList)

# METODE KLASIFIKASI SVM

In [None]:
hyperparameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svm = SVC()
svm_tuned = GridSearchCV(svm, hyperparameters)

In [None]:
svm_tuned.fit(Train_X_Tfidf, Train_Y)

In [None]:
svm = SVC(C=1, kernel='linear', degree=3, gamma='scale')
svm.fit(Train_X_Tfidf,Train_Y)

In [None]:
# Persist the trained model; the context manager guarantees the file is
# flushed and closed even if pickling fails.
filename = 'model_svm_IKN.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(svm, model_file)

In [None]:
# Reload the persisted model and score it on the test split.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(Test_X_Tfidf, Test_Y)
print(result)

# EVALUASI KERJA

In [None]:
# Accuracy, precision, recall and F1 score on the test split.

predict_test = svm.predict(Test_X_Tfidf)

# scikit-learn metrics take (y_true, y_pred). Each label now calls the
# metric of the same name with the arguments in that order, instead of
# relying on swapped arguments and swapped functions cancelling out.
print("SVM Accuracy Score = ", accuracy_score(Test_Y, predict_test)*100)
print("SVM Precision Score = ", precision_score(Test_Y, predict_test)*100)
print("SVM Recall Score = ", recall_score(Test_Y, predict_test)*100)
print("SVM f1 Score = ", f1_score(Test_Y, predict_test)*100)

In [None]:
report = classification_report(Test_Y, predict_test)
print(report) # print classification_report

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred, classes,
                         normalize=False,
                         title=None,
                         cmap=plt.cm.Blues):
    """Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        classes: Display names indexed by label value; only the entries for
            labels that occur in ``y_true`` or ``y_pred`` become tick labels.
        normalize: If True, each row is divided by its sum so cells show
            per-true-class fractions instead of counts.
        title: Plot title. When omitted, a default that depends on
            ``normalize`` is used.
        cmap: Matplotlib colormap for the heat map.

    Returns:
        The matplotlib Axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    cm = confusion_matrix(y_true, y_pred)

    # Keep only the names of labels that actually appear in the data.
    # np.asarray lets plain lists be passed as well as arrays.
    classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")

    # Limit float precision for this print only, without changing NumPy's
    # global print options.
    with np.printoptions(precision=2):
        print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)

    ax.set(xticks=np.arange(cm.shape[1]),
          yticks=np.arange(cm.shape[0]),
          xticklabels=classes, yticklabels=classes,
          title=title,
          ylabel='True label',
          xlabel='Predicted label')

    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
            rotation_mode="anchor")

    # Annotate every cell; use white text on dark cells for contrast.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                   ha="center", va="center",
                   color="white" if cm[i, j] > thresh else "black")

    # Lay out and return only after all cells have been annotated.
    fig.tight_layout()
    return ax

In [None]:
# Display names indexed by encoded label. LabelEncoder sorts its classes, so
# polarity -1 (negative) is encoded as 0 and polarity 1 (positive) as 1.
class_names = np.array(['Negatif', 'Positif'])
plot_confusion_matrix(Test_Y, predict_test, classes=class_names,
                     title='Confusion matrix, without normalization')
plt.show()

In [None]:
print('CONFUSION MATRIX')
print('[TP  FN]')
print('[FP  TN]')
confusion_matrix(Test_Y, predict_test, labels=[1, 0])

In [None]:
# Accuracy, precision, recall and F1 score derived from the confusion matrix.

# Take the counts from the actual predictions rather than typing them in by
# hand, so they cannot drift from the model. With labels=[1, 0] the matrix
# is laid out as [[TP, FN], [FP, TN]].
TP, FN, FP, TN = (
    int(count)
    for count in confusion_matrix(Test_Y, predict_test, labels=[1, 0]).ravel()
)

# Guard each ratio against an empty denominator (e.g. no positive
# predictions) instead of raising ZeroDivisionError.
total = TP+FN+FP+TN
accuracy = (TP+TN)/total if total else 0.0
precision = TP/(TP+FP) if (TP+FP) else 0.0
recall = TP/(TP+FN) if (TP+FN) else 0.0
f1 = 2 *precision*recall/(precision+recall) if (precision+recall) else 0.0

print("HASIL AKURASI, RECALL, PRECISION, F1 SCORE DENGAN CONFUSION MATRIX \n")

print("SVM Accuracy Score = ", accuracy*100 ,"%")
print("SVM Precision Score = ", precision*100,"%")
print("SVM Recall Score = ", recall*100,"%")
print("SVM f1 Score = ", f1*100,"%")

# EVALUASI VALIDASI K-FOLD CROSS VALIDATION

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation with the estimator's default scorer.
# NOTE(review): the folds are drawn from the test split only
# (Test_X_Tfidf / Test_Y) - confirm this is intended rather than the
# training data or the full corpus.
scores_test = cross_val_score(svm, Test_X_Tfidf, Test_Y, cv=10)
print("Hasil Akurasi menggunakan 10 Fold Cross Validation \n")
for fold_number, fold_score in enumerate(scores_test, start=1):
    print("Akurasi dari SVM Iterasi ke -", fold_number, "  : {0:2}".format(fold_score*100))
print("\n","Rata-Rata Akurasi dari SVM menggunakan Cross Validation :", scores_test.mean()*100)

In [None]:
# 10-fold cross-validation, scored by precision.
# NOTE(review): this cell cross-validates svm_tuned (the GridSearchCV
# wrapper) while the accuracy cell uses svm - confirm which is intended.
scores_test = cross_val_score(svm_tuned, Test_X_Tfidf, Test_Y, cv=10, scoring='precision')
print("Hasil Presisi menggunakan 10 Fold Cross Validation \n")
for fold_number, fold_score in enumerate(scores_test, start=1):
    print("Presisi dari SVM Iterasi ke -", fold_number, "  : {0:2}".format(fold_score*100))
print("\n","Rata-Rata Presisi dari SVM menggunakan Cross Validation :", scores_test.mean()*100)

In [None]:
# 10-fold cross-validation, scored by recall.
# NOTE(review): this cell cross-validates svm_tuned (the GridSearchCV
# wrapper) while the accuracy cell uses svm - confirm which is intended.
scores_test = cross_val_score(svm_tuned, Test_X_Tfidf, Test_Y, cv=10, scoring='recall')
print("Hasil Recall menggunakan 10 Fold Cross Validation \n")
for fold_number, fold_score in enumerate(scores_test, start=1):
    print("Recall dari SVM Iterasi ke -", fold_number, "  : {0:2}".format(fold_score*100))
print("\n","Rata-Rata Recall dari SVM menggunakan Cross Validation :", scores_test.mean()*100)

In [None]:
# 10-fold cross-validation, scored by F1.
# NOTE(review): this cell cross-validates svm_tuned (the GridSearchCV
# wrapper) while the accuracy cell uses svm - confirm which is intended.
scores_test = cross_val_score(svm_tuned, Test_X_Tfidf, Test_Y, cv=10, scoring='f1')
print("Hasil F1 Score menggunakan 10 Fold Cross Validation \n")
for fold_number, fold_score in enumerate(scores_test, start=1):
    print("F1 Score dari SVM Iterasi ke -", fold_number, "  : {0:2}".format(fold_score*100))
print("\n","Rata-Rata F1 Score dari SVM menggunakan Cross Validation :", scores_test.mean()*100)

In [None]:
def classify(tweet):
    """Return "Positif" or "Negatif" for one tweet string.

    The text is vectorized with ``Tfidf_vect`` exactly as given (no
    preprocessing is applied here) and passed to ``svm``.
    """
    features = Tfidf_vect.transform([tweet])
    label = svm.predict(features)
    return "Positif" if label == 1 else "Negatif"

In [None]:
classify('IKN akan jadi representasi bangsa yang unggul')

In [None]:
classify('IKN memberikan dampak positif')

In [None]:
classify('proyek bengkak')

In [None]:
classify('IKN proyek gagal')

In [None]:
classify('mari kita dukung pembangunan IKN')

In [None]:
classify('IKN bangkitkan ekonomi Indonesia')

In [None]:
classify('IKN kota dunia untuk semua')

In [None]:
classify('menuju Indonesia maju dan berkembang')

In [None]:
classify('pusat ekonomi masa depan')

In [None]:
classify('forest city')

In [None]:
classify('konsep Green City')

In [None]:
classify('investor ragu tanam modal')

In [None]:
classify('IKN lebih penting dari entaskan kemiskinan')

In [None]:
classify('curiga IKN')

In [None]:
# Stemming
# Build the Sastrawi stemmer used by stem().
factory = StemmerFactory()
stemming = factory.create_stemmer()


def stem(text):
    """Return a list with the stem of every word in ``text``.

    ``text`` is an iterable of words; each one is stemmed on its own.
    """
    return list(map(stemming.stem, text))

In [None]:
def preprocess(text):
    """Run one raw tweet through the full preprocessing pipeline.

    The steps are applied in order, each one receiving the output of the
    previous one: cleansing, case folding, tokenization, stop-word removal,
    stemming, term normalization, re-joining into a string and negation
    merging.
    """
    pipeline = (
        cleansing,
        case_folding,
        tokenization,
        stopwords_removal,
        stem,
        normalized_term,
        fit_normalisasi,
        convert_negasi,
    )
    for step in pipeline:
        text = step(text)
    return text

In [None]:
a= preprocess('Dukung pembangunan IKN di Kalimantan Timur https/twitter.com #dukungIKN')
a

In [None]:
classify(a)