In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [2]:
df = pd.read_csv('Dataset Twitter Fix - Indonesian Sentiment Twitter Dataset Labeled (1).csv')
df = df[['sentimen', 'Tweet']]
df = df.dropna()
df['sentimen'] = df['sentimen'].astype(int)


In [3]:
# Modifieng Dataset
df = df[df['sentimen'] != 3]
df = df[df['sentimen'] != 4]
df = df[df['sentimen'] != 5]
df = df[df['sentimen'] != 6]
df = df[df['sentimen'] != 0]
df['sentimen'][df['sentimen'] == 2] = 0



In [4]:
df

Unnamed: 0,sentimen,Tweet
5327,1,doa rezeki tak putus inna haa zaa larizquna ma...
5328,1,makasih loh ntar kita bagi hasil aku 99 9 sisa...
5329,1,ya aku akan menjadi satu satunya bukan nomor s...
5330,1,i dont know why but these zikir sangat membant...
5331,1,aah kamu aja mas aku lebih suka diayomi
...,...,...
12602,0,tp gue gak ngerti sh maksudnya apa wkwk
12603,0,aku ingatkan gagal tadbir negara dengan elok j...
12604,0,kalo kamu lagi dimana mana dan itu bukan daera...
12606,0,biasalah tu perempuan kena berjaga jaga muka m...


In [5]:
df

Unnamed: 0,sentimen,Tweet
5327,1,doa rezeki tak putus inna haa zaa larizquna ma...
5328,1,makasih loh ntar kita bagi hasil aku 99 9 sisa...
5329,1,ya aku akan menjadi satu satunya bukan nomor s...
5330,1,i dont know why but these zikir sangat membant...
5331,1,aah kamu aja mas aku lebih suka diayomi
...,...,...
12602,0,tp gue gak ngerti sh maksudnya apa wkwk
12603,0,aku ingatkan gagal tadbir negara dengan elok j...
12604,0,kalo kamu lagi dimana mana dan itu bukan daera...
12606,0,biasalah tu perempuan kena berjaga jaga muka m...


In [6]:
df['sentimen'].value_counts()

0    4188
1    2792
Name: sentimen, dtype: int64

In [7]:
# Stop Words dari Sastrawi

stop_word_nya = StopWordRemoverFactory().get_stop_words()
print(stop_word_nya)

['yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 'dia', 'dua', 'ia', 'seperti', 'jika', 'jika', 'sehingga', 'kembali', 'dan', 'tidak', 'ini', 'karena', 'kepada', 'oleh', 'saat', 'harus', 'sementara', 'setelah', 'belum', 'kami', 'sekitar', 'bagi', 'serta', 'di', 'dari', 'telah', 'sebagai', 'masih', 'hal', 'ketika', 'adalah', 'itu', 'dalam', 'bisa', 'bahwa', 'atau', 'hanya', 'kita', 'dengan', 'akan', 'juga', 'ada', 'mereka', 'sudah', 'saya', 'terhadap', 'secara', 'agar', 'lain', 'anda', 'begitu', 'mengapa', 'kenapa', 'yaitu', 'yakni', 'daripada', 'itulah', 'lagi', 'maka', 'tentang', 'demi', 'dimana', 'kemana', 'pula', 'sambil', 'sebelum', 'sesudah', 'supaya', 'guna', 'kah', 'pun', 'sampai', 'sedangkan', 'selagi', 'sementara', 'tetapi', 'apakah', 'kecuali', 'sebab', 'selain', 'seolah', 'seraya', 'seterusnya', 'tanpa', 'agak', 'boleh', 'dapat', 'dsb', 'dst', 'dll', 'dahulu', 'dulunya', 'anu', 'demikian', 'tapi', 'ingin', 'juga', 'nggak', 'mari', 'nanti', 'melainkan', '

In [8]:
def preprocess_text(review):
    review = re.sub(r"http\S+", "", review)             # menghapus link website
    review = re.sub("\S*\d\S*", "", review).strip()     # proses menghapus nomor
    review = re.sub('[^A-Za-z]+', ' ', review)          # proses menghapus selain huruf
    review = review.translate(str.maketrans("","",string.punctuation)) #menghapus tanda baca
    review = review.lower()                             # proses case folding
    review = [word for word in review.split(" ") if not word in stop_word_nya] # proses stop word
    review = " ".join(review)
    review.strip()
    return review

df['Tweet'] = df['Tweet'].apply(lambda x: preprocess_text(x))

In [9]:
df['Tweet']

5327     doa rezeki tak putus inna haa zaa larizquna ma...
5328          makasih loh ntar hasil aku sisanya buat kamu
5329     aku menjadi satu satunya bukan nomor satu satu...
5330     i dont know why but these zikir sangat membant...
5331               aah kamu aja mas aku lebih suka diayomi
                               ...                        
12602              tp gue gak ngerti sh maksudnya apa wkwk
12603    aku ingatkan gagal tadbir negara elok je salah...
12604    kalo kamu mana bukan daerahmu coba buat ngehar...
12606    biasalah tu perempuan kena berjaga jaga muka m...
12607    terkadang kamu perlu membentang jarak tau arti...
Name: Tweet, Length: 6980, dtype: object

In [10]:
# Vectorize 'Text Tweet'
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Tweet'])

In [11]:
# Take 'Sentiment' as y
y = df['sentimen']

In [12]:
# Split the data into training and testing sets 9:1


for i in range(1, 10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1*i, random_state=0)
    model = KNeighborsClassifier(n_neighbors=11)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print(i*0.1)

    

Accuracy:  0.7564469914040115
0.1
Accuracy:  0.75
0.2
Accuracy:  0.7360381861575179
0.30000000000000004
Accuracy:  0.7295845272206304
0.4
Accuracy:  0.7429799426934097
0.5
Accuracy:  0.7340654094055861
0.6000000000000001
Accuracy:  0.7104563126662574
0.7000000000000001
Accuracy:  0.703080229226361
0.8
Accuracy:  0.6787647246099968
0.9
