# Import Library

In [1]:
import pandas as pd
import os
import mglearn
import numpy as np
import matplotlib.pyplot as plt
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer

# Import Dataset

In [2]:
# Load the tab-separated review export. quoting=1 is the numeric value of
# csv.QUOTE_ALL.
dataset = pd.read_csv(
    'SentimentDatasetonAppReviewfromAppStore.txt',
    sep='\t',
    quoting=1,
)

display(dataset.head())

Unnamed: 0,ID,TITLE,COMMENT,RATING,SENTIMEN
0,1611,Banyak kekurangan,"Tidak seperti di iphone menunya kurang, jadwal...",1,Negative
1,1612,Need update!,Kurang bersahabat ama VoiceOver nih. Semoga ha...,4,Negative
2,1613,Baru seperti ini sudah puas?,"Hampir setahun belum ada update, UI tidak ada ...",1,Negative
3,1614,Crash melulu,Mohon appnya diperbaiki.. Crashnya ga ketolong...,1,Negative
4,1615,Sampah,"Aplikasi sampah, crash mulu,errorny banyak,ga ...",1,Negative


In [3]:
# Keep only the review text and its sentiment label.
dataset = dataset.loc[:, ['COMMENT', 'SENTIMEN']]
df = pd.DataFrame(dataset)

display(dataset.head())

Unnamed: 0,COMMENT,SENTIMEN
0,"Tidak seperti di iphone menunya kurang, jadwal...",Negative
1,Kurang bersahabat ama VoiceOver nih. Semoga ha...,Negative
2,"Hampir setahun belum ada update, UI tidak ada ...",Negative
3,Mohon appnya diperbaiki.. Crashnya ga ketolong...,Negative
4,"Aplikasi sampah, crash mulu,errorny banyak,ga ...",Negative


# Data Pre-processing

In [4]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Build the stop-word set once. The original rebuilt it for every single word,
# which re-read the NLTK word list on each membership test.
indonesian_stopwords = set(stopwords.words('indonesian'))

corpus = []
# Iterate over every comment instead of a hard-coded row count, so the cell
# keeps working when the dataset grows or shrinks.
for raw_comment in dataset['COMMENT']:
    comment = re.sub('[^a-zA-Z]', ' ', raw_comment)  # replace anything that is not a letter with a space
    comment = comment.lower()  # lower-case
    comment = comment.split()  # split into a list of words
    # Drop stop words, then stem what is left.
    comment = [stemmer.stem(word) for word in comment if word not in indonesian_stopwords]
    comment = ' '.join(comment)  # join back into one string
    corpus.append(comment)  # add to the corpus
print(corpus)

['iphone menu jadwal sepak bola tanggal baik', 'sahabat ama voiceover nih moga perhati update tambahin menu contact developer nya', 'tahun update ui kembang fitur tambah crash baik rate star uninstall', 'mohon appnya baik crashnya ga tolong', 'aplikasi sampah crash mulu errorny ga dibenerin', 'muat berita yg bobot butuh user interface yg murah selaras', 'crash melulu kerja fix this please', 'berita potong ipad iphone blackberry browsing aplikasi', 'thanks for making this great ebook for indonesian keep it up detik', 'bagus sayang buggy gk ketulungan', 'thank ya udah yg versi ipad hehheheheheh', 'detikbola ga detik for ipad', 'asslamuallaikum aplikasi bagus banget manfaat yg baik contoh mendownload audio crash ukur huruf besar pinch mdh an depan baik bug tsb moga tim senantiasa sehat ikhlas buat yg manfaat ummat terima kasih wassalamulalikum', 'assalamuallaikum subhanallah aplikasi manfaat moga tajwid warna ajar tajwid mudah erti moga anggota tim terima amal ibadah terima kasih wassalla

# Pembagian data train dan test 

In [5]:
# Features are the cleaned comments; labels are the second column of `dataset`.
X = corpus
y = dataset.iloc[:, 1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# The vocabulary is fitted on the training split only and then reused to
# encode the test split.
cv = CountVectorizer()
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test)  # left sparse; chisquare_test converts it to dense

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() (added in 1.0). Use the new name when it exists
# and fall back to the old one so older installations keep working.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

pd.DataFrame(X_train, columns=feature_names)

Unnamed: 0,aamiin,abad,abai,abis,about,absolutly,abundant,accept,accepted,access,...,yaaa,yah,yay,years,yet,yg,yo,you,your,yufid
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
438,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
439,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
440,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Data Train Fitur Seleksi

In [6]:
def chisquare_train(X_train, y_train, threshold=3.84):
    """Select the columns of ``X_train`` whose chi-square score reaches ``threshold``.

    For every feature (column) the observed per-class totals are compared with
    the totals expected if the feature were independent of the class, i.e.
    ``class_frequency * total_feature_count``. The per-class terms
    ``(observed - expected) ** 2 / expected`` are summed into one score per
    feature.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Non-negative feature counts. Objects exposing ``toarray()`` (for
        example scipy sparse matrices) are converted to dense first.
    y_train : array-like of shape (n_samples,)
        Class labels. One indicator column is built per distinct label, so
        two-class problems contribute both classes to the score.
    threshold : float, default 3.84
        Minimum score a feature needs to be kept. 3.84 is the chi-square
        critical value for 1 degree of freedom at alpha = 0.05.
        NOTE(review): with k classes the summed statistic has k - 1 degrees of
        freedom, so for three classes the alpha = 0.05 critical value would be
        about 5.99. The historical default is kept for compatibility.

    Returns
    -------
    X_train2 : numpy.ndarray of shape (n_samples, n_selected)
        ``X_train`` restricted to the selected columns, in ascending order.
    y_train
        The labels, returned unchanged.
    array : numpy.ndarray of int
        Indices of the selected columns. Pass these to ``chisquare_test``.
    """
    if hasattr(X_train, "toarray"):
        X_train = X_train.toarray()
    X_train = np.asarray(X_train)

    # One-hot encode the labels: one indicator column per distinct class.
    labels = np.asarray(y_train).ravel()
    classes, class_index = np.unique(labels, return_inverse=True)
    y_onehot = (class_index.reshape(-1, 1) == np.arange(classes.size)).astype(float)

    # Observed per-class totals of every feature: (n_classes, n_features).
    observed = np.dot(y_onehot.T, X_train)

    # Expected totals under independence of class and feature.
    class_prob = y_onehot.mean(axis=0).reshape(-1, 1)
    feature_count = X_train.sum(axis=0).reshape(1, -1)
    expected = np.dot(class_prob, feature_count)

    # A feature that never occurs has expected == 0. Its score is defined as 0
    # here instead of producing 0/0 = NaN and a RuntimeWarning.
    chisq = np.zeros_like(expected, dtype=float)
    np.divide((observed - expected) ** 2, expected, out=chisq, where=expected > 0)
    chisq_score = chisq.sum(axis=0)

    # Keep every feature at or above the threshold. The number of features is
    # taken from the data instead of a hard-coded vocabulary size.
    array = np.flatnonzero(chisq_score >= threshold)
    X_train2 = X_train[:, array]
    return X_train2, y_train, array

# Run the selection on the training matrix, then show the reduced matrix
# followed by the indices of the columns that were kept.
X_train2, y_train, selected_feature = chisquare_train(X_train, y_train)
for selection_output in (X_train2, selected_feature):
    print(selection_output)

[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[  17   35   37   38   58   95  110  114  115  116  126  131  132  136
  146  150  152  156  157  169  173  174  175  188  190  211  215  217
  221  224  250  258  263  268  273  285  291  293  297  298  304  314
  315  317  327  335  337  354  370  378  386  400  404  408  411  422
  426  430  445  446  451  460  461  473  474  495  504  513  517  520
  521  528  537  562  573  574  579  580  581  585  602  606  607  610
  614  623  626  635  638  642  646  674  681  693  705  708  709  717
  720  725  734  749  756  774  775  782  785  788  790  792  807  812
  814  815  823  837  838  842  854  876  881  883  896  900  904  908
  916  918  919  923  924  928  929  946  959  971  973  978  995 1004
 1007 1031 1039 1050 1051 1054 1067 1086 1089 1092 1106 1110 1117 1132
 1135 1137 1144 1145 1167 1183 1186 1188 1194 1196 1197 1198 1205 1210
 1211 1220 1228 1230 1257 12

In [7]:
# Reduced training matrix and its (n_samples, n_selected_features) shape.
print(X_train2, X_train2.shape, sep='\n')

[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(442, 275)


# Data Test Fitur Seleksi

In [8]:
def chisquare_test(X_test, selected_feature):
    """Restrict ``X_test`` to the feature columns chosen on the training data.

    Parameters
    ----------
    X_test : scipy sparse matrix or array-like of shape (n_samples, n_features)
        Feature matrix encoded with the same vocabulary as the training data.
        Any scipy sparse format or an already dense array is accepted.
    selected_feature : sequence of int
        Column indices to keep, e.g. the third value returned by
        ``chisquare_train``. May be empty.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_selected)
        Dense matrix holding only the selected columns, in the given order.
    """
    from scipy.sparse import issparse

    # Densify only when needed, so dense input no longer raises.
    if issparse(X_test):
        X_test = X_test.toarray()
    X_test = np.asarray(X_test)

    # Force an integer index array: an empty selection built with np.array([])
    # has a float dtype and cannot be used for indexing as-is.
    columns = np.asarray(selected_feature, dtype=int)

    X_test2 = X_test[:, columns]
    return X_test2

# Apply the training-time feature selection to the held-out split.
X_test2 = chisquare_test(X_test=X_test, selected_feature=selected_feature)
print(X_test2)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# Classifier with GridSearchCV

In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the RBF-kernel SVM.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf'],
)

# refit=True retrains the best configuration on the full training matrix, so
# `grid` can be used directly for prediction afterwards.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True)

# Run the search on the chi-square-selected training features.
grid.fit(X_train2, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [10]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

# Best hyper-parameter combination found by the grid search.
print(grid.best_params_)

# Predict on both splits with the refitted best estimator.
grid_predictions = grid.predict(X_test2)
grid_predictions_train = grid.predict(X_train2)

# Test-split report first, a single-space separator line, then the
# training-split report.
print(classification_report(y_test, grid_predictions), " ", sep="\n")
print(classification_report(y_train, grid_predictions_train))

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
              precision    recall  f1-score   support

    Negative       0.67      0.80      0.73        40
     Neutral       0.00      0.00      0.00         9
    Positive       0.81      0.82      0.82        62

   micro avg       0.75      0.75      0.75       111
   macro avg       0.49      0.54      0.51       111
weighted avg       0.69      0.75      0.72       111

 
              precision    recall  f1-score   support

    Negative       0.87      0.87      0.87       201
     Neutral       1.00      0.41      0.58        44
    Positive       0.83      0.94      0.88       197

   micro avg       0.86      0.86      0.86       442
   macro avg       0.90      0.74      0.78       442
weighted avg       0.87      0.86      0.85       442



  'precision', 'predicted', average, warn_for)


In [11]:
from sklearn.metrics import confusion_matrix

# cm3: held-out split, cm4: training split.
cm3 = confusion_matrix(y_test, grid_predictions)
cm4 = confusion_matrix(y_train, grid_predictions_train)
print(cm3, cm4, sep='\n')

[[32  0  8]
 [ 5  0  4]
 [11  0 51]]
[[175   0  26]
 [ 14  18  12]
 [ 12   0 185]]


In [12]:
# Score of the refitted best estimator on each split.
test_score = grid.score(X_test2, y_test)
train_score = grid.score(X_train2, y_train)
print("Score Test Data: {:.2f}".format(test_score))
print("Score Train Data: {:.2f}".format(train_score))

Score Test Data: 0.75
Score Train Data: 0.86


# Testing data baru

In [13]:
# New input text to classify.
percobaan2 = "aplikasi ini bagus juga ya untuk di pakai ramai ramai jadi lebih efektif"

# Pre-processing: the same steps that were applied to the training corpus.
# Build the stop-word set once instead of once per word.
indonesian_stopwords = set(stopwords.words('indonesian'))

percobaan = []
comment = re.sub('[^a-zA-Z]', ' ', percobaan2)  # replace anything that is not a letter with a space
comment = comment.lower()  # lower-case
comment = comment.split()  # split into a list of words
# Drop stop words, then stem what is left.
comment = [stemmer.stem(word) for word in comment if word not in indonesian_stopwords]
comment = ' '.join(comment)  # join back into one string
percobaan.append(comment)  # single-document batch for the vectorizer

# Feature extraction with the vocabulary fitted on the training data.
test1 = cv.transform(percobaan)

# Feature selection: keep only the columns chosen on the training data.
test2 = chisquare_test(test1, selected_feature)

# Predict the sentiment label.
test3 = grid.predict(test2)
print(test3)

['Positive']


In [14]:
# Second new input text to classify.
hai = "saya tidak suka app ini tolong diperbaiki benar benar dong saya jadi tidak bisa pakai ini"

# Pre-processing: the same steps that were applied to the training corpus.
# Build the stop-word set once instead of once per word.
indonesian_stopwords = set(stopwords.words('indonesian'))

percobaan = []
comment = re.sub('[^a-zA-Z]', ' ', hai)  # replace anything that is not a letter with a space
comment = comment.lower()  # lower-case
comment = comment.split()  # split into a list of words
# Drop stop words, then stem what is left.
comment = [stemmer.stem(word) for word in comment if word not in indonesian_stopwords]
comment = ' '.join(comment)  # join back into one string
percobaan.append(comment)  # single-document batch for the vectorizer

# Feature extraction with the vocabulary fitted on the training data.
test = cv.transform(percobaan)

# Feature selection: keep only the columns chosen on the training data.
test1 = chisquare_test(test, selected_feature)

# Predict the sentiment label.
test2 = grid.predict(test1)
print(test2)

['Negative']
