In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import imblearn

In [2]:
from sklearn import model_selection
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

In [3]:
# data = pd.read_csv("data/data/1.2_data_cleaner_pure_2.csv")
data = pd.read_csv('data/data/1.4_data_clean_stemmed_2.csv')

In [4]:
data = data.drop(columns=['rating', 'word_count', 'avg_word'])

In [5]:
data.head()

Unnamed: 0,review,aspect_category,polarity
0,bantu kondisi pandemi tolong evaluasi dian oba...,pelayanan,negatif
1,konsultasi dokter bagus durasi cepat dokter do...,pelayanan,negatif
2,astaga obat beli obat harga masuk akal lipat u...,harga,negatif
3,terima kasih layan baik cepat kirim bantu jele...,pelayanan,positif
4,telepon nomor hadiah konfirmasi aplikasi hadia...,sistem,positif


In [6]:
catagories = data['aspect_category'].unique()
catagories , len(catagories)

(array(['pelayanan', 'harga', 'sistem', 'anekdot/serbaneka'], dtype=object), 4)

In [7]:
polarity = data['polarity'].unique()
polarity

array(['negatif', 'positif'], dtype=object)

In [8]:
X = data['review']
y = data['aspect_category']

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [10]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(640,) (640,)
(160,) (160,)


In [11]:
X_train

264    baik tangan suka panik diagnose baik langsung ...
615                                          cepat bagus
329    mudah repot repot antre antre rumah sakit klin...
342                                          bagus layan
394    konsultasi bayar pakai gopay gagal konsultasi ...
                             ...                        
71     layan jelek bayar mengunanakan debit kartu ken...
106             kasih bintang semoga aplikasi layan baik
270                           praktis bantu terima kasih
435    aplikasi jelek cari obat driver gojek arah apo...
102    instal daftarin vaksin usia tangal tulis slot ...
Name: review, Length: 640, dtype: object

In [12]:
X_test

696                                    bantu mudah bagus
667                                              manfaat
63                       bagus konsultasi psikologi baik
533    puas bantu terimakasi aplikasi halodic dokter ...
66     bantu aplikasi lelah rumah bantu jelek aplikas...
                             ...                        
589                                         respon cepat
798                               mudah konsultasi rumah
744                         aplikasi bantu pandemi bagus
513               bagus aplikasi bantu layan cepat bagus
670                                     bagus cepat obat
Name: review, Length: 160, dtype: object

In [13]:
y_train

264            pelayanan
615    anekdot/serbaneka
329            pelayanan
342            pelayanan
394            pelayanan
             ...        
71             pelayanan
106            pelayanan
270    anekdot/serbaneka
435               sistem
102               sistem
Name: aspect_category, Length: 640, dtype: object

In [14]:
y_train

264            pelayanan
615    anekdot/serbaneka
329            pelayanan
342            pelayanan
394            pelayanan
             ...        
71             pelayanan
106            pelayanan
270    anekdot/serbaneka
435               sistem
102               sistem
Name: aspect_category, Length: 640, dtype: object

In [15]:
# Proses pembobotan atau vektorisasi
print("Creating tfidf..")
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize
vectorizer = TfidfVectorizer(analyzer = "word",
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = None, 
                             max_features = 5000) 

# Melakukan pengubahan data latih kedalam vektor fitur
%time train_data_features = vectorizer.fit_transform(X_train)

# Mengkonversi dari list ke array, untuk memudahkan 
# ke tahap klasifikasi
train_data_features = train_data_features.toarray()

Creating tfidf..
Wall time: 289 ms


In [16]:
train_data_features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
# Sama seperti pada data latih, data tes diubah kedalam
# vektor fitur selanjutnya di konversi menjadi bentuk array
test_data_features = vectorizer.transform(X_test)
test_data_features = test_data_features.toarray()

In [18]:
test_data_features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
print(train_data_features.shape)
print(test_data_features.shape)

(640, 810)
(160, 810)


In [20]:
from sklearn.metrics import accuracy_score, log_loss,confusion_matrix, roc_curve, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [21]:
# Proses klasifikasi menggunakan ETC
from sklearn.ensemble import ExtraTreesClassifier

# Inisialisasi ETC
etc = ExtraTreesClassifier(criterion='entropy',class_weight='balanced_subsample', max_features=10)

# Memasang data latih ke classifier
etc.fit(train_data_features,y_train)

# Melakukan prediksi dengan membandingkan dengan data tes
y_pred_train = etc.predict(train_data_features)
y_pred_cat = etc.predict(test_data_features)

In [22]:
y_pred_train

array(['pelayanan', 'anekdot/serbaneka', 'pelayanan', 'pelayanan',
       'pelayanan', 'pelayanan', 'pelayanan', 'anekdot/serbaneka',
       'pelayanan', 'sistem', 'pelayanan', 'anekdot/serbaneka',
       'pelayanan', 'pelayanan', 'anekdot/serbaneka', 'sistem',
       'anekdot/serbaneka', 'anekdot/serbaneka', 'pelayanan', 'pelayanan',
       'sistem', 'anekdot/serbaneka', 'pelayanan', 'anekdot/serbaneka',
       'anekdot/serbaneka', 'anekdot/serbaneka', 'anekdot/serbaneka',
       'anekdot/serbaneka', 'pelayanan', 'anekdot/serbaneka', 'pelayanan',
       'pelayanan', 'pelayanan', 'anekdot/serbaneka', 'pelayanan',
       'anekdot/serbaneka', 'anekdot/serbaneka', 'pelayanan', 'pelayanan',
       'anekdot/serbaneka', 'pelayanan', 'pelayanan', 'pelayanan',
       'anekdot/serbaneka', 'anekdot/serbaneka', 'pelayanan',
       'anekdot/serbaneka', 'anekdot/serbaneka', 'pelayanan', 'pelayanan',
       'pelayanan', 'pelayanan', 'anekdot/serbaneka', 'anekdot/serbaneka',
       'anekdot/serbaneka

In [23]:
y_pred_cat

array(['anekdot/serbaneka', 'anekdot/serbaneka', 'pelayanan',
       'anekdot/serbaneka', 'anekdot/serbaneka', 'anekdot/serbaneka',
       'anekdot/serbaneka', 'pelayanan', 'pelayanan', 'pelayanan',
       'pelayanan', 'pelayanan', 'anekdot/serbaneka', 'pelayanan',
       'pelayanan', 'anekdot/serbaneka', 'pelayanan', 'sistem',
       'pelayanan', 'pelayanan', 'pelayanan', 'anekdot/serbaneka',
       'pelayanan', 'anekdot/serbaneka', 'anekdot/serbaneka', 'pelayanan',
       'anekdot/serbaneka', 'pelayanan', 'anekdot/serbaneka', 'pelayanan',
       'pelayanan', 'anekdot/serbaneka', 'pelayanan', 'sistem',
       'pelayanan', 'pelayanan', 'pelayanan', 'pelayanan', 'pelayanan',
       'anekdot/serbaneka', 'pelayanan', 'pelayanan', 'pelayanan',
       'pelayanan', 'anekdot/serbaneka', 'anekdot/serbaneka',
       'anekdot/serbaneka', 'anekdot/serbaneka', 'sistem',
       'anekdot/serbaneka', 'pelayanan', 'pelayanan', 'pelayanan',
       'pelayanan', 'pelayanan', 'sistem', 'pelayanan', 'pelay

In [24]:
tes1 = pd.Series(y_pred_cat)
tes1.to_csv('data/data/predicted_category.csv', index=False)

In [25]:
pickle.dump(y_pred_cat, open('pred_cat', 'wb'))

In [26]:
# Proses pengujian
# Import library yang dibutuhkan
from sklearn import linear_model, metrics
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

In [27]:
# comparing actual response values (y_train) with predicted response values (y_pred_train) 
metrics.accuracy_score(y_train, y_pred_train)*100

95.3125

In [28]:
print(y_train.shape, y_pred_train.shape)
print(y_test.shape, y_pred_cat.shape)

(640,) (640,)
(160,) (160,)


In [29]:
# comparing actual response values (y_test) with predicted response values (y_pred_cat) 
metrics.accuracy_score(y_test, y_pred_cat)*100

58.75

# 2.0 Classification report

In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_cat ,target_names=catagories))

                   precision    recall  f1-score   support

        pelayanan       0.62      0.60      0.61        65
            harga       0.00      0.00      0.00         4
           sistem       0.57      0.69      0.63        74
anekdot/serbaneka       0.50      0.24      0.32        17

         accuracy                           0.59       160
        macro avg       0.42      0.38      0.39       160
     weighted avg       0.57      0.59      0.57       160



  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# Melakukan proses pengujian
acc = accuracy_score(y_test, y_pred_cat)
cm = confusion_matrix(y_test, y_pred_cat)
print("Accuracy Score: " + str(acc))
print("Confusion Matrix: "+ str(cm))
print("Precision Score: "+ str(precision_score(y_test,y_pred_cat,average='weighted')))
print("Recall Score: "+ str(recall_score(y_test,y_pred_cat,average='weighted')))
print("F1 Score: "+ str(f1_score(y_test,y_pred_cat,average='weighted')))

Accuracy Score: 0.5875
Confusion Matrix: [[39  0 23  3]
 [ 0  0  4  0]
 [22  0 51  1]
 [ 2  0 11  4]]
Precision Score: 0.5696411851257357
Recall Score: 0.5875
F1 Score: 0.5709757716641104


  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
# Menguji model prediksi yang telah di buat
# dengan data masukan yang baru
tes = ['jelek aplikasinya',
       'pelayanannya bagus, dokter baik',
       'bagus berfungsi dengan baik',
       'bayarnya murah sekali',
      'tolong tambahin metode pembayarannya jelek amat']
# tes_features = vectorizer.transform(tes).toarray()

est = vectorizer.transform(tes)
test_aspect = pd.DataFrame(est.toarray(), columns=vectorizer.get_feature_names())
test_aspect.to_csv("data/data/fitur_test_aspect.csv")
# print(tes_features)
print("Review ini termasuk kategori "+etc.predict(test_aspect))

['Review ini termasuk kategori anekdot/serbaneka'
 'Review ini termasuk kategori pelayanan'
 'Review ini termasuk kategori pelayanan'
 'Review ini termasuk kategori harga'
 'Review ini termasuk kategori anekdot/serbaneka']


In [33]:
from sklearn.model_selection import train_test_split

X = data['review']
y = data['polarity']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [34]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(640,) (640,)
(160,) (160,)


In [35]:
X_train.head()

264    baik tangan suka panik diagnose baik langsung ...
615                                          cepat bagus
329    mudah repot repot antre antre rumah sakit klin...
342                                          bagus layan
394    konsultasi bayar pakai gopay gagal konsultasi ...
Name: review, dtype: object

In [36]:
X_test.head()

696                                    bantu mudah bagus
667                                              manfaat
63                       bagus konsultasi psikologi baik
533    puas bantu terimakasi aplikasi halodic dokter ...
66     bantu aplikasi lelah rumah bantu jelek aplikas...
Name: review, dtype: object

In [37]:
y_train.head()

264    positif
615    positif
329    positif
342    positif
394    negatif
Name: polarity, dtype: object

In [38]:
y_train.head()

264    positif
615    positif
329    positif
342    positif
394    negatif
Name: polarity, dtype: object

In [39]:
etc = ExtraTreesClassifier(criterion='entropy',class_weight='balanced_subsample', max_features=10)

# Memasang data latih ke classifier
etc.fit(train_data_features,y_train)

ExtraTreesClassifier(class_weight='balanced_subsample', criterion='entropy',
                     max_features=10)

In [40]:
# Melakukan prediksi dengan membandingkan dengan data tes
pred = etc.predict(train_data_features)

In [41]:
# Melakukan prediksi dengan membandingkan dengan data tes
pred_polarity = etc.predict(test_data_features)

In [42]:
pickle.dump(pred_polarity, open('pred_polarity', 'wb'))

In [43]:
tes2 = pd.Series(pred_polarity)
tes2.to_csv('data/data/predicted_polarity.csv', index=False)

In [44]:
# comparing actual response values (y_train) with predicted response values (y_pred_train) 
 
metrics.accuracy_score(y_train, pred)*100

100.0

In [45]:
# comparing actual response values (y_train) with predicted response values (y_pred_train) 
 
metrics.accuracy_score(y_test, pred_polarity)*100

91.875

In [46]:
# Melakukan proses pengujian
acc = accuracy_score(y_test, pred_polarity)
cm = confusion_matrix(y_test, pred_polarity)
print("Accuracy Score: " + str(acc))
print("Confusion Matrix: "+ str(cm))
print("Precision Score: "+ str(precision_score(y_test,pred_polarity,average='weighted')))
print("Recall Score: "+ str(recall_score(y_test,pred_polarity,average='weighted')))
print("F1 Score: "+ str(f1_score(y_test,pred_polarity,average='weighted')))

Accuracy Score: 0.91875
Confusion Matrix: [[ 21  10]
 [  3 126]]
Precision Score: 0.9164981617647058
Recall Score: 0.91875
F1 Score: 0.9146526586620926


In [47]:
# Menguji model prediksi yang telah di buat
# dengan data masukan yang baru
tes1 = ['jelek aplikasinya',
       'pelayanannya bagus, dokter baik',
       'bagus berfungsi dengan baik',
       'bayarnya murah sekali',
       'tolong tambahin metode pembayarannya jelek amat']

#preprocessing
tes1 = [review.lower() for review in tes1]

# tes_features = vectorizer.transform(tes).toarray()
est1 = vectorizer.transform(tes1)
test_sentiment_terms = pd.DataFrame(est1.toarray(), columns=vectorizer.get_feature_names())
test_sentiment_terms.to_csv("data/data/fitur_test_sentiment_terms.csv")
# print(tes_features)
print("Nilai sentiment review ini adalah "+ etc.predict(test_sentiment_terms))

['Nilai sentiment review ini adalah negatif'
 'Nilai sentiment review ini adalah positif'
 'Nilai sentiment review ini adalah positif'
 'Nilai sentiment review ini adalah positif'
 'Nilai sentiment review ini adalah negatif']


In [48]:
def create_result_dataframe(pred_1,pred_2):
  # Calling DataFrame constructor on predicted outputs
  resultant_df = pd.DataFrame(list(zip(pred_1,pred_2)), columns = ["predicted_catagories" , "predicted_polarity"])
  result  = pd.crosstab(resultant_df.predicted_catagories,resultant_df.predicted_polarity ,margins = True , margins_name = "Total")
  result["Ranking"] = ( result.Total/resultant_df.shape[0]) * 5.0
  result["Negatif in %"] = (result.negatif/result.Total) * 100
  result["Positif in %"] = (result.positif/result.Total) * 100
  del result["negatif"]
  del result["positif"]
  del result["Total"]

  return result

In [49]:
result = create_result_dataframe(y_pred_cat,pred_polarity)

In [50]:
result

predicted_polarity,Ranking,Negatif in %,Positif in %
predicted_catagories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
anekdot/serbaneka,1.96875,7.936508,92.063492
pelayanan,2.78125,17.977528,82.022472
sistem,0.25,37.5,62.5
Total,5.0,15.0,85.0


Hasil di atas menunjukkan bahwa Pemeringkatan dan semua Polaritas dalam persentase untuk semua kategori aspek.

Hasil observasi menandakan bahwa:
- Pelayanan memiliki ranking tertingi dengan nilai 2.53
- Sistem memiliki persentase negatif tertinggi dengan nilai 55%
- Anekdot/Serbaneka memiliki persentase positif tertinggi dengan nilai 94.2% dan Pelayanan dengan 80.2% hampir menyusul
- Harga tidak masuk karena kekurangan data

Klasifikasi kategori aspek 60%
Klasifikasi polaritas 92%