In [1]:
import pandas as pd
import numpy as np
from normalizationModul5 import normalize_corpus
from utilsModul5 import build_feature_matrix

import nltk
import string
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.models import Word2Vec
from gensim import models



In [2]:
dataset = pd.read_csv('pesawatlionair.csv')
dataset.head()

Unnamed: 0,waktu,tweets,label
0,2018-11-03 23:40:24,@AgisniNina @rmiryanti Tetap #01JokowiLagi #01...,negatif
1,2018-11-03 18:02:36,"Ya, usia nggak ada yang tahu. Setidaknya kita ...",positif
2,2018-11-03 17:58:33,Saya menjual Jaket Hoodie Asian Games 2018. ma...,negatif
3,2018-11-03 17:50:31,Bayangin saja dalam waktu 90 detik kamu harus ...,positif
4,2018-11-03 17:42:47,"Duka mendalam atas meninggalnya Syahrul Anto, ...",positif


In [3]:
feature = dataset.iloc[:,1]
label = dataset.iloc[:,2]
print(feature[1:5])
print("-----------------------------")
print(label[1:5])

1    Ya, usia nggak ada yang tahu. Setidaknya kita ...
2    Saya menjual Jaket Hoodie Asian Games 2018. ma...
3    Bayangin saja dalam waktu 90 detik kamu harus ...
4    Duka mendalam atas meninggalnya Syahrul Anto, ...
Name: tweets, dtype: object
-----------------------------
1    positif
2    negatif
3    positif
4    positif
Name: label, dtype: object


In [4]:
def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
                                                       test_size=0.33, random_state=42)
    return train_X, test_X, train_Y, test_Y

def remove_empty_docs(corpus, labels):
    filtered_corpus = []
    filtered_labels = []
    for doc, label in zip(corpus, labels):
        if doc.strip():
            filtered_corpus.append(doc)
            filtered_labels.append(label)
    return filtered_corpus, filtered_labels

In [5]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.cross_validation import train_test_split



In [6]:
train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(feature,
                                                                       label,
                                                                       test_data_proportion=0.1)

In [7]:
train_corpus

879     Turut bersimpati atas musibah penerbangan Lion...
239     Lion Air telah memesan pesawat Boeing 737 Max ...
361     Ucapan takziah diucapkan kepada keluarga mangs...
211     Selamat jalan sahabat, beristirahatlah dalam d...
788     teruuuss...dgn entengnya kau buat status..klu ...
445     Para delegasi yang menghadiri acara Konferensi...
530     Semoga Allah mudahkan urusan balik semenanjung...
227     Kemendagri Mudahkan Pengurusan Akta Kematian K...
941     Presiden telah menginstruksikan Kepala BNPP be...
572     Semoga semuanya segera ditemukan #LionAir #Pra...
423     Satu Jenazah Korban Lion Air Teridentifikasi, ...
874     Sulitnya Pencarian Badan Lion Air JT 610 Didug...
177         semoga tabir segera tersingkap #PrayForJT610 
1026    #PrayForJT610 Pesawat Lion Air JT 610 jatuh di...
895     Rencananya, malam ini Persib akan bertanding m...
449     Panglima TNI Marsekal Hadi Tjahjanto mengataka...
543     Pesawat JT-610 yang jatuh mempunyai Certificat...
774     Ya All

In [8]:
test_corpus

31      Abis baca berita, tim penyelam #LionAirJT610 d...
413     #JT610\nTerakhir jadi satu sama kak mery sama ...
536                         Sudah dua hari #PrayForJT610'
960     Pemerintah lakukan upaya terbaik untuk menemuk...
793     Turut Berduka Cita #PrayForJT610 #AllahTheMerc...
740     MENGENAL KOTAK HITAM\n\nUntuk mengungkap penye...
950     Presiden jokowi sudah perintahkan basarnas. TN...
721     Di luar agenda kepresidenan, Presiden Jokowi p...
86      Baper gara" JT 610 \xf0\x9f\x98\xad #PrayForLi...
876     nggak hanya langit yang mendung hari ini,hati ...
113     Presiden @jokowi tinjau Posko Evakuasi terpadu...
1056    Doa kita untuk #JT610, semoga amal i an diteri...
1033    Hanya doa yang bisa kita jadikan jalan untuk m...
784     Semoga diberi kemudahan untuk pencaharian dan ...
306     @mtrmkg10 Yamaha sama #PrayForJT610 apa hubung...
1040    Semoga badan pesawat dan para korban bisa sege...
885     Basarnas Berhasil Menemukan 26 Jenazah Korban ...
425     #PrayF

In [9]:
norm_train_corpus = normalize_corpus(train_corpus,
                                      lemmatize=True,
                                      only_text_chars=True)
norm_test_corpus = normalize_corpus(test_corpus,
                                      lemmatize=True,
                                      only_text_chars=True)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [11]:
def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2',
                                  smooth_idf=True,
                                  use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix

In [12]:
def tfidf_extractor(corpus, ngram_range=(1,1)):
    vectorizer = TfidfVectorizer(min_df=1,
                                norm='l2',
                                smooth_idf=True,
                                use_idf=True,
                                ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [13]:
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

tokenized_train = [nltk.word_tokenize(text)
                  for text in norm_train_corpus]
tokenized_test = [nltk.word_tokenize(text)
                 for text in norm_test_corpus]

model = gensim.models.Word2Vec(tokenized_train,
                              size=500,
                              window=100,
                              min_count=30,
                              sample=1e-3)


In [14]:
from sklearn import metrics
import numpy as np

def get_metrics(true_labels, predicted_labels):
    print('Accuracy: ', np.round(metrics.accuracy_score(true_labels,
                                                     predicted_labels),2))
    print('Precision: ', np.round(metrics.precision_score(true_labels,
                                                     predicted_labels,
                                                        average='weighted'),2))
    print('Recall: ', np.round(metrics.recall_score(true_labels,
                                                     predicted_labels,
                                                        average='weighted'),2))
    print('F1 Score: ', np.round(metrics.f1_score(true_labels,
                                                     predicted_labels,
                                                        average='weighted'),2))
    

In [15]:
#def train_predict_evaluate_model(classifier,
#                                train_features, train_labels,
#                                test_features, test_labels):
#    classifier.fit(train_features, train_labels)
#    predictions = classifier.predict(test_features)
#    get_metrics(true_labels=test_labels,
#               predicted_labels=predictions)
#    return predictions#

In [16]:
from sklearn.linear_model import SGDClassifier

train_features=tfidf_train_features
train_labels=train_labels
test_features=tfidf_test_features
test_labels=test_labels

clsfr = SGDClassifier(loss='hinge', n_iter=100)
clsfr.fit(train_features, train_labels)
predictions = clsfr.predict(test_features)
    
print("Label test: " +format(test_labels[:10]))
print("Prediction test: "+ format(predictions[:5]))
print("Accuration: "+format(clsfr.score(test_features,test_labels)))




Label test: 31     positif
413    positif
536     netral
960    positif
793    positif
740     netral
950     netral
721     netral
86      netral
876     netral
Name: label, dtype: object
Prediction test: ['netral' 'netral' 'netral' 'positif' 'positif']
Accuration: 0.6182336182336182


In [17]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(tfidf_vectorizer,clsfr)
pipe.fit(train_corpus,train_labels)

print(pipe.score(test_corpus,test_labels))

tempFeature=normalize_corpus(feature)



0.6182336182336182


In [18]:
tempData=pd.DataFrame(np.column_stack([feature,label]),columns=('Feature','Label'))
tempData
tempRest = [pipe,tempData]
line = np.array(['Siapa aku ini? Yang bukan meenjadi siapa - siapa bagimu'])
pipe.predict(line)
joblibFile = "E:\SGDClassifierSentence.pkl"
from sklearn.externals import joblib
joblib.dump(tempRest,joblibFile)

['E:\\SGDClassifierSentence.pkl']

In [19]:
def get_metrics(true_labels, predicted_labels):
    print('Accuracy: ', np.round(metrics.accuracy_score(true_labels,
                                                     predicted_labels),2))
    print('Precision: ', np.round(metrics.precision_score(true_labels,
                                                     predicted_labels,
                                                        average='weighted'),2))
    print('Recall: ', np.round(metrics.recall_score(true_labels,
                                                     predicted_labels,
                                                        average='weighted'),2))
    print('F1 Score: ', np.round(metrics.f1_score(true_labels,
                                                     predicted_labels,
                                                        average='weighted'),2))
get_metrics(true_labels=test_labels,predicted_labels=predictions)

Accuracy:  0.62
Precision:  0.61
Recall:  0.62
F1 Score:  0.61


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
