In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pickle

In [49]:
data = pd.read_csv("dataset/dataset_clean_labelled.csv")

In [48]:
sw = [line.rstrip('\n') for line in open('raw/stopword')]

In [50]:
trainX, testX, trainY, testY = train_test_split(data["TWEET"].values.astype(str), data["LABEL"].values.astype(int), test_size=0.2)

In [51]:
vectorizer = TfidfVectorizer(stop_words=sw)
x = vectorizer.fit_transform(data["TWEET"].values.astype(str))

x_train = vectorizer.transform(trainX)
x_test = vectorizer.transform(testX)

In [118]:
clf = SGDClassifier(loss="log")
clf.fit(x_train, trainY) 

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [119]:
pre = clf.predict_proba(x_test)
hasil = np.argmax(pre, axis=1)

In [121]:
accuracy_score(testY, hasil)

0.788235294117647

In [111]:
precision_score(testY, hasil)

0.7407407407407407

In [112]:
recall_score(testY, hasil)

0.9090909090909091

In [113]:
f1_score(testY, hasil)

0.8163265306122449

In [105]:
filename_model = 'svm.mdl'
pickle.dump(clf, open("model/"+filename_model, 'wb'))

In [106]:
filename_vectorizer = 'vectorizer.mdl'
pickle.dump(vectorizer, open("model/"+filename_vectorizer, 'wb'))

In [3]:
loaded_model = pickle.load(open("model/svm.mdl", 'rb'))

In [4]:
loaded_vectorizer = pickle.load(open("model/vectorizer.mdl", 'rb'))

In [33]:
d = loaded_vectorizer.transform(['qwerty']).toarray()[0]

In [38]:
loaded_vectorizer.transform(["bangsat"])

<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [37]:
loaded_vectorizer.fit_transform(["aih"])

<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [36]:
c.any() == d.any()

True

In [67]:
p = clf.predict_proba(vectorizer.transform(["bangsat"]))

In [68]:
np.argmax(p[0], axis=0)

0

In [69]:
vectorizer.vocabulary_

{'genre': 447,
 'musik': 925,
 'energic': 387,
 'ngeneki': 969,
 'sabi': 1172,
 'thats': 1361,
 'mawang': 821,
 'ash': 71,
 'mbuhlah': 824,
 'berusaha': 188,
 'senetral': 1217,
 'sayang': 1190,
 'idol': 520,
 'korea': 712,
 'saudaraan': 1189,
 'keberatan': 642,
 'bkalo': 208,
 'disandarkan': 353,
 'hiphop': 507,
 'mulia': 917,
 'beliau': 136,
 'sedih': 1202,
 'banget': 111,
 'temen': 1328,
 'hina': 506,
 'sukain': 1290,
 'tau': 1316,
 'benci': 138,
 'bang': 110,
 'kpop': 722,
 'yaa': 1462,
 'suka': 1288,
 'ide': 518,
 'pecah': 1062,
 'sih': 1248,
 'sepengalaman': 1228,
 'edukasi': 376,
 'jenis': 587,
 'apapun': 59,
 'rock': 1166,
 'ballad': 106,
 'terbaik': 1334,
 'yaallah': 1466,
 'jodoh': 598,
 'aesthetic': 9,
 'yaallahyang': 1467,
 'moto': 905,
 'alam': 22,
 'absurd': 3,
 'seni': 1218,
 'lagu': 738,
 'dikenalin': 336,
 'almarhum': 30,
 'sepupu': 1229,
 'kenal': 661,
 'bangsat': 115,
 'berdiri': 158,
 'ranah': 1143,
 'mu': 909,
 'mendiskriminasikan': 854,
 'orang': 1029,
 'berdasarka

In [71]:
len(vectorizer.idf_)

1476

In [77]:
vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': ['ada',
  'adalah',
  'adanya',
  'adapun',
  'agak',
  'agaknya',
  'agar',
  'akan',
  'akankah',
  'akhir',
  'akhiri',
  'akhirnya',
  'aku',
  'akulah',
  'amat',
  'amatlah',
  'anda',
  'andalah',
  'antar',
  'antara',
  'antaranya',
  'apa',
  'apaan',
  'apabila',
  'apakah',
  'apalagi',
  'apatah',
  'artinya',
  'asal',
  'asalkan',
  'atas',
  'atau',
  'ataukah',
  'ataupun',
  'awal',
  'awalnya',
  'bagai',
  'bagaikan',
  'bagaimana',
  'bagaimanakah',
  'bagaimanapun',
  'bagi',
  'bagian',
  'bahkan',
  'bahwa',
  'bahwasanya',
  'baik',
  'bakal',
  'bakalan',
  'balik',
  'banyak',
  'bapak',
  'baru',
  'bawah',
  'beberapa',
  'begini',
  'beginian',
  'beginikah