In [1]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from google.colab import files
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Seed NumPy's global random number generator so that any NumPy-based
# randomness in this notebook is repeatable from run to run.
np.random.seed(1337)

## Load Training and Test Data

In [2]:
# --- Training data -----------------------------------------------------------
# Read the cleaned training tweets and keep only the rows whose text is present.
# The same mask is applied to the labels so that X and Y stay aligned.
my_df_training = pd.read_csv('clean_tweet_training.csv')
tweet_text_training = my_df_training['text']
target_training = my_df_training['target']
has_text_training = tweet_text_training.notna()
X_training = tweet_text_training[has_text_training]
Y_training = target_training[has_text_training]

# Only 1% of the training rows (part1) is used for fitting further down;
# the remaining 99% ends up in part2. random_state pins the shuffle.
X_train_part1, X_train_part2, Y_train_part1, Y_train_part2 = train_test_split(
    X_training,
    Y_training,
    train_size=0.01,
    random_state=42,
)

# --- Test data ---------------------------------------------------------------
# Read the test tweets, discard every row whose target equals 2, then apply
# the same "text must be present" filter as for the training data.
my_df_test = pd.read_csv('clean-tweet-test-fixed.csv')
my_df_test = my_df_test[my_df_test['target'] != 2]
tweet_text_test = my_df_test['text']
target_test = my_df_test['target']
has_text_test = tweet_text_test.notna()
X_test = tweet_text_test[has_text_test]
Y_test = target_test[has_text_test]



In [3]:
# Number of tweets in the 1% training subset produced by train_test_split.
len(X_train_part1)

15927

## Feature Extraction using Bag-of-Words <br>
Untuk eksperimen ini, kita hanya akan menggunakan 700 kata yang paling sering muncul sebagai fitur.

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder whose vocabulary is capped at 700 terms.
vectorizer = CountVectorizer(max_features=700)

# Learn the vocabulary from the training subset only and encode it in one step.
X_training_vector = vectorizer.fit_transform(X_train_part1)

vocabulary_size = len(vectorizer.vocabulary_)
print("Number of features:  %d" % vocabulary_size)

# The test tweets are encoded with the vocabulary learned above (no refit).
X_test_vector = vectorizer.transform(X_test)

Number of features:  700


## Feature Scaling
Sebelum melakukan klasifikasi, sebaiknya dilakukan feature scaling agar nilai feature seragam (tidak terlalu besar atau terlalu kecil)

In [5]:
from sklearn.preprocessing import StandardScaler

# with_mean=False disables centering, so features are only divided by their
# standard deviation.
scaler = StandardScaler(with_mean=False)

# Fit the scaling statistics on the training matrix and scale it in one call,
# then apply those same training statistics to the test matrix.
X_training_vector = scaler.fit_transform(X_training_vector)
X_test_vector = scaler.transform(X_test_vector)



## Classification using SVM <br>
Klasifikasi menggunakan SVM dengan linear kernel

In [6]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# Linear-kernel support vector classifier trained on the scaled
# bag-of-words features of the training subset.
clf = svm.SVC(kernel='linear')

print("Training Classifier...")
clf.fit(X_training_vector, Y_train_part1)

print("Predicting...")
prediction = clf.predict(X_test_vector)

# Fraction of test tweets whose predicted label matches the true label.
accuracy = accuracy_score(y_true=Y_test, y_pred=prediction)
print('Accuracy:', accuracy)

Training Classifier...
Predicting...
Accuracy: 0.7493036211699164


## Saving Model

In [7]:
import pickle

# Persist the trained classifier to disk.
# NOTE: only the SVM itself is written. The fitted CountVectorizer and
# StandardScaler are not saved, so this file alone cannot classify raw text;
# inputs must already be vectorized and scaled the same way.
filename = 'SVM_model.sav'

# The context manager flushes and closes the file handle even if pickling
# raises, instead of leaving an open handle behind.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [8]:
# Reload the pickled classifier and check that it reproduces the test accuracy.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load model files that were produced by a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# score() returns the mean accuracy of the model on the given test data.
result = loaded_model.score(X_test_vector, Y_test)
print(result)

0.7493036211699164


array(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), dtype=object)