# Ekstraksi Fitur

In [1]:
import pandas as pd

from joblib import dump, load
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the pre-processed tweets and keep only the rows flagged with at least
# one of the three classes.
data = pd.read_csv('data/keluhan_processed.csv', encoding='ISO-8859-1')
has_class = (data['Keluhan'] == 1) | (data['Respon'] == 1) | (data['Other'] == 1)
data = data[has_class]

# Collapse the three flag columns into one numeric class code per row.
# Flags are read by column POSITION (1, 2, 3), not by name; the first flag
# equal to 1 wins.
# NOTE(review): presumably positions 1-3 are Keluhan/Respon/Other in that
# order -- confirm against the CSV header.
position_to_code = ((1, 1.0), (2, 2.0), (3, 3.0))
label = []
for row in data.values:
    for position, code in position_to_code:
        if row[position] == 1:
            label.append(code)
            break

# Place the label right after the first column, then keep only the
# TweetProcessed..Label column range.
data.insert(loc=1, column='Label', value=label)
data = data.loc[:, 'TweetProcessed':'Label']

X, y = data['TweetProcessed'], data['Label']

### Vektor dokumen (X1)

In [3]:
# X1: raw occurrence counts of word bigrams (ngram_range=(2, 2) keeps only
# two-word sequences as features).
vectorizer = CountVectorizer(ngram_range=(2, 2))
X1 = vectorizer.fit_transform(raw_documents=X)

# Persist the fitted vectorizer; dump's return value is what the cell displays.
dump(value=vectorizer, filename='model/vectorizer/vec1.joblib')

['model/vectorizer/vec1.joblib']

### Vektor TF-IDF tanpa normalisasi (X2)

In [4]:
# X2: TF-IDF weights of word bigrams with row normalisation switched off
# (norm=None).
vectorizer = TfidfVectorizer(ngram_range=(2, 2), norm=None)
X2 = vectorizer.fit_transform(raw_documents=X)

# Persist the fitted vectorizer; dump's return value is what the cell displays.
dump(value=vectorizer, filename='model/vectorizer/vec2.joblib')

['model/vectorizer/vec2.joblib']

### Vektor TF-IDF dengan normalisasi (X3)

In [5]:
# X3: TF-IDF weights of word bigrams using the vectorizer's default
# normalisation (no `norm` argument is passed).
vectorizer = TfidfVectorizer(ngram_range=(2, 2))
X3 = vectorizer.fit_transform(raw_documents=X)

# Persist the fitted vectorizer; dump's return value is what the cell displays.
dump(value=vectorizer, filename='model/vectorizer/vec3.joblib')

['model/vectorizer/vec3.joblib']

# _Modeling_

In [6]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Hold out 20% of the rows for testing, once per feature representation.
# Seeds are 0, 1, 2 for X1, X2, X3 respectively.
# NOTE(review): because each split uses a different random_state, the three
# feature sets are evaluated on different test rows; scores across X1/X2/X3
# are therefore not a like-for-like comparison.
splits = [
    train_test_split(features, y, test_size=0.2, random_state=seed)
    for seed, features in enumerate((X1, X2, X3))
]
X1_train, X1_test, y1_train, y1_test = splits[0]
X2_train, X2_test, y2_train, y2_test = splits[1]
X3_train, X3_test, y3_train, y3_test = splits[2]

## _Decision tree model_

In [9]:
# Single decision-tree estimator; the training cells below re-fit this same
# object on X1, X2 and X3 in turn.
tree_classifier = DecisionTreeClassifier(
    random_state=0,  # fixed seed so repeated fits give the same tree
)

### Train: X1

In [20]:
# Fit the decision tree on the bigram-count features (X1) and persist it.
tree_classifier.fit(X1_train, y1_train)
dump(tree_classifier, 'model/tree/tree1.joblib')

# Score on the held-out split. scikit-learn metrics take (y_true, y_pred) in
# that order; the original call had them swapped. Macro F1 and accuracy are
# symmetric in their arguments, so the printed values are unchanged.
prediction = tree_classifier.predict(X1_test)
print(prediction[:10])
print('F1 score: ', f1_score(y1_test, prediction, average='macro'))
print('Accuracy: ', accuracy_score(y1_test, prediction))

[3. 3. 2. 1. 3. 3. 3. 3. 2. 1.]
F1 score:  0.8590568569417046
Accuracy:  0.8371010638297872


### Train: X2

In [21]:
# Re-fit the decision tree on the un-normalised TF-IDF features (X2) and
# persist it.
tree_classifier.fit(X2_train, y2_train)
dump(tree_classifier, 'model/tree/tree2.joblib')

# Score on the held-out split. scikit-learn metrics take (y_true, y_pred) in
# that order; the original call had them swapped. Macro F1 and accuracy are
# symmetric in their arguments, so the printed values are unchanged.
prediction = tree_classifier.predict(X2_test)
print(prediction[:10])
print('F1 score: ', f1_score(y2_test, prediction, average='macro'))
print('Accuracy: ', accuracy_score(y2_test, prediction))

[1. 3. 2. 1. 3. 3. 1. 3. 1. 3.]
F1 score:  0.8533065656853438
Accuracy:  0.8331117021276596


### Train: X3

In [22]:
# Re-fit the decision tree on the normalised TF-IDF features (X3) and
# persist it.
tree_classifier.fit(X3_train, y3_train)
dump(tree_classifier, 'model/tree/tree3.joblib')

# Score on the held-out split. scikit-learn metrics take (y_true, y_pred) in
# that order; the original call had them swapped. Macro F1 and accuracy are
# symmetric in their arguments, so the printed values are unchanged.
prediction = tree_classifier.predict(X3_test)
print(prediction[:10])
print('F1 score: ', f1_score(y3_test, prediction, average='macro'))
print('Accuracy: ', accuracy_score(y3_test, prediction))

[3. 3. 1. 3. 3. 3. 3. 2. 1. 3.]
F1 score:  0.8396512615240778
Accuracy:  0.8164893617021277


## _SVM model_

In [24]:
# Single linear SVM estimator; the training cells below re-fit this same
# object on X1, X2 and X3 in turn.
svm_classifier = LinearSVC(
    random_state=0,  # fixed seed for reproducible fits
    max_iter=5000,   # upper bound on solver iterations
)

### Train: X1

In [25]:
# Fit the linear SVM on the bigram-count features (X1) and persist it.
svm_classifier.fit(X1_train, y1_train)
dump(svm_classifier, 'model/svm/svm1.joblib')

# Score on the held-out split. scikit-learn metrics take (y_true, y_pred) in
# that order; the original call had them swapped. Macro F1 and accuracy are
# symmetric in their arguments, so the printed values are unchanged.
prediction = svm_classifier.predict(X1_test)
print(prediction[:10])
print('F1 score: ', f1_score(y1_test, prediction, average='macro'))
print('Accuracy: ', accuracy_score(y1_test, prediction))

[3. 3. 2. 1. 3. 3. 3. 3. 2. 3.]
F1 score:  0.8902226740284904
Accuracy:  0.8710106382978723


### Train: X2

In [27]:
# Re-fit the linear SVM on the un-normalised TF-IDF features (X2) and
# persist it.
svm_classifier.fit(X2_train, y2_train)
dump(svm_classifier, 'model/svm/svm2.joblib')

# Score on the held-out split. scikit-learn metrics take (y_true, y_pred) in
# that order; the original call had them swapped. Macro F1 and accuracy are
# symmetric in their arguments, so the printed values are unchanged.
prediction = svm_classifier.predict(X2_test)
print(prediction[:10])
print('F1 score: ', f1_score(y2_test, prediction, average='macro'))
print('Accuracy: ', accuracy_score(y2_test, prediction))

[1. 3. 2. 1. 3. 3. 3. 3. 1. 3.]
F1 score:  0.873183262502148
Accuracy:  0.8543882978723404


### Train: X3

In [29]:
# Re-fit the linear SVM on the normalised TF-IDF features (X3) and persist it.
svm_classifier.fit(X3_train, y3_train)
dump(svm_classifier, 'model/svm/svm3.joblib')

# Score on the held-out split. scikit-learn metrics take (y_true, y_pred) in
# that order; the original call had them swapped. Macro F1 and accuracy are
# symmetric in their arguments, so the printed values are unchanged.
prediction = svm_classifier.predict(X3_test)
print(prediction[:10])
print('F1 score: ', f1_score(y3_test, prediction, average='macro'))
print('Accuracy: ', accuracy_score(y3_test, prediction))

[3. 3. 1. 3. 1. 3. 3. 2. 1. 3.]
F1 score:  0.8864070103949535
Accuracy:  0.8617021276595744


## _MLP model_

In [30]:
# Single multi-layer perceptron; the training cells below re-fit this same
# object on X1, X2 and X3 in turn.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(50, 50),  # two hidden layers, 50 units each
    random_state=0,               # fixed seed for reproducible fits
)

### Train: X1

In [31]:
# Fit the MLP on the bigram-count features (X1) and persist it.
mlp_classifier.fit(X1_train, y1_train)
dump(mlp_classifier, 'model/mlp/mlp1.joblib')

# Score on the held-out split. scikit-learn metrics take (y_true, y_pred) in
# that order; the original call had them swapped. Macro F1 and accuracy are
# symmetric in their arguments, so the printed values are unchanged.
prediction = mlp_classifier.predict(X1_test)
print(prediction[:10])
print('F1 score: ', f1_score(y1_test, prediction, average='macro'))
print('Accuracy: ', accuracy_score(y1_test, prediction))

[3. 3. 2. 1. 3. 3. 3. 3. 2. 1.]
F1 score:  0.8853673145880037
Accuracy:  0.8656914893617021


### Train: X2

In [32]:
# Re-fit the MLP on the un-normalised TF-IDF features (X2) and persist it.
mlp_classifier.fit(X2_train, y2_train)
dump(mlp_classifier, 'model/mlp/mlp2.joblib')

# Score on the held-out split. scikit-learn metrics take (y_true, y_pred) in
# that order; the original call had them swapped. Macro F1 and accuracy are
# symmetric in their arguments, so the printed values are unchanged.
prediction = mlp_classifier.predict(X2_test)
print(prediction[:10])
print('F1 score: ', f1_score(y2_test, prediction, average='macro'))
print('Accuracy: ', accuracy_score(y2_test, prediction))

[1. 3. 2. 1. 3. 1. 3. 1. 1. 3.]
F1 score:  0.8744004322517958
Accuracy:  0.8517287234042553


### Train: X3

In [33]:
# Re-fit the MLP on the normalised TF-IDF features (X3) and persist it.
mlp_classifier.fit(X3_train, y3_train)
dump(mlp_classifier, 'model/mlp/mlp3.joblib')

# Score on the held-out split. scikit-learn metrics take (y_true, y_pred) in
# that order; the original call had them swapped. Macro F1 and accuracy are
# symmetric in their arguments, so the printed values are unchanged.
prediction = mlp_classifier.predict(X3_test)
print(prediction[:10])
print('F1 score: ', f1_score(y3_test, prediction, average='macro'))
print('Accuracy: ', accuracy_score(y3_test, prediction))

[1. 3. 3. 3. 1. 3. 3. 2. 1. 3.]
F1 score:  0.8884593533568904
Accuracy:  0.8663563829787234
