# Import Libraries

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Open Dataset

In [2]:
# Read the CSV into a DataFrame; the trailing bare name makes the notebook
# render the frame as this cell's output.
df = pd.read_csv(filepath_or_buffer='data_clean.csv')
df

Unnamed: 0,data,label
0,ayo klaim voucher diskon ribunya biar tanggal ...,1
1,gini jajan ortu gak dehh ikutin mimin aplikasi...,1
2,instal snack video diplaystore ketuk tautan,1
3,buka aplikasi snackvideo log in fb no hp klik ...,1
4,tips and trik join grupnya postingan hilang,1
...,...,...
994,selamat siangizin gabung izin sharedijual tana...,1
995,dijual tanah sawah pinggir jalan rayabisa sawa...,1
996,rumah dijual bogor kota spesifikasi gambar hub...,1
997,jual tanah luas hektarsurat girikjual harga di...,1


# Cek Missing Value

In [3]:
# Per-column count of missing entries (isnull is pandas' alias of isna).
df.isnull().sum()

data     2
label    0
dtype: int64

# Drop Missing Value

In [4]:
# Drop every row that contains a missing value. Rebinding `df` (rather than
# using inplace=True) avoids mutating the frame object that earlier cells
# already displayed and keeps the statement chainable.
df = df.dropna()
# Sanity check: every column should now report zero missing entries.
df.isna().sum()

data     0
label    0
dtype: int64

# Inisialisasi X dan y

In [5]:
# Features are the text column, the target is the label column.
X, y = df['data'], df['label']

# Train/Test Split (data latih dan data uji)

In [6]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the
# split, and therefore the trained model and its reported metrics, are
# reproducible across kernel restarts.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
# NOTE(review): the label distribution looks imbalanced; consider adding
# stratify=y once it is confirmed that every class has at least two rows.

In [7]:
# Display the held-out texts (pandas truncates long Series in the output).
X_test

665    assalamualaikum selamat siang alhamdulillah pl...
352    qnu platinum speaker quran lengkap juz dilengk...
255        menjual puluh s trip potongan dapatkan shopee
712    alhamdulillah cairkan semoga sehat lancarkan r...
993                                    open vcs berbayar
                             ...                        
894    dijual motor mio surat komplit pajak hidup tan...
308                           good day mocacinno renceng
524                                    gas dapet banyakk
776    fresh tertinggi aja leg lesley zodiak colect y...
164                                            pramugari
Name: data, Length: 200, dtype: object

## TF-IDF

In [8]:
# Standalone vectorizer used to inspect the TF-IDF features of the training
# texts: learn the vocabulary/IDF weights, then encode the same texts.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit(X_train).transform(X_train)

In [9]:
# TF-IDF vector of the first training document as a Series (one entry per
# vocabulary term). The sparse row is sliced *before* densifying so only one
# row is materialised instead of the whole document-term matrix.
X_train_tfidf_series = pd.Series(X_train_tfidf[0].toarray().ravel())
X_train_tfidf_series


0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
3507    0.0
3508    0.0
3509    0.0
3510    0.0
3511    0.0
Length: 3512, dtype: float64

In [10]:
# Two-stage pipeline: TF-IDF vectorisation followed by a linear-kernel SVM.
# Fitting the pipeline fits the vectorizer and the classifier together on the
# training texts.
steps = [
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC(kernel='linear', C=1000.0)),
]
model = Pipeline(steps)
model.fit(X_train, y_train)

In [11]:
# Predicted labels for the held-out texts; the pipeline applies its own
# fitted TF-IDF step before classifying.
predict = model.predict(X=X_test)

In [12]:
# Display the array of predicted labels for the test set.
predict

array([3, 1, 1, 3, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 3, 1, 1, 2, 1, 3, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 3, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1,
       1, 1, 3, 1, 1, 3, 1, 3, 3, 1, 1, 1, 3, 1, 3, 1, 3, 1, 1, 1, 1, 3,
       1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 3, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 1, 1,
       1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 3,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [13]:
# Per-class precision / recall / F1 of the predictions against the true
# test labels; printed so the text table keeps its alignment.
report = classification_report(y_true=y_test, y_pred=predict)
print(report)

              precision    recall  f1-score   support

           1       0.95      1.00      0.97       156
           2       0.88      0.50      0.64        14
           3       1.00      0.93      0.97        30

    accuracy                           0.95       200
   macro avg       0.94      0.81      0.86       200
weighted avg       0.95      0.95      0.95       200



In [14]:
# Persist the fitted pipeline (vectorizer + classifier) to disk so it can be
# reloaded later without retraining. joblib is imported in the notebook's
# top import cell.
joblib.dump(value=model, filename='svm_last.joblib')

['svm_last.joblib']

In [15]:
# NOTE(review): looks like a leftover smoke-test cell unrelated to the
# analysis — confirm whether it can be removed.
message = "Hello world"
print(message)

Hello world
