In [1]:
import pandas as pd
import numpy as np

In [15]:
# Load the raw dataset into a DataFrame.
# The location is an absolute path; adjust DATA_PATH when running in another environment.
DATA_PATH = '/content/tes.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(365, 2)

# Preprocessing

In [16]:
# Discard every row that contains at least one missing value
# (axis=0 / how='any' are the pandas defaults, spelled out for readability).
df = df.dropna(axis=0, how='any')

In [17]:
# Rename the columns positionally: first column -> 'tweet', second -> 'tipe'.
# pandas raises ValueError if the frame does not have exactly two columns.
COLUMN_NAMES = ['tweet', 'tipe']
df.columns = COLUMN_NAMES

In [18]:
# Report how many rows each column holds after cleaning.
# Both columns come from the same frame, so the two counts always match.
jt, jtipe = df['tweet'].shape, df['tipe'].shape
print("Jumlah data Tweet: ", jt[0], " Dan jumlah Data Tipe: ", jtipe[0])

Jumlah data Tweet:  195  Dan jumlah Data Tipe:  195


# MODELING

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [19]:
# x: the tweet column used as model input; y: the 'tipe' column used as the target.
x, y = df['tweet'], df['tipe']

In [20]:
# Hold out 30% of the rows for testing; the fixed seed keeps the partition reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [21]:
# Sanity check: training and test sizes should add up to the total row count.
print(
    f'Jumlah Data Training: {len(x_train)}',
    f'Jumlah Data Test: {len(x_test)}',
    f'Jumlah Total Data: {len(df)}',
    sep='\n',
)

Jumlah Data Training: 136
Jumlah Data Test: 59
Jumlah Total Data: 195


In [22]:
# Turn the raw tweets into TF-IDF feature vectors.
vectorizer = TfidfVectorizer()

# Cast both splits to unicode strings before vectorising.
train_texts = x_train.values.astype('U')
test_texts = x_test.values.astype('U')

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
x_train_vectorized = vectorizer.fit_transform(train_texts)
x_test_vectorized = vectorizer.transform(test_texts)

In [None]:
# Show the vectorised training data followed by the terms the vectorizer learned.
print("Hasil dari transformasi data pelatihan:", x_train_vectorized, sep="\n")

feature_names = vectorizer.get_feature_names_out()
print("\nFitur dari vektor tfidf:", feature_names, sep="\n")

In [34]:
# Linear-kernel SVM with C=1.0.
# `gamma` is a coefficient of the 'rbf', 'poly' and 'sigmoid' kernels only; it has
# no effect on a linear kernel, so it is not passed here to avoid implying it matters.
model = SVC(kernel='linear', C=1.0)

In [35]:
# Train the classifier on the TF-IDF vectors of the training split.
model.fit(X=x_train_vectorized, y=y_train)

In [36]:
# Predict a label for every held-out tweet, then preview the first ten predictions.
y_pred = model.predict(X=x_test_vectorized)

print(y_pred[0:10])

[4 3 4 4 3 4 3 3 4 4]


In [37]:
# Fraction of held-out tweets whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.97


# TRYING GRIDSEARCHCV

In [28]:
# Candidate hyper-parameter values for the manual search loop.
krn = ['linear', 'poly', 'rbf', 'sigmoid']

# C candidates: 1, 11, 21, 31, 41, 51.
rng_C = np.arange(1, 52, 10)

# Five log-spaced gamma candidates from 10**-3 to 10**2 inclusive.
# NOTE(review): five points over five decades step the exponent by 1.25, so the
# inner values are not exact powers of ten; num=6 would give 1e-3 ... 1e2 — confirm intent.
rng_gamma = np.logspace(start=-3, stop=2, num=5)

In [None]:
# Manual grid search over kernel / C / gamma.
#
# Each candidate is scored with 3-fold cross-validation on the TRAINING split only.
# Choosing hyper-parameters by their accuracy on the test split would leak the test
# data into model selection and make the reported test accuracy optimistic.
#
# Candidates are bound to a local name so the notebook-level `model` and `y_pred`
# from the earlier cells are not overwritten by the search.
best_score = -np.inf       # any real score beats this, so bi/bj/bk are always assigned
bi = bj = bk = None        # best kernel, C and gamma found so far
for i in krn:
  # gamma only influences the 'poly', 'rbf' and 'sigmoid' kernels; for 'linear'
  # a single gamma value is enough and avoids refitting identical models.
  gammas = rng_gamma[:1] if i == 'linear' else rng_gamma
  for j in rng_C:
    for k in gammas:
      candidate = SVC(kernel=i, C=j, gamma=k)
      fold_scores = cross_val_score(candidate, x_train_vectorized, y_train,
                                    cv=3, scoring='accuracy')
      acc_score = fold_scores.mean()
      # Strict '<' keeps the first combination encountered when scores tie.
      if best_score<acc_score:
        best_score=acc_score
        bi=i
        bj=j
        bk=k
print(best_score,bi,bj,bk)

In [29]:
# Search space handed to GridSearchCV: four kernels, six C values (1..51 in steps
# of 10) and five log-spaced gamma values between 10**-3 and 10**2.
param = dict(
    kernel=('linear', 'poly', 'rbf', 'sigmoid'),
    C=np.arange(1, 52, 10),
    gamma=np.logspace(-3, 2, num=5),
)

In [30]:
# Base estimator with default settings; the grid search supplies kernel, C and gamma.
model = SVC()

# Search over `param`, scoring each combination with 3-fold cross-validation.
grids = GridSearchCV(estimator=model, param_grid=param, cv=3)

In [31]:
# Run the search on the training split only; the test split stays untouched.
grids.fit(X=x_train_vectorized, y=y_train)

In [33]:
# Display the parameter combination the grid search selected as best.
grids.best_params_

{'C': 1, 'gamma': 0.001, 'kernel': 'linear'}