In [1]:
import re
import unicodedata

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
# Read the dataset from a CSV path given relative to the working directory.
df = pd.read_csv(filepath_or_buffer='../data/train_dataset.csv')

In [3]:
def preprocess_text_vi(text):
    """Clean a (Vietnamese) text string before vectorization.

    Steps, in order: compose the text to Unicode NFC, drop every character
    that is neither a word character nor whitespace, lowercase, and collapse
    all whitespace runs into single spaces (also trimming both ends).

    Args:
        text: The raw string. ``None`` and float NaN are treated as a
            missing value.

    Returns:
        The cleaned string; ``''`` for a missing value.

    Raises:
        TypeError: If ``text`` is neither a string nor a missing value.
    """
    # Missing cells would otherwise crash re.sub; `x != x` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Compose base letters with their combining marks first: `\w` does not
    # match standalone combining characters, so decomposed (NFD) input would
    # have its diacritics stripped by the substitution below.
    text = unicodedata.normalize('NFC', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    # split() with no argument splits on any whitespace run and ignores
    # leading/trailing whitespace, so the join yields single-space separation.
    return ' '.join(text.split())

def numerize(df):
    """Add an integer 'label' column to ``df`` based on its 'genre' column.

    The frame is modified in place and nothing is returned. Genres listed in
    the mapping below get codes 1-4; every other row keeps the default 0.
    """
    genre_codes = {
        "giao-duc": 1,
        "xe": 2,
        "suc-khoe": 3,
        "cong-nghe-game": 4,
    }

    # Start with the default code everywhere, then overwrite the known genres.
    df['label'] = 0
    for genre_name, code in genre_codes.items():
        df.loc[df['genre'] == genre_name, "label"] = code

In [4]:
# Clean each title with preprocess_text_vi (overwriting the 'title' column),
# then let numerize add the integer 'label' column to the same frame.
df['title'] = df['title'].apply(preprocess_text_vi)
numerize(df)

In [5]:
from sklearn.model_selection import GridSearchCV

# Inputs are the title strings; targets are the integer labels.
X, y = df['title'], df['label']

# Hyper-parameter search space. Keys use the `<step>__<param>` convention to
# reach into the pipeline steps named 'tfidf' and 'clf'.
param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],      # unigrams only, or unigrams + bigrams
    tfidf__min_df=np.arange(1, 4),            # 1, 2, 3
    clf__estimator__kernel=['rbf'],
    clf__estimator__C=[0.1, 1, 10, 100],      # Values for C parameter
    clf__estimator__gamma=['scale', 'auto', 0.1, 0.01],  # Values for gamma parameter
)

In [6]:
# Two-step model: TF-IDF features feeding a one-vs-rest wrapper around an SVC.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(estimator=SVC())),
])

# Try every combination in param_grid, scoring each by accuracy with cv=3.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
grid_search.fit(X, y)

print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

Best parameters found:  {'clf__estimator__C': 10, 'clf__estimator__gamma': 'scale', 'clf__estimator__kernel': 'rbf', 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2)}
Best accuracy found:  0.8679997362632093
