In [11]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [12]:
# Both splits are parsed with the same CSV dialect: fields are wrapped in
# single quotes and embedded quotes are escaped with a backslash.
_CSV_DIALECT = {"quotechar": "'", "escapechar": "\\"}

train = pd.read_csv('ReutersGrain-train.csv', **_CSV_DIALECT)
test = pd.read_csv('ReutersGrain-test.csv', **_CSV_DIALECT)

# Columns are selected by position: column 0 is the model input (X),
# column 1 is the target (y).
X_train = train.iloc[:, 0]
y_train = train.iloc[:, 1]
X_test = test.iloc[:, 0]
y_test = test.iloc[:, 1]

In [13]:
# word_tokenize relies on the Punkt sentence tokenizer models. Older NLTK
# releases load them from the "punkt" package, while NLTK >= 3.8.2 looks for
# "punkt_tab" instead, so fetch both to keep the notebook runnable on either.
# nltk.download skips packages that are already present and up to date.
for _nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_nltk_resource)

# Lazily populated cache for the objects that are identical for every document.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _PREPROCESS_RESOURCES["stemmer"] = PorterStemmer()
    return _PREPROCESS_RESOURCES["stop_words"], _PREPROCESS_RESOURCES["stemmer"]


def preprocess_text(text):
    """Normalise one document into a space-separated string of word stems.

    Steps: tokenize with ``word_tokenize``, keep purely alphabetic tokens,
    lowercase them, drop English stop words, and Porter-stem what remains.

    Parameters
    ----------
    text : str
        Raw document. A missing value (``None`` / NaN / ``pd.NA``) is treated
        as an empty document; any other non-string scalar is converted with
        ``str`` first.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (``""`` if none survive).
    """
    if not isinstance(text, str):
        # pandas yields NaN for empty CSV cells, which word_tokenize rejects.
        if pd.isna(text):
            return ""
        text = str(text)

    # The stop-word set and the stemmer do not depend on the document, so they
    # are built once and reused instead of being recreated on every call.
    stop_words, stemmer = _preprocess_resources()

    stems = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            stems.append(stemmer.stem(lowered))
    return " ".join(stems)

# Run every document of each split through preprocess_text; the results are
# plain Python lists holding one normalised string per input row.
X_train_preprocessed = list(map(preprocess_text, X_train))
X_test_preprocessed = list(map(preprocess_text, X_test))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pedro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pedro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [14]:
# TF-IDF with default settings. The vectorizer is fitted on the training
# documents only; the test documents are merely projected onto what was
# learned there (the tuple is evaluated left to right, so the fit happens
# before the test transform).
vectorizer = TfidfVectorizer()
X_train_transformed, X_test_transformed = (
    vectorizer.fit_transform(X_train_preprocessed),
    vectorizer.transform(X_test_preprocessed),
)

In [15]:
# Base estimator whose hyper-parameters are tuned below.
classifier = SVC()

# Search space: 4 values of C x 4 values of gamma x 3 kernels = 48 candidates.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf', 'poly', 'sigmoid'],
)

# Exhaustive 5-fold cross-validated search; refit=True retrains the best
# candidate on the whole training set once the search finishes.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    cv=5,
    refit=True,
    verbose=1,
)
grid_search.fit(X_train_transformed, y_train)

# Evaluate the refitted winner on the held-out test split.
best_classifier = grid_search.best_estimator_
y_pred = best_classifier.predict(X_test_transformed)
accuracy = accuracy_score(y_test, y_pred)

print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'C': 100, 'gamma': 0.1, 'kernel': 'sigmoid'}
Accuracy: 0.9768211920529801
