# Hyperparameter tuning

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

### Data preparation

In [None]:
# Load the cleaned dataset and keep an independent copy of its first 15,000 rows.
df = pd.read_csv("./datasets/cleaned.csv").iloc[:15000].copy()

# Features are every column except the label; the label is the 'category' column.
y = pd.Categorical(df['category'])
X = df.drop(columns='category')

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Cast the text column to unicode strings before handing it to the vectorizer.
train_text = X_train['clean_text'].values.astype('U')
test_text = X_test['clean_text'].values.astype('U')

# Fit the TF-IDF vocabulary on the training text only, then reuse it for the test text.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(train_text)
X_test_vec = vectorizer.transform(test_text)

### Hyper-parameter tuning for Logistic Regression

In [None]:
# Hyperparameters that are searched for every penalty/solver combination.
shared_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'max_iter': [1000, 10000],
    'random_state': [42],
}

# Not every solver supports every penalty, so a single flat grid would spend
# most of its candidates on combinations that cannot be fitted. GridSearchCV
# accepts a list of grids; each entry below only pairs a penalty with solvers
# that support it.
param_grid = [
    # L2 is supported by all four solvers.
    {
        **shared_grid,
        'penalty': ['l2'],
        'solver': ['newton-cg', 'liblinear', 'sag', 'saga'],
    },
    # L1 is only supported by liblinear and saga.
    {
        **shared_grid,
        'penalty': ['l1'],
        'solver': ['liblinear', 'saga'],
    },
    # Elastic-net is only supported by saga and requires an explicit l1_ratio.
    {
        **shared_grid,
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.25, 0.5, 0.75],
    },
]

# Create a base model
lr = LogisticRegression()

# Instantiate the grid search model (5-fold cross-validation, scored by accuracy)
grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the vectorized training data
grid_search.fit(X_train_vec, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

### Hyper-parameter tuning for Boosting

In [None]:
# learning_rate scales the contribution of each weak learner (default 1.0).
# Values that are orders of magnitude above 1 make the boosting sample weights
# overflow, so the candidates are kept in the neighbourhood of the default.
param_grid = {
    'n_estimators': [10, 40, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0, 2.0],
}
boost = AdaBoostClassifier()

# 5-fold cross-validated search, scored by accuracy.
grid_search = GridSearchCV(boost, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_vec, y_train)

# NOTE: despite its name, this holds the fitted best estimator (a model), not
# a parameter dict. The name is kept so code that reads `best_params` after
# this cell keeps working.
best_params = grid_search.best_estimator_

# Evaluate the winning model on the held-out test split.
accuracy = best_params.score(X_test_vec, y_test)
print(f"Best estimators: {grid_search.best_params_}, Best Model Accuracy: {accuracy}")

### Hyper-parameter tuning for Naive Bayes

In [None]:
# Candidate smoothing strengths and whether to learn class priors from the data.
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0],
    'fit_prior': [True, False],
}

nb_classifier = MultinomialNB()

# 5-fold cross-validated search, scored by accuracy.
grid_search = GridSearchCV(nb_classifier, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train_vec, y_train)

# Score the estimator selected by THIS search. Reading a variable left over
# from an earlier cell would report a different model's accuracy (or fail if
# that variable holds a parameter dict).
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test_vec, y_test)
print(f"Best estimators: {grid_search.best_params_}, Best Model Accuracy: {accuracy}")

### Hyper-parameter tuning for kNN

In [None]:
# The neighbour count is exposed by KNeighborsClassifier as `n_neighbors`;
# grid keys must match the estimator's parameter names exactly, otherwise
# the search fails with an invalid-parameter error.
param_grid = {
    'n_neighbors': list(range(1, 25)),
}

knn_classifier = KNeighborsClassifier()

# 5-fold cross-validated search, scored by accuracy.
grid_search = GridSearchCV(knn_classifier, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train_vec, y_train)

# Score the estimator selected by THIS search. Reading a variable left over
# from an earlier cell would report a different model's accuracy (or fail if
# that variable holds a parameter dict).
best_model = grid_search.best_estimator_
accuracy = best_model.score(X_test_vec, y_test)
print(f"Best estimators: {grid_search.best_params_}, Best Model Accuracy: {accuracy}")