In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
data = pd.read_csv('data\preprocessed.csv').dropna()

In [3]:
X = data["Reviews_cleaned"]
y = data["Category"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
vectorizer = TfidfVectorizer(lowercase=False)
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [5]:
model_logreg = LogisticRegression(random_state=0).fit(X_train, y_train)
y_true_logreg = y_test
y_pred_logreg = model_logreg.predict(X_test)
print('Wyniki dla regresji logistycznej:')
print(classification_report(y_true_logreg, y_pred_logreg))
print(f'cohen_kappa_score: {cohen_kappa_score(y_true_logreg, y_pred_logreg)}')
print(f'f1_score: {f1_score(y_true_logreg, y_pred_logreg)}')

Wyniki dla regresji logistycznej:
              precision    recall  f1-score   support

         0.0       0.80      0.47      0.59      2485
         1.0       0.80      0.95      0.87      5620

    accuracy                           0.80      8105
   macro avg       0.80      0.71      0.73      8105
weighted avg       0.80      0.80      0.78      8105

cohen_kappa_score: 0.47045205529298084
f1_score: 0.868189233278956


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
model_knn = KNeighborsClassifier(n_neighbors=20)
model_knn.fit(X_train, y_train)
y_true_knn = y_test
y_pred_knn = model_knn.predict(X_test)
print('Wyniki dla KNN:')
print(classification_report(y_true_knn, y_pred_knn))
print(f'cohen_kappa_score: {cohen_kappa_score(y_true_knn, y_pred_knn)}')
print(f'f1_score: {f1_score(y_true_knn, y_pred_knn)}')

Wyniki dla KNN:
              precision    recall  f1-score   support

         0.0       0.65      0.45      0.53      2485
         1.0       0.79      0.89      0.84      5620

    accuracy                           0.76      8105
   macro avg       0.72      0.67      0.68      8105
weighted avg       0.74      0.76      0.74      8105

cohen_kappa_score: 0.37324733193895265
f1_score: 0.8357339105819667


In [7]:
model_nb = BernoulliNB()
model_nb.fit(X_train, y_train)
y_true_nb = y_test
y_pred_nb = model_nb.predict(X_test)
print('Wyniki dla NB:')
print(classification_report(y_true_nb, y_pred_nb))
print(f'cohen_kappa_score: {cohen_kappa_score(y_true_nb, y_pred_nb)}')
print(f'f1_score: {f1_score(y_true_nb, y_pred_nb)}')

Wyniki dla NB:
              precision    recall  f1-score   support

         0.0       0.80      0.46      0.58      2485
         1.0       0.80      0.95      0.87      5620

    accuracy                           0.80      8105
   macro avg       0.80      0.70      0.72      8105
weighted avg       0.80      0.80      0.78      8105

cohen_kappa_score: 0.46113858717212675
f1_score: 0.8672623883021934
