In [18]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB
from sklearn.neighbors import KNeighborsClassifier

In [19]:
# Load the preprocessed review table and discard every row that contains a
# missing value in any column.
# The path uses a forward slash so it resolves on Windows, Linux and macOS
# alike; the previous backslash form only worked on Windows and relied on the
# invalid escape sequence "\p" being passed through unchanged.
data = pd.read_csv('data/preprocessed.csv').dropna()

In [20]:
# Restrict the data to rows flagged as American cuisine. The flag is read from
# two column labels that differ only by a leading space; a row is kept when
# either of them equals 1.
american_mask = (data["'American'"] == 1) | (data[" 'American'"] == 1)

# Features are the cleaned review texts, the target is the Category column.
X = data.loc[american_mask, "Reviews_cleaned"]
y = data.loc[american_mask, "Category"]

In [21]:
# Hold out 20% of the samples for testing; the split is stratified on the
# target and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the TF-IDF vocabulary and weights on the training texts only, then map
# both splits into that feature space. lowercase=False leaves the casing of
# the input texts untouched.
vectorizer = TfidfVectorizer(lowercase=False)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [22]:
# Logistic regression on the TF-IDF features, with a fixed random_state.
model_logreg = LogisticRegression(random_state=0)
model_logreg.fit(X_train, y_train)

# Reference labels and predictions for the held-out split.
y_true_logreg, y_pred_logreg = y_test, model_logreg.predict(X_test)

# Report: per-class precision/recall/F1, then Cohen's kappa and the F1 score
# (f1_score is called with its default arguments).
print('Wyniki dla regresji logistycznej:')
print(classification_report(y_true_logreg, y_pred_logreg))
kappa_logreg = cohen_kappa_score(y_true_logreg, y_pred_logreg)
print(f'cohen_kappa_score: {kappa_logreg}')
f1_logreg = f1_score(y_true_logreg, y_pred_logreg)
print(f'f1_score: {f1_logreg}')

Wyniki dla regresji logistycznej:
              precision    recall  f1-score   support

         0.0       0.73      0.66      0.69       170
         1.0       0.73      0.79      0.75       196

    accuracy                           0.73       366
   macro avg       0.73      0.72      0.72       366
weighted avg       0.73      0.73      0.73       366

cohen_kappa_score: 0.44733027301280504
f1_score: 0.7549019607843137


In [23]:
# k-nearest-neighbours classifier (k = 20) on the TF-IDF features.
model_knn = KNeighborsClassifier(n_neighbors=20)
model_knn.fit(X_train, y_train)

# Reference labels and predictions for the held-out split.
y_true_knn, y_pred_knn = y_test, model_knn.predict(X_test)

# Report: per-class precision/recall/F1, then Cohen's kappa and the F1 score
# (f1_score is called with its default arguments).
print('Wyniki dla KNN:')
print(classification_report(y_true_knn, y_pred_knn))
kappa_knn = cohen_kappa_score(y_true_knn, y_pred_knn)
print(f'cohen_kappa_score: {kappa_knn}')
f1_knn = f1_score(y_true_knn, y_pred_knn)
print(f'f1_score: {f1_knn}')

Wyniki dla KNN:
              precision    recall  f1-score   support

         0.0       0.63      0.64      0.64       170
         1.0       0.69      0.68      0.68       196

    accuracy                           0.66       366
   macro avg       0.66      0.66      0.66       366
weighted avg       0.66      0.66      0.66       366

cohen_kappa_score: 0.3194985905355965
f1_score: 0.6820512820512821


In [24]:
# Bernoulli naive Bayes with default hyperparameters on the TF-IDF features.
model_nb = BernoulliNB()
model_nb.fit(X_train, y_train)

# Reference labels and predictions for the held-out split.
y_true_nb, y_pred_nb = y_test, model_nb.predict(X_test)

# Report: per-class precision/recall/F1, then Cohen's kappa and the F1 score
# (f1_score is called with its default arguments).
print('Wyniki dla NB:')
print(classification_report(y_true_nb, y_pred_nb))
kappa_nb = cohen_kappa_score(y_true_nb, y_pred_nb)
print(f'cohen_kappa_score: {kappa_nb}')
f1_nb = f1_score(y_true_nb, y_pred_nb)
print(f'f1_score: {f1_nb}')

Wyniki dla NB:
              precision    recall  f1-score   support

         0.0       0.79      0.49      0.60       170
         1.0       0.67      0.89      0.76       196

    accuracy                           0.70       366
   macro avg       0.73      0.69      0.68       366
weighted avg       0.72      0.70      0.69       366

cohen_kappa_score: 0.38577367205542723
f1_score: 0.761487964989059


In [25]:
# Class balance check: non-null counts per Category value for the rows flagged
# as American cuisine (same filter as used to build X and y). The grouped
# frame is the cell's last expression so the notebook displays it.
american_rows = data.loc[(data["'American'"] == 1) | (data[" 'American'"] == 1)]
american_rows.groupby('Category').count()

Unnamed: 0_level_0,Unnamed: 0,City,Rating,Reviews,'Afghani','African','American','Arabic','Argentinean','Armenian',...,'Vegetarian Friendly','Venezuelan','Vietnamese','Welsh','Wine Bar','Xinjiang','Yunnan','Guatemalan','Latvian',Reviews_cleaned
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,849,849,849,849,849,849,849,849,849,849,...,849,849,849,849,849,849,849,849,849,849
1.0,977,977,977,977,977,977,977,977,977,977,...,977,977,977,977,977,977,977,977,977,977
