In [13]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB
from sklearn.neighbors import KNeighborsClassifier

In [14]:
data = pd.read_csv('data\preprocessed.csv').dropna()

In [15]:
X = data["Reviews_cleaned"].loc[(data["'European'"]==1)|(data[" 'European'"]==1)]
y = data["Category"].loc[(data["'European'"]==1)|(data[" 'European'"]==1)]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
vectorizer = TfidfVectorizer(lowercase=False)
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [17]:
model_logreg = LogisticRegression(random_state=0).fit(X_train, y_train)
y_true_logreg = y_test
y_pred_logreg = model_logreg.predict(X_test)
print('Wyniki dla regresji logistycznej:')
print(classification_report(y_true_logreg, y_pred_logreg))
print(f'cohen_kappa_score: {cohen_kappa_score(y_true_logreg, y_pred_logreg)}')
print(f'f1_score: {f1_score(y_true_logreg, y_pred_logreg)}')

Wyniki dla regresji logistycznej:
              precision    recall  f1-score   support

         0.0       0.80      0.36      0.49       672
         1.0       0.82      0.97      0.89      2025

    accuracy                           0.82      2697
   macro avg       0.81      0.66      0.69      2697
weighted avg       0.81      0.82      0.79      2697

cohen_kappa_score: 0.40024249739843065
f1_score: 0.8885371919511642


In [18]:
model_knn = KNeighborsClassifier(n_neighbors=20)
model_knn.fit(X_train, y_train)
y_true_knn = y_test
y_pred_knn = model_knn.predict(X_test)
print('Wyniki dla KNN:')
print(classification_report(y_true_knn, y_pred_knn))
print(f'cohen_kappa_score: {cohen_kappa_score(y_true_knn, y_pred_knn)}')
print(f'f1_score: {f1_score(y_true_knn, y_pred_knn)}')

Wyniki dla KNN:
              precision    recall  f1-score   support

         0.0       0.67      0.34      0.45       672
         1.0       0.81      0.94      0.87      2025

    accuracy                           0.79      2697
   macro avg       0.74      0.64      0.66      2697
weighted avg       0.78      0.79      0.77      2697

cohen_kappa_score: 0.33931186715696093
f1_score: 0.8725445408862493


In [19]:
model_nb = BernoulliNB()
model_nb.fit(X_train, y_train)
y_true_nb = y_test
y_pred_nb = model_nb.predict(X_test)
print('Wyniki dla NB:')
print(classification_report(y_true_nb, y_pred_nb))
print(f'cohen_kappa_score: {cohen_kappa_score(y_true_nb, y_pred_nb)}')
print(f'f1_score: {f1_score(y_true_nb, y_pred_nb)}')

Wyniki dla NB:
              precision    recall  f1-score   support

         0.0       0.81      0.37      0.50       672
         1.0       0.82      0.97      0.89      2025

    accuracy                           0.82      2697
   macro avg       0.82      0.67      0.70      2697
weighted avg       0.82      0.82      0.79      2697

cohen_kappa_score: 0.41383890785535804
f1_score: 0.8906992532247116


In [20]:
data.loc[(data["'European'"]==1)|(data[" 'European'"]==1)].groupby('Category').count()

Unnamed: 0_level_0,Unnamed: 0,City,Rating,Reviews,'Afghani','African','American','Arabic','Argentinean','Armenian','Asian','Australian','Austrian','Balti','Bangladeshi','Bar','Barbecue','Belgian','Brazilian','Brew Pub','British','Cafe','Cajun & Creole','Cambodian','Canadian','Caribbean','Caucasian','Central American','Central Asian','Central European','Chilean','Chinese','Colombian','Contemporary','Croatian','Cuban','Czech','Danish','Delicatessen','Diner','Dutch','Eastern European','Ecuadorean','Ethiopian','European','Fast Food','Filipino','French','Fusion','Gastropub','Georgian','German','Gluten Free Options','Greek','Grill','Halal','Hawaiian','Healthy','Hungarian','Indian','Indonesian','International','Irish','Israeli','Italian','Jamaican','Japanese','Korean','Kosher','Latin','Lebanese','Malaysian','Mediterranean','Mexican','Middle Eastern','Mongolian','Moroccan','Nepali','New Zealand','Norwegian','Pakistani','Persian','Peruvian','Pizza','Polish','Portuguese','Pub','Romanian','Russian','Salvadoran','Scandinavian','Scottish','Seafood','Singaporean','Slovenian','Soups','South American','Southwestern','Spanish','Sri Lankan','Steakhouse','Street Food','Sushi','Swedish','Swiss','Taiwanese','Thai','Tibetan','Tunisian','Turkish','Ukrainian','Uzbek','Vegan Options','Vegetarian Friendly','Venezuelan','Vietnamese','Wine Bar','Afghani','African','Albanian','American','Arabic','Argentinean','Armenian','Asian','Australian','Austrian','Azerbaijani','Balti','Bangladeshi','Bar','Barbecue','Belgian','Brazilian','Brew Pub','British','Burmese','Cafe','Cajun & Creole','Cambodian','Canadian','Caribbean','Caucasian','Central American','Central Asian','Central European','Chilean','Chinese','Colombian','Contemporary','Croatian','Cuban','Czech','Danish','Delicatessen','Diner','Dutch','Eastern European','Ecuadorean','Egyptian','Ethiopian','European','Fast Food','Filipino','French','Fujian','Fusion','Gastropub','Georgian','German','Gluten Free Options','Greek','Grill','Halal','Hawaiian','Healthy','Hungarian','Indian','Indonesian','International','Irish','Israeli','Italian','Jamaican','Japanese','Korean','Kosher','Latin','Lebanese','Malaysian','Mediterranean','Mexican','Middle Eastern','Minority Chinese','Mongolian','Moroccan','Native American','Nepali','New Zealand','Norwegian','Pakistani','Persian','Peruvian','Pizza','Polish','Polynesian','Portuguese','Pub','Romanian','Russian','Salvadoran','Scandinavian','Scottish','Seafood','Singaporean','Slovenian','Soups','South American','Southwestern','Spanish','Sri Lankan','Steakhouse','Street Food','Sushi','Swedish','Swiss','Taiwanese','Thai','Tibetan','Tunisian','Turkish','Ukrainian','Uzbek','Vegan Options','Vegetarian Friendly','Venezuelan','Vietnamese','Welsh','Wine Bar','Xinjiang','Yunnan','Guatemalan','Latvian',Reviews_cleaned
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1
0.0,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361,3361
1.0,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122,10122
