In [4]:
import json
import pandas as pd
import imblearn

In [5]:
with open("finalData2.json", "r") as f:
    data = json.load(f)

In [6]:
# each row of DF:
# - restaurant
# - text of all reviews together?
# - columns for each ambiance label, 1 or 0

df = pd.DataFrame(columns=['restaurant', 'reviews', 'touristy', 'classy', 'romantic', 'casual', 
                          'hipster', 'divey', 'intimate', 'trendy', 'upscale'])
all_empty = pd.DataFrame(columns=['restaurant', 'reviews', 'touristy', 'classy', 'romantic', 'casual', 
                          'hipster', 'divey', 'intimate', 'trendy', 'upscale'])

In [7]:
import nltk
from nltk.corpus import stopwords
stop_words = ["floor", "restaurant", "owner", "food", "counter", "windy", "radius", "ingredients", "hours", "person", 
             "review", "people", "everybody", "eat", "ate", "plate", "plated", "order", "ordered", "today"]
stop_words = stopwords.words("english") + stop_words

In [8]:
import re

In [9]:
for restaurant, restaurant_dict in data['BOSTON'].items():
    ambiances = restaurant_dict['ambience']
    review_texts = [r['text'] for r in restaurant_dict['reviews']]
    review_combined = '\n'.join(review_texts)
    new_row = dict()
    new_row['restaurant'] = restaurant
    new_row['reviews'] = review_combined
    for label in ['touristy', 'classy', 'romantic', 'casual', 'hipster', 'divey', 'intimate', 'trendy', 'upscale']:
        if label in ambiances:
            new_row[label] = 1
        else:
            new_row[label] = 0
    if len(ambiances) == 0:
        all_empty = all_empty.append(new_row, ignore_index=True)
    else:
        df = df.append(new_row, ignore_index=True)

In [27]:
manual = pd.read_csv("manual_data.csv")
df = pd.concat([df, manual, manual, manual], axis=0, ignore_index=True)

In [28]:
df

Unnamed: 0,restaurant,reviews,touristy,classy,romantic,casual,hipster,divey,intimate,trendy,upscale
0,Legal Sea Foods,legal since legal serving fresh started fish m...,1,0,0,0,0,0,0,0,0
1,Amrheins,seated within mins upon arrival big menu posit...,0,0,0,1,0,0,0,0,0
2,Fiouna's Persian Fusion Cuisine,one star like writing line el pelon super said...,0,0,0,1,0,0,0,0,0
3,Code 10,reason giving place customer service delivery ...,0,0,0,1,0,0,0,0,0
4,Shenannigans Bar,pay cover go bar pigs one flying pigs lands co...,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1385,manual,this bar has very good beer,0,0,0,1,0,1,0,0,0
1386,manual,this is a good dive restaurant,0,0,0,0,0,1,0,0,0
1387,manual,this is a divey restaurant,0,0,0,0,0,1,0,0,0
1388,manual,this restaurant is very trendy,0,0,0,0,0,0,0,1,0


In [29]:
df['reviews'] = df['reviews'].apply(lambda x: " ".join([t.lower() for t in x.split() if t.lower() not in stop_words and bool(re.match(r"^[a-zA-Z]+$", t))]))

In [30]:
for col in ['touristy', 'classy', 'romantic', 'casual', 'hipster', 'divey', 'intimate', 'trendy', 'upscale']:
    df[col] = df[col].astype(int)
    all_empty[col] = all_empty[col].astype(int)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [14]:
from imblearn.over_sampling import SMOTE

In [15]:
vec = TfidfVectorizer()

In [31]:
X = df['reviews']
Y = df[['touristy', 'classy', 'romantic', 'casual', 'hipster', 'divey', 'intimate', 'trendy', 'upscale']]
X_features = pd.DataFrame(vec.fit_transform(X).toarray())

In [32]:
len(vec.get_feature_names())

17054

In [33]:
names = vec.get_feature_names()

In [34]:
# Xtrain_features['stars'] = X_train['stars'].reset_index()['stars']
# Xtrain_features['price'] = X_train['price'].reset_index()['price']
# Xtest_features['stars'] = X_test['stars'].reset_index()['stars']
# Xtest_features['price'] = X_test['price'].reset_index()['price']

In [35]:
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import balanced_accuracy_score, make_scorer
# param_grid = {'C': [0.1, 1, 10, 100], 
#               'gamma': [1, 0.1, 0.01, 0.001],
#               'kernel': ['rbf', 'linear']} 
# param_grid = {'C': [0.1], 
#               'gamma': [1],
#               'kernel': ['rbf']} 
# param_grid = {'C': [1, 0.5, 0.25, 0.1, 0.01, 0.001]}
# balanced_scorer = make_scorer(balanced_accuracy_score)

models = dict()
lr = LogisticRegression(C=1)
# grid = GridSearchCV(lr, param_grid, refit = True, verbose = 0, scoring="f1_weighted")
for col in ['touristy', 'classy', 'romantic', 'casual', 'hipster', 'divey', 'intimate', 'trendy', 'upscale']:
    X = X_features
    y = Y[col]
    print("Label: ", col)
    X_sm, y_sm = SMOTE(random_state=42).fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=42)
    model = lr
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(classification_report(y_test, preds))
    print(confusion_matrix(y_test, preds))
    models[col] = model
    importance = model.coef_[0]
    importances = sorted([(i, v) for i, v in enumerate(importance)], key=lambda x: -x[1])
    for i,v in importances[:10]:
        print(f'Feature: {names[i]}, Score: {v}')
    print("")

Label:  touristy
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       412
           1       1.00      1.00      1.00       402

    accuracy                           1.00       814
   macro avg       1.00      1.00      1.00       814
weighted avg       1.00      1.00      1.00       814

[[410   2]
 [  0 402]]
Feature: toursity, Score: 4.63811920788879
Feature: clam, Score: 4.0130204057029175
Feature: things, Score: 2.839765007176623
Feature: chowder, Score: 2.747198355742684
Feature: lobster, Score: 2.630380275270428
Feature: new, Score: 2.541132241484878
Feature: quincy, Score: 2.525453907889414
Feature: bar, Score: 2.0898294747252706
Feature: nespresso, Score: 1.8059457993652168
Feature: market, Score: 1.7047277000988292

Label:  classy
              precision    recall  f1-score   support

           0       0.75      0.83      0.79       253
           1       0.81      0.73      0.77       256

    accuracy                   

In [None]:
all_empty_features = pd.DataFrame(vec.transform(all_empty['reviews']).toarray())

In [None]:
all_empty

In [None]:
for index, row in all_empty[df['classy'] == 1].iterrows():
    print("Restaurant: " + row['restaurant'])
    print(row['reviews'])
    print("\n")