In [19]:
import pandas as pd
import numpy as np
from joblib import dump, load
from pymystem3 import Mystem
import string
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# Load the six previously saved per-label classifiers; the file names are
# listed in the same order as the variables they are bound to.
# joblib.load unpickles its input, so only point it at trusted files.
(
    clf_cheap,
    clf_expensive,
    clf_family,
    clf_friends,
    clf_kids,
    clf_romantic,
) = map(load, (
    'cheap.joblib',
    'expensive.joblib',
    'family.joblib',
    'friends.joblib',
    'kids.joblib',
    'romantic.joblib',
))

In [9]:
# Review-level input table; later cells read its `review_text` and `place_id` columns.
df = pd.read_csv('df_reviews.csv')

In [38]:
# Second input table; further down it is narrowed to a fixed column list and
# joined with the review-derived labels on `place_id`.
df_final = pd.read_csv('df_final.csv')

In [7]:
# Single shared Mystem instance; preprocess() calls m.lemmatize() on every text.
m = Mystem()

def preprocess(text):
    """Normalise one review string before vectorisation.

    Steps, in order: delete every character listed in ``string.punctuation``
    (ASCII punctuation only), lowercase the result, lemmatise it with the
    module-level Mystem instance ``m``, then glue the returned pieces back
    together and trim surrounding whitespace.

    Args:
        text: the raw review text (must be a ``str``).

    Returns:
        The cleaned, lemmatised text as a single string.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    lowered = text.translate(punctuation_table).lower()
    # lemmatize() returns a list of string pieces; join them verbatim, then strip.
    pieces = m.lemmatize(lowered)
    return ''.join(pieces).strip()

In [15]:
# Run preprocess() over every review; the cleaned text replaces the raw column.
df['review_text'] = df['review_text'].map(preprocess)

In [20]:
# Text Series fed to the vectorizer; note that `X` is rebound to the feature
# matrix by the fit_transform cell that follows.
X = df['review_text']

In [21]:
# Build TF-IDF features (unigrams + bigrams, at most 5000 terms) and densify them.
# NOTE(review): the vectorizer is fitted here, on the data being scored, while the
# classifiers are loaded from disk already trained. Unless this reproduces the exact
# vocabulary used at training time, the feature columns will not line up with what
# the models learned — confirm, or load the training-time vectorizer and call
# transform() instead of fit_transform().
# `X` is rebound from the text Series to the dense feature array.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000, min_df=1)
X = vectorizer.fit_transform(X).toarray()

In [27]:
# Score every review with each per-label classifier and store the prediction
# in the matching column (columns are created in the order listed).
for label_column, classifier in (
    ('is_cheap', clf_cheap),
    ('is_expensive', clf_expensive),
    ('family', clf_family),
    ('friends', clf_friends),
    ('kids', clf_kids),
    ('romantic', clf_romantic),
):
    df[label_column] = classifier.predict(X)

In [35]:
# Collapse the review-level rows to one row per place by averaging every
# numeric column (for the predicted label columns this is the per-place mean
# of the per-review predictions).
# numeric_only=True is required because the frame still holds the text column
# `review_text`: pandas >= 2.0 raises TypeError when mean() meets it, whereas
# older versions silently dropped it — so the result is unchanged there.
df = df.groupby('place_id').mean(numeric_only=True).reset_index()

In [36]:
# Binarise each label column: 1 when the per-place mean is >= 0.4, else 0.
# NaN compares False against the threshold and therefore becomes 0.
for label_column in ('is_cheap', 'is_expensive', 'family', 'friends', 'kids', 'romantic'):
    df[label_column] = df[label_column].ge(0.4).astype('int64')

In [39]:
# Spot-check one aggregated, thresholded row.
df.head(1)

Unnamed: 0,place_id,rating,is_cheap,is_expensive,family,friends,kids,romantic
0,ChIJ--I-0GytXkERJb2yYopqbZI,4.0,0,0,0,1,0,1


In [42]:
# Columns carried into the merged output, in output order; a missing column
# raises KeyError.
selected_columns = [
    'place_id',
    'name',
    'location_lat',
    'location_lng',
    'types',
    'rating',
    'total_reviews',
    'address',
    'is_amusement_park',
    'is_aquarium',
    'is_art_gallery',
    'is_bowling_alley',
    'is_library',
    'is_movie_theater',
    'is_mosque',
    'is_cafe',
    'is_church',
    'is_night_club',
    'is_park',
    'is_point_of_interest',
    'is_museum',
    'is_restaurant',
    'is_zoo',
]
df_final = df_final.loc[:, selected_columns]

In [43]:
# Preview the first rows of the narrowed frame.
df_final.head()

Unnamed: 0,place_id,name,location_lat,location_lng,types,rating,total_reviews,address,is_amusement_park,is_aquarium,...,is_movie_theater,is_mosque,is_cafe,is_church,is_night_club,is_park,is_point_of_interest,is_museum,is_restaurant,is_zoo
0,ChIJQcW8LT2tXkERLI6TYxvjLuk,Кырлай Парк Аттракционов,55.810723,49.100557,"amusement_park,point_of_interest,establishment",4.4,3201,"ул. Односторонка Гривки, 1а, Казань, Респ. Тат...",1,0,...,0,0,0,0,0,1,1,0,0,0
1,ChIJf-EDWfWtXkERPtZ19AJuNEo,ОстровОК Развлечений,55.780915,49.213188,"amusement_park,point_of_interest,establishment",4.1,24,"просп. Победы, 141, Казань, Респ. Татарстан, Р...",1,0,...,0,0,0,0,0,1,1,0,0,0
2,ChIJletSTBetXkERNkGZb55eYco,Лабиринт Страха,55.789572,49.117843,"amusement_park,point_of_interest,establishment",4.2,80,"ул. Баумана, 35, Казань, Респ. Татарстан, Росс...",1,0,...,0,0,0,0,0,1,1,0,0,0
3,ChIJG4vRqG-tXkERqDP1cCu1FA0,Вход В SkyPark,55.798793,49.147235,"amusement_park,point_of_interest,establishment",4.2,6,"Казань, Респ. Татарстан, Россия, 420015",1,0,...,0,0,0,0,0,1,1,0,0,0
4,ChIJKWiBn2auXkERiIykY7qB1nE,Дубрава Горки,55.733131,49.209045,"amusement_park,point_of_interest,establishment",4.7,91,"Респ. Татарстан, Россия, 420138",1,0,...,0,0,0,0,0,1,1,0,0,0


In [44]:
# Inner join (pandas' default `how`): only place_ids present in both frames survive.
# NOTE(review): both inputs appear to carry a `rating` column, so the result would
# hold `rating_x` (from df_final) and `rating_y` (from df) — confirm that whatever
# reads data.csv expects those names.
res = pd.merge(df_final, df, on='place_id', how='inner')

In [47]:
# Persist the merged table. With the default index=True the row index is
# written as an unnamed first column.
res.to_csv('data.csv')