In [1]:
import pandas as pd
import re
import numpy as np
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

RANDOM_SEED = 322
!pip freeze > requirements.txt

#импортировали исходные данные
df = pd.read_csv('main_task.csv')
df_kaggle = pd.read_csv('kaggle_task.csv')


In [2]:
#функции обработки

#считаем % пропусков по каждому признаку
#функция missing_data(data) будет принимать на вход датасет, и определять % пропусков и тип данных
def missing_data(data):
    for i in data.columns:
        md = 100-data[i].isna().value_counts()[0]*100/len(data[i])
        print('% пропусков в',i, md)
    return

#функция конвертации строковых данных в столбце Price Range, на фход идет серия, на выходе числовые значения 
#ghbpyfrf ценовой категории
def price_cat(data):
    if data == '$':
        return 1
    elif data == '$$ - $$$':
        return 2
    elif data == '$$$$':
        return 3

#функция которая получает на вход серию со списком всех кухонь и убирает аартефакты в названиях,
#формируя список 
def resplit(ct):
    ct = ct.replace(", ",'')
    ct = ct.replace("[",'')
    ct = ct.replace("]",'')
    ct = re.split("'(.*?)'",ct)
    ct = list(filter(None, ct))
    return ct

#функция разбивки типов кухонь , нужная для выполенния заданий в модуле
def split_cuisine(ct):
    if re.findall(r"'(.*?)'", ct) == 'Unknown':
        return 'Unknown'
    else:
        return  re.findall(r"'(.*?)'", ct)

#простая функция длинны списка
def len_list(data):
    return len(data)


# функиця разделения даты из отзывов
# тип данных строка, визуально похожая на списки
# необходимо разпарсить данные
# кроме того , в строке есть дата отзыва, ее тоже надо достать
# кол-во дат будет равно кол-ву отзывов

def split_date(ct):
    if re.findall(r'\d\d/\d\d/\d{4}', ct) == 'None':
        return dt.datetime(1900,12,31)
    else:
        return  re.findall(r'\d\d/\d\d/\d{4}', ct)
    
# функция возвращает разницу между первым и последним отзывом в формате datetime
def deltadays(rd):
    if len(rd)>=2:
        a = dt.datetime.strptime(rd[0], '%m/%d/%Y') - dt.datetime.strptime(rd[len(rd)-1], '%m/%d/%Y')
        return a
    else:
        return dt.timedelta(0)

def learn_fit(df, random_state = None):
    #X - данные о ресторанах, Y - целевая переменная
    X = df.drop(['Rating', 'Restaurant_id'], axis = 1)
    y = df['Rating']
    
    # Воспользуемся специальной функцие train_test_split для разбивки тестовых данных
    # выделим 20% данных на валидацию (параметр test_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_SEED)
    
    #создаем модель
    regr = RandomForestRegressor(n_estimators=100, verbose=0, n_jobs=-1, random_state=RANDOM_SEED)
    #обучение на тесте
    regr.fit(X_train, y_train)
    
    #используем полученную (обученную модель) для предсказания рейтинга в тестовой сборке
    #полученное значение пишем в отдульную переменную
    y_predict = regr.predict(X_test)
    
    #сравниваем предсказанное значение y_predict с тестовым y_test
    print('MAE Regression:', metrics.mean_absolute_error(y_test,y_predict))
    print('MAE Regression round state:', metrics.mean_absolute_error(y_test,np.round(y_predict*2)/2))
    
    #классификация
    le = LabelEncoder()
    le.fit(y_train)
    classif = RandomForestClassifier(n_estimators = 100, random_state=RANDOM_SEED, n_jobs=-1)
    classif.fit(X_train, le.transform(y_train))
    y_predict = classif.predict(X_test)
    print('MAE Классификация:', metrics.mean_absolute_error(y_test,le.inverse_transform(y_predict)))

In [3]:
missing_data(df)

% пропусков в Restaurant_id 0.0
% пропусков в City 0.0
% пропусков в Cuisine Style 23.207499999999996
% пропусков в Ranking 0.0
% пропусков в Rating 0.0
% пропусков в Price Range 34.715
% пропусков в Number of Reviews 6.357500000000002
% пропусков в Reviews 0.0
% пропусков в URL_TA 0.0
% пропусков в ID_TA 0.0


In [4]:
df.columns

Index(['Restaurant_id', 'City', 'Cuisine Style', 'Ranking', 'Rating',
       'Price Range', 'Number of Reviews', 'Reviews', 'URL_TA', 'ID_TA'],
      dtype='object')

In [5]:
drop_columns = ['City','Cuisine Style', 'Price Range', 'Reviews', 'URL_TA', 'ID_TA']
print('Смотрим на предикт с базовыми признаками до очистки и генерации новых\n')
learn_fit(df.drop(columns = drop_columns).dropna(), random_state=322)

Смотрим на предикт с базовыми признаками до очистки и генерации новых

MAE Regression: 0.42992353621640866
MAE Regression round state: 0.4139882541377469
MAE Классификация: 0.48723972237052854


In [6]:
#удаляем признак как не влияющий на качество 

#df.drop(columns=['ID_TA'], inplace=True)
#df_kaggle.drop(columns=['ID_TA'], inplace=True)


In [7]:
# City

df.groupby('City').Rating.agg(['mean','std','count']).sort_values(by=['mean'], ascending=False)

Unnamed: 0_level_0,mean,std,count
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rome,4.230269,0.442161,2078
Athens,4.230096,0.547579,628
Oporto,4.178363,0.594457,513
Krakow,4.146727,0.638973,443
Amsterdam,4.131215,0.567598,1086
Berlin,4.124594,0.644704,2155
Ljubljana,4.112022,0.615521,183
Budapest,4.090074,0.688316,816
Warsaw,4.089409,0.656802,727
Edinburgh,4.088087,0.672541,596


In [8]:
df.head(5)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,id_1535,Stockholm,,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781
3,id_3456,Berlin,,3458.0,5.0,,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963


In [9]:
#импортируем данные по городам мира, и заберем оттуда только столицы (признак primary)
cities = pd.read_csv('worldcities.csv')
cities = cities[cities['capital']=='primary']

#уберем ненужную информацию и сформируем список столиц мира, хотя нам надо только европу, но все же
cities.drop(['city_ascii', 'lat','lng','capital','id','admin_name'], inplace=True, axis=1)
cities_list=list(cities['city'])

#вводим новый признак Столица, 1 если город столичный, 0 если не столичный
df['capital']=df['City'].apply(lambda x: 1 if x in cities_list else 0 )
df_kaggle['capital']=df['City'].apply(lambda x: 1 if x in cities_list else 0 )
display(df)

drop_columns = ['City','Cuisine Style', 'Price Range', 'Reviews', 'URL_TA','ID_TA']
print('Смотрим на предикт с базовыми признаками до очистки и генерации новых\n')
learn_fit(df.drop(columns = drop_columns).dropna(), random_state=322)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,capital
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643,1
1,id_1535,Stockholm,,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032,1
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781,1
3,id_3456,Berlin,,3458.0,5.0,,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776,1
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963,0
...,...,...,...,...,...,...,...,...,...,...,...
39995,id_499,Milan,"['Italian', 'Vegetarian Friendly', 'Vegan Opti...",500.0,4.5,$$ - $$$,79.0,"[['The real Italian experience!', 'Wonderful f...",/Restaurant_Review-g187849-d2104414-Reviews-Ro...,d2104414,0
39996,id_6340,Paris,"['French', 'American', 'Bar', 'European', 'Veg...",6341.0,3.5,$$ - $$$,542.0,"[['Parisian atmosphere', 'Bit pricey but inter...",/Restaurant_Review-g187147-d1800036-Reviews-La...,d1800036,1
39997,id_1649,Stockholm,"['Japanese', 'Sushi']",1652.0,4.5,,4.0,"[['Good by swedish standards', 'A hidden jewel...",/Restaurant_Review-g189852-d947615-Reviews-Sus...,d947615,1
39998,id_640,Warsaw,"['Polish', 'European', 'Eastern European', 'Ce...",641.0,4.0,$$ - $$$,70.0,"[['Underground restaurant', 'Oldest Restaurant...",/Restaurant_Review-g274856-d1100838-Reviews-Ho...,d1100838,1


Смотрим на предикт с базовыми признаками до очистки и генерации новых

MAE Regression: 0.40157955088602443
MAE Regression round state: 0.3833956219967966
MAE Классификация: 0.44858515750133476


In [10]:
#попробуем добавить численность населения
city_list=df['City'].unique()
city_list={'Paris':2.148, 'Stockholm':0.975, 'London':8.961, 'Berlin':3.669, 'Munich':1.488, 'Oporto':0.287,
       'Milan':1.1399, 'Bratislava':0.428, 'Vienna':1.921, 'Rome':2.790, 'Barcelona':1.664, 'Madrid':3.334,
       'Dublin':1.173, 'Brussels':1.212, 'Zurich':0.434, 'Warsaw':1.793, 'Budapest':1.750, 'Copenhagen':0.794,
       'Amsterdam':0.860, 'Lyon':0.518, 'Hamburg':1.845, 'Lisbon':0.506, 'Prague':1.335, 'Oslo':0.693,
       'Helsinki':0.655, 'Edinburgh':0.488, 'Geneva':0.201, 'Ljubljana':0.286, 'Athens':0.664,
       'Luxembourg':0.644, 'Krakow':0.779}
df['population'] = df.City.replace(to_replace=city_list)
df_kaggle['population'] = df.City.replace(to_replace=city_list)


In [11]:
#проверяем модель

In [12]:
drop_columns = ['City','Cuisine Style', 'Price Range', 'Reviews', 'URL_TA','ID_TA']
print('Смотрим на предикт с базовыми признаками до очистки и генерации новых\n')
learn_fit(df.drop(columns = drop_columns).dropna(), random_state=322)

Смотрим на предикт с базовыми признаками до очистки и генерации новых

MAE Regression: 0.2298718633208756
MAE Regression round state: 0.20480512546716498
MAE Классификация: 0.2218366257341164


In [13]:
#Отлично!!! модель улучшена в 2 раза, играем дальше


In [14]:
#узнаем модальное значение для категории Price Range
#rjnjhsv позже заполним пропуски
print('Модубльное значение для категории Price Range',df['Price Range'].mode())

df['Price Range'].fillna(2, inplace=True, axis=0)
range_replace={'$':1,'$$ - $$$':2,'$$$$':3}
df['Price Range']=df['Price Range'].replace(to_replace=range_replace)

#делаем тоже самое с кэгл
df_kaggle['Price Range']=df_kaggle['Price Range'].replace(to_replace=range_replace)
df_kaggle['Price Range'].fillna(2, inplace=True, axis=0)

display(df_kaggle.sample(15))
missing_data(df)
#ghjdthztv модель , уже не удаляем столбец Price Range т.к. мы его изменили 
drop_columns = ['City','Cuisine Style', 'Reviews', 'URL_TA','ID_TA']
print('Смотрим на предикт с базовыми признаками до очистки и генерации новых\n')
learn_fit(df.drop(columns = drop_columns).dropna(), random_state=322)

Модубльное значение для категории Price Range 0    $$ - $$$
dtype: object


Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,capital,population
9258,id_9258,Munich,"['Mediterranean', 'European', 'Greek']",600.0,2.0,57.0,"[['Good food and exceptional service', 'Best G...",/Restaurant_Review-g187309-d2289781-Reviews-Te...,d2289781,1,8.961
9963,id_9963,London,"['Caribbean', 'African']",11966.0,2.0,8.0,"[['relaxed atmosphere with great food', 'Food ...",/Restaurant_Review-g186338-d3369764-Reviews-He...,d3369764,1,1.335
639,id_639,Berlin,,3433.0,2.0,,"[['Great regional and seasonal food'], ['06/25...",/Restaurant_Review-g187323-d10054994-Reviews-H...,d10054994,1,2.79
4198,id_4198,Paris,['Vietnamese'],10486.0,1.0,17.0,[['Great place to grab a bowl of Pho on a Sun....,/Restaurant_Review-g187147-d4880549-Reviews-Pa...,d4880549,0,1.845
2707,id_2707,Vienna,"['Bar', 'Pub', 'Italian', 'European']",2982.0,2.0,12.0,"[['Good place', 'Great people watching'], ['10...",/Restaurant_Review-g190454-d1007038-Reviews-Co...,d1007038,1,0.644
7469,id_7469,London,"['Asian', 'Vegetarian Friendly', 'Vegan Option...",2426.0,2.0,254.0,"[['Decent... very decent', 'Wag my wallet'], [...",/Restaurant_Review-g186338-d1019711-Reviews-Wa...,d1019711,1,8.961
3356,id_3356,Luxembourg,"['Italian', 'Mediterranean', 'European', 'Vege...",195.0,2.0,137.0,"[['Really good restaurant', 'Decent restaurant...",/Restaurant_Review-g190356-d1067313-Reviews-Il...,d1067313,1,2.148
3196,id_3196,Milan,"['Chinese', 'Japanese', 'Sushi', 'Asian', 'Fus...",1279.0,2.0,50.0,"[['Once a week'], ['04/30/2017']]",/Restaurant_Review-g187849-d12130862-Reviews-G...,d12130862,1,3.334
48,id_48,Athens,['Greek'],1077.0,2.0,3.0,"[['Thomas'], ['01/06/2016']]",/Restaurant_Review-g189400-d8747125-Reviews-Yo...,d8747125,0,1.664
7753,id_7753,Barcelona,"['Spanish', 'Asian', 'Fusion', 'Eastern Europe...",3286.0,2.0,9.0,"[[], []]",/Restaurant_Review-g187497-d13174128-Reviews-O...,d13174128,1,0.86


% пропусков в Restaurant_id 0.0
% пропусков в City 0.0
% пропусков в Cuisine Style 23.207499999999996
% пропусков в Ranking 0.0
% пропусков в Rating 0.0
% пропусков в Price Range 0.0
% пропусков в Number of Reviews 6.357500000000002
% пропусков в Reviews 0.0
% пропусков в URL_TA 0.0
% пропусков в ID_TA 0.0
% пропусков в capital 0.0
% пропусков в population 0.0
Смотрим на предикт с базовыми признаками до очистки и генерации новых

MAE Regression: 0.22897116924719696
MAE Regression round state: 0.20352375867592098
MAE Классификация: 0.22754938601174587


In [15]:
#ничего не сильно не изменилось, кроме модели с округлением, стало даже чуть хуже
#думаю этот признак можно удалить

In [16]:
#Попробуем поиграть с кухнями

#добавили количество кухонь в ресторане
#cuisine = pd.read_csv('cuisine.csv')
#df['col_cuisine'] = cuisine['col_cuisine'].apply(resplit)
#df['col_cuisine'] = df['col_cuisine'].apply(len)

#drop_columns = ['City','Cuisine Style', 'Price Range', 'Reviews', 'URL_TA','ID_TA']
#print('Смотрим на предикт с базовыми признаками до очистки и генерации новых\n')
#learn_fit(df.drop(columns = drop_columns).dropna(), random_state=322)
#mean_cuisine=df.groupby('City').col_cuisine.agg('mean').to_dict()
#mean_cuisine['Stockholm']
#df['Cuisine Style'] = df['Cuisine Style'].map(mean_cuisine)
#df['Cuisine Style'] = df['City'].replace(to_replace=mean_cuisine)
#df_kaggle['Cuisine Style'] = df['City'].replace(to_replace=mean_cuisine)


#df.sample(15)
#missing_data(df)

#drop_columns = ['City','Number of Reviews', 'URL_TA', 'ID_TA', 'Reviews']
#print('Смотрим на предикт с базовыми признаками до очистки и генерации новых\n')
#learn_fit(df.drop(columns = drop_columns).dropna(), random_state=322)

#модель ухудшилась, убираем этот признак !!! 

In [17]:
df.head(4)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,capital,population
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,2,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643,1,2.148
1,id_1535,Stockholm,,1537.0,4.0,2,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032,1,0.975
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,3,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781,1,8.961
3,id_3456,Berlin,,3458.0,5.0,2,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776,1,3.669


In [18]:
#есть небольшое улучшение - изменим и тестовую

#df

#df3=df[['URL_TA','col_cuisine']]

#df4 = pd.merge(df_kaggle, df3, how = 'left')
#display(df4)
#missing_data(df4)
#display(df[df['Restaurant_id']=='id_0'])
#display(df_kaggle[df_kaggle['Restaurant_id']=='id_0'])
#df_kaggle['col_cuisine'] = cuisine['col_cuisine'].apply(resplit)
#df_kaggle['col_cuisine'] = df['col_cuisine'].apply(len)

Смотрим на предикт с базовыми признаками до очистки и генерации новых

MAE Regression: 0.22922744260544584
MAE Regression round state: 0.20197544046983448
MAE Классификация: 0.21238654564869194


Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,capital,population,city_restaurants,ranking_norm
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,2,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643,1,2.148,4897,1.137431
1,id_1535,Stockholm,,1537.0,4.0,2,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032,1,0.975,820,1.87439
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,3,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781,1,8.961,5757,0.061317


In [20]:
#отличный результат , повысили еще точность
#сделаем тоже и на kaggle

per_city_dict = dict(df_kaggle['City'].value_counts())
per_city_dict
df_kaggle['city_restaurants'] = df_kaggle.City.map(per_city_dict)
# Добавим признак - нормализованный ранг ranking_norm
df_kaggle['ranking_norm'] = df_kaggle.Ranking / df_kaggle.city_restaurants
df_kaggle.head(3)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,capital,population,city_restaurants,ranking_norm
0,id_0,Paris,"['Bar', 'Pub']",12963.0,2.0,4.0,"[[], []]",/Restaurant_Review-g187147-d10746918-Reviews-L...,d10746918,1,2.148,1211,10.704377
1,id_1,Helsinki,"['European', 'Scandinavian', 'Gluten Free Opti...",106.0,2.0,97.0,"[['Very good reviews!', 'Fine dining in Hakani...",/Restaurant_Review-g189934-d6674944-Reviews-Ra...,d6674944,1,0.975,99,1.070707
2,id_2,Edinburgh,['Vegetarian Friendly'],810.0,2.0,28.0,"[['Better than the Links', 'Ivy Black'], ['12/...",/Restaurant_Review-g186525-d13129638-Reviews-B...,d13129638,1,8.961,149,5.436242


In [23]:
#display(df.head(3))
#display(df_kaggle.head(3))


price_in_city_dict = df.groupby('City')['Price Range'].mean().to_dict()
df['Price in City'] = df['City'].map(price_in_city_dict)
df



  # ПОПРОБУЕМ еще немного добавить признаков
    
#missing_data(df_kaggle)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,capital,population,city_restaurants,ranking_norm,Price in City
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,2,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643,1,2.1480,4897,1.137431,1.930366
1,id_1535,Stockholm,,1537.0,4.0,2,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032,1,0.9750,820,1.874390,1.969512
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,3,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781,1,8.9610,5757,0.061317,1.859128
3,id_3456,Berlin,,3458.0,5.0,2,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776,1,3.6690,2155,1.604640,1.869606
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,2,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963,0,1.4880,893,0.695409,1.944009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,id_499,Milan,"['Italian', 'Vegetarian Friendly', 'Vegan Opti...",500.0,4.5,2,79.0,"[['The real Italian experience!', 'Wonderful f...",/Restaurant_Review-g187849-d2104414-Reviews-Ro...,d2104414,0,1.1399,2133,0.234412,1.842944
39996,id_6340,Paris,"['French', 'American', 'Bar', 'European', 'Veg...",6341.0,3.5,2,542.0,"[['Parisian atmosphere', 'Bit pricey but inter...",/Restaurant_Review-g187147-d1800036-Reviews-La...,d1800036,1,2.1480,4897,1.294874,1.930366
39997,id_1649,Stockholm,"['Japanese', 'Sushi']",1652.0,4.5,2,4.0,"[['Good by swedish standards', 'A hidden jewel...",/Restaurant_Review-g189852-d947615-Reviews-Sus...,d947615,1,0.9750,820,2.014634,1.969512
39998,id_640,Warsaw,"['Polish', 'European', 'Eastern European', 'Ce...",641.0,4.0,2,70.0,"[['Underground restaurant', 'Oldest Restaurant...",/Restaurant_Review-g274856-d1100838-Reviews-Ho...,d1100838,1,1.7930,727,0.881706,1.885832


In [22]:
#делаем dummy
display(df.sample(5))
display(df_kaggle.sample(5))
df = pd.get_dummies(df, columns=[ 'City',], dummy_na=True)
df_kaggle = pd.get_dummies(df_kaggle, columns=[ 'City',], dummy_na=True)


Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,capital,population,city_restaurants,ranking_norm
32429,id_203,Luxembourg,"['Bar', 'Pub', 'International', 'European', 'B...",204.0,4.0,2,41.0,"[['Good ratio between quality and price', 'Goo...",/Restaurant_Review-g190356-d7278493-Reviews-Os...,d7278493,1,0.644,210,0.971429
10189,id_2019,Paris,"['French', 'Cafe', 'European']",2020.0,4.0,2,286.0,"[['Nice place to dine', 'Great cocktails !'], ...",/Restaurant_Review-g187147-d781957-Reviews-Caf...,d781957,1,2.148,4897,0.412497
339,id_3998,Rome,"['Italian', 'Pizza', 'Mediterranean']",3999.0,4.0,2,137.0,"[['Local restaurant, quite good', 'Lunch and d...",/Restaurant_Review-g187791-d2323613-Reviews-Ri...,d2323613,1,2.79,2078,1.924447
8316,id_5862,Berlin,"['German', 'European']",5864.0,3.5,2,36.0,"[['Average', 'Nice food'], ['03/18/2017', '02/...",/Restaurant_Review-g187323-d1793905-Reviews-Br...,d1793905,1,3.669,2155,2.721114
26646,id_1483,Hamburg,['Vegetarian Friendly'],1485.0,3.5,2,40.0,[['Very nice Family Dinner in a good atmosphe....,/Restaurant_Review-g187331-d8587344-Reviews-We...,d8587344,0,1.845,949,1.564805


Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,capital,population,city_restaurants,ranking_norm
8891,id_8891,Lisbon,,2341.0,2.0,3.0,"[[], []]",/Restaurant_Review-g189158-d10047504-Reviews-S...,d10047504,1,3.334,347,6.746398
895,id_895,Berlin,,6342.0,2.0,,"[[], []]",/Restaurant_Review-g187323-d9743120-Reviews-Fu...,d9743120,0,1.1399,550,11.530909
2014,id_2014,Hamburg,"['Swiss', 'European']",791.0,2.0,9.0,"[['A great invention - Macaron ice cream!', 'O...",/Restaurant_Review-g187331-d8741635-Reviews-Jo...,d8741635,0,1.845,237,3.337553
2734,id_2734,London,"['Mediterranean', 'Spanish']",4547.0,2.0,88.0,"[['Tapas.', 'Very Bad & Very Expensive'], ['10...",/Restaurant_Review-g186338-d7596031-Reviews-Co...,d7596031,1,1.921,1436,3.166435
5792,id_5792,Berlin,"['Japanese', 'Sushi', 'Asian', 'Vietnamese', '...",2279.0,2.0,10.0,"[['Great!', 'Amazing sushi'], ['08/20/2017', '...",/Restaurant_Review-g187323-d12151346-Reviews-S...,d12151346,1,3.334,550,4.143636


In [23]:
drop_columns = [ 'Cuisine Style', 'Price Range', 'Reviews', 'URL_TA','ID_TA']
print('Смотрим на предикт с базовыми признаками до очистки и генерации новых\n')
learn_fit(df.drop(columns = drop_columns).dropna(), random_state=322)

Смотрим на предикт с базовыми признаками до очистки и генерации новых

MAE Regression: 0.22606460224239186
MAE Regression round state: 0.2016550987720235
MAE Классификация: 0.21398825413774694


In [26]:
# ВАЖНО! дря корректной обработки признаков объединяем трейн и тест в один датасет
df['sample'] = 1 # помечаем где у нас трейн
df_kaggle['sample'] = 0 # помечаем где у нас тест
df_kaggle['Rating'] = 0 # в тесте у нас нет значения Rating, мы его должны предсказать, по этому пока просто заполняем нулями

data = df.append(df_kaggle, sort=False).reset_index(drop=True) # объединяем

In [28]:
data.head(3)

Unnamed: 0,Restaurant_id,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,capital,...,City_Oslo,City_Paris,City_Prague,City_Rome,City_Stockholm,City_Vienna,City_Warsaw,City_Zurich,City_nan,sample
0,id_5569,"['European', 'French', 'International']",5570.0,3.5,2.0,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643,1,...,0,1,0,0,0,0,0,0,0,1
1,id_1535,,1537.0,4.0,2.0,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032,1,...,0,0,0,0,1,0,0,0,0,1
2,id_352,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,3.0,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781,1,...,0,0,0,0,0,0,0,0,0,1


In [30]:
df_preproc = data
df_preproc.sample(10)

Unnamed: 0,Restaurant_id,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,capital,...,City_Oslo,City_Paris,City_Prague,City_Rome,City_Stockholm,City_Vienna,City_Warsaw,City_Zurich,City_nan,sample
28676,id_1759,['American'],1773.0,4.0,1.0,43.0,"[['Great options, tasty burger', 'Poor value']...",/Restaurant_Review-g187309-d8037030-Reviews-Bu...,d8037030,0,...,0,0,0,0,0,0,0,0,0,1
33722,id_3513,"['European', 'Portuguese', 'Mediterranean']",3517.0,3.0,2.0,67.0,[['Watch for aggressive charging..... Fish so....,/Restaurant_Review-g189158-d10251651-Reviews-R...,d10251651,1,...,0,0,0,0,0,0,0,0,0,1
11795,id_14804,['Indian'],14815.0,3.0,2.0,,"[[], []]",/Restaurant_Review-g186338-d5795647-Reviews-Vi...,d5795647,1,...,0,0,0,0,0,0,0,0,0,1
17269,id_5826,"['French', 'Cafe', 'British', 'Delicatessen']",5835.0,4.5,2.0,30.0,"[['The place for cooked breakfast', 'Fabulous ...",/Restaurant_Review-g186338-d5984281-Reviews-Le...,d5984281,1,...,0,0,0,0,0,0,0,0,0,1
40608,id_608,,2495.0,0.0,2.0,,"[[], []]",/Restaurant_Review-g187331-d5521656-Reviews-Ca...,d5521656,1,...,0,0,0,0,0,0,0,0,0,0
14058,id_12043,,12045.0,3.0,2.0,40.0,"[['Decent fast food', 'Nice late night beer so...",/Restaurant_Review-g187147-d7282965-Reviews-Qu...,d7282965,1,...,0,1,0,0,0,0,0,0,0,1
3704,id_174,"['Seafood', 'European', 'Scandinavian']",175.0,5.0,2.0,25.0,"[['Excellent fish soup and cured fish', 'Great...",/Restaurant_Review-g189934-d8418176-Reviews-Ka...,d8418176,1,...,0,0,0,0,0,0,0,0,0,1
9540,id_81,"['European', 'Portuguese']",82.0,4.5,2.0,45.0,"[['A must !!', 'Excellent food a bit expensive...",/Restaurant_Review-g190356-d8078278-Reviews-Re...,d8078278,1,...,0,0,0,0,0,0,0,0,0,1
302,id_9572,['French'],9574.0,4.0,2.0,6.0,"[['Welcoming French restaurant'], ['11/23/2017']]",/Restaurant_Review-g187147-d4293803-Reviews-La...,d4293803,1,...,0,1,0,0,0,0,0,0,0,1
10260,id_2336,"['European', 'French', 'Healthy', 'Vegetarian ...",2342.0,3.5,2.0,59.0,"[['Decent', 'Great kid friendly place'], ['11/...",/Restaurant_Review-g188590-d8001824-Reviews-Le...,d8001824,1,...,0,0,0,0,0,0,0,0,0,1


In [31]:
# Теперь выделим тестовую часть
train_data = df_preproc.query('sample == 1').drop(['sample'], axis=1)
test_data = df_preproc.query('sample == 0').drop(['sample'], axis=1)

y = train_data.Rating.values            # наш таргет
X = train_data.drop(['Rating'], axis=1)

display(X)

Unnamed: 0,Restaurant_id,Cuisine Style,Ranking,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,capital,population,...,City_Oporto,City_Oslo,City_Paris,City_Prague,City_Rome,City_Stockholm,City_Vienna,City_Warsaw,City_Zurich,City_nan
0,id_5569,"['European', 'French', 'International']",5570.0,2.0,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643,1,2.1480,...,0,0,1,0,0,0,0,0,0,0
1,id_1535,,1537.0,2.0,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032,1,0.9750,...,0,0,0,0,0,1,0,0,0,0
2,id_352,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,3.0,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781,1,8.9610,...,0,0,0,0,0,0,0,0,0,0
3,id_3456,,3458.0,2.0,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776,1,3.6690,...,0,0,0,0,0,0,0,0,0,0
4,id_615,"['German', 'Central European', 'Vegetarian Fri...",621.0,2.0,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963,0,1.4880,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,id_499,"['Italian', 'Vegetarian Friendly', 'Vegan Opti...",500.0,2.0,79.0,"[['The real Italian experience!', 'Wonderful f...",/Restaurant_Review-g187849-d2104414-Reviews-Ro...,d2104414,0,1.1399,...,0,0,0,0,0,0,0,0,0,0
39996,id_6340,"['French', 'American', 'Bar', 'European', 'Veg...",6341.0,2.0,542.0,"[['Parisian atmosphere', 'Bit pricey but inter...",/Restaurant_Review-g187147-d1800036-Reviews-La...,d1800036,1,2.1480,...,0,0,1,0,0,0,0,0,0,0
39997,id_1649,"['Japanese', 'Sushi']",1652.0,2.0,4.0,"[['Good by swedish standards', 'A hidden jewel...",/Restaurant_Review-g189852-d947615-Reviews-Sus...,d947615,1,0.9750,...,0,0,0,0,0,1,0,0,0,0
39998,id_640,"['Polish', 'European', 'Eastern European', 'Ce...",641.0,2.0,70.0,"[['Underground restaurant', 'Oldest Restaurant...",/Restaurant_Review-g274856-d1100838-Reviews-Ho...,d1100838,1,1.7930,...,0,0,0,0,0,0,0,1,0,0


In [42]:
df['Price Range'].mode()

0    2
dtype: int64

In [32]:
# Воспользуемся специальной функцие train_test_split для разбивки тестовых данных
# выделим 20% данных на валидацию (параметр test_size)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [35]:
# проверяем
test_data.shape, train_data.shape, X.shape, X_train.shape, X_test.shape

((10000, 45), (40000, 45), (40000, 44), (32000, 44), (8000, 44))

In [36]:
#ML





# Создаём модель (НАСТРОЙКИ НЕ ТРОГАЕМ)
model = RandomForestRegressor(n_estimators=100, verbose=1, n_jobs=-1, random_state=RANDOM_SEED)

# Обучаем модель на тестовом наборе данных
model.fit(X_train, y_train)

# Используем обученную модель для предсказания рейтинга ресторанов в тестовой выборке.
# Предсказанные значения записываем в переменную y_pred
y_pred = model.predict(X_test)

#вставить округление 

#потом мае

#y_pred = model.predict(X_test)

#пото гистограма

# в RandomForestRegressor есть возможность вывести самые важные признаки для модели
plt.rcParams['figure.figsize'] = (10,10)
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(15).plot(kind='barh')




ValueError: could not convert string to float: 'id_4098'