It is necessary to build a model that defines dishonest hotels that cheat rating themselves. If the model's predictions are very different from the actual result, then perhaps the hotel is behaving dishonestly, and it is worth checking.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# importing libraries for visualizationс
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
import plotly.express as px

# download a special convevnien tool for splitting the fataset:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor # Tool for creating and training a model
from sklearn import metrics # tool for evaluating model accuracy
from sklearn.model_selection import cross_val_score
import category_encoders as ce

import re
# regular expression - to search for similar expressions
import optuna
#Tool for the selection of hyperparameters
from geopy.distance import geodesic 
# Geopy librery using to find distance

from textblob import TextBlob, Blobber
from textblob.sentiments import NaiveBayesAnalyzer
# library for processing textual data

from sklearn.pipeline import Pipeline
import joblib


In [None]:
RANDOM_SEED = 42

hotels_train.csv

https://drive.google.com/file/d/16Xw1pwpg0lPUVeFchcxQRbDgj7e2NUZ-/view?usp=share_link

hotels_test.csv

https://drive.google.com/file/d/1ha76l6i31Oq1bcik74lruD6EYYlWiggI/view?usp=share_link

In [None]:
df_train = pd.read_csv('hotels_train.csv') # dataset for training
df_test = pd.read_csv('hotels_test.csv') # dataset for prediction

In [None]:
df_train.head(2)

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,days_since_review,lat,lng
0,Stratton Street Mayfair Westminster Borough Lo...,581,2/19/2016,8.4,The May Fair Hotel,United Kingdom,Leaving,3,1994,Staff were amazing,4,7,10.0,"[' Leisure trip ', ' Couple ', ' Studio Suite ...",531 day,51.507894,-0.143671
1,130 134 Southampton Row Camden London WC1B 5AF...,299,1/12/2017,8.3,Mercure London Bloomsbury Hotel,United Kingdom,poor breakfast,3,1361,location,2,14,6.3,"[' Business trip ', ' Couple ', ' Standard Dou...",203 day,51.521009,-0.123097


## Data cleaning

#### Removing duplicates

We identify and remove duplicates from the dataset for training.

In [None]:
mask = df_train.duplicated()
data_duplicates = df_train[mask]
print(f'number of duplicates found: {data_duplicates.shape[0]}')
df_train = df_train.drop_duplicates()

number of duplicates found: 307


In [None]:
#For the correct processing of signs, we combine the train and 
# the test into one dataset

df_train['sample'] = 1 # we mark where we have a train and test
df_test['sample'] = 0 # 
df_test['reviewer_score'] = 0 # in the test,we don't have a reviewer_score, 
                                #we have to predict it, so for now we just fill it with zeros

data = df_test.append(df_train, sort=False).reset_index(drop=True)

  data = df_test.append(df_train, sort=False).reset_index(drop=True)


#### Number of zero values 

We determine and replace the number of zero values

In [None]:
data.isnull().mean()

hotel_address                                 0.00000
additional_number_of_scoring                  0.00000
review_date                                   0.00000
average_score                                 0.00000
hotel_name                                    0.00000
reviewer_nationality                          0.00000
negative_review                               0.00000
review_total_negative_word_counts             0.00000
total_number_of_reviews                       0.00000
positive_review                               0.00000
review_total_positive_word_counts             0.00000
total_number_of_reviews_reviewer_has_given    0.00000
tags                                          0.00000
days_since_review                             0.00000
lat                                           0.00634
lng                                           0.00634
sample                                        0.00000
reviewer_score                                0.00000
dtype: float64

In [None]:
# Replace the missing "lat" and "ing" values with 0. 

values = {
    'lat':0,
    'lng':0
}

data = data.fillna(values)

## Features enginering

### Seasons

In [None]:
data_seasons = data.copy()

In [None]:
# Changing the date format to datetime

data_seasons['review_months'] = pd.to_datetime(data_seasons['review_date']).dt.month

In [None]:
# Let's save the feature separately for further selection of features

data_seasons['review_months'].to_csv('intermediate_calculations/data_months_enc.csv', index=False, sep=',')

### Distancte

I assume that the hotel's rating may be influenced by the location. Let's try to create signs.

In [None]:
# Extracting the country name from the addres.

data_distance = data.copy()

# Use a regex to search for data
data_distance['hotel_country'] =  data_distance['hotel_address'].str.findall(r'\w+').str[-1]
data_distance['hotel_city'] =  data_distance['hotel_address'].str.findall(r'\w+').str[-2]

# English numbers are different.
data_distance.loc[data_distance['hotel_city'] == 'United','hotel_country'] = "United Kingdom"
data_distance.loc[data_distance['hotel_country'] == 'United Kingdom', 'hotel_city'] = data_distance['hotel_address'].str.findall(r'\w+').str[-5]

In [None]:
# Uploading a dataset with geodata of European cities
geo_data = pd.read_csv('https://public.opendatasoft.com/api/explore/v2.1/catalog/datasets/geonames-all-cities-with-a-population-1000/exports/csv?lang=en&facet=facet(name%3D%22cou_name_en%22%2C%20disjunctive%3Dtrue)&refine=timezone%3A%22Europe%22&timezone=Europe%2FBerlin&use_labels=true&csv_separator=%3B', sep=';')

In [None]:
# Find the coordinates of the city centers.

# geodata is filtered by country to exclude duplication of city names, 
# and then filtered by city
hotel_country_list = list(data_distance.hotel_country.value_counts().index)
geo_data_country = geo_data[geo_data['LABEL EN'].isin(hotel_country_list)]
hotel_city_list = list(data_distance.hotel_city.value_counts().index)
geo_data_city = geo_data_country[geo_data_country.Name.isin(hotel_city_list)]

# Creating a dataframe with coordinates
city_coordinates = geo_data_city[['Name','Coordinates']].reset_index()
city_coordinates[['lat_city','lng_city']] = city_coordinates['Coordinates'].str.split(',', 1 , expand= True)
city_coordinates = city_coordinates.drop(['index','Coordinates'], axis=1)

# merge dataframes
data_distance = data_distance.merge(
    city_coordinates,
    left_on = "hotel_city",
    right_on = "Name",
    how='left'
)

In [None]:
# Find the distance from the city center to the hotel.

# Creating a separete temporary column with all coordinates
data_distance['coordinates'] = (data_distance['lat'].astype(str) +','+ data_distance['lng'].astype(str) +','+ data_distance['lat_city'].astype(str) +','+ data_distance['lng_city'].astype(str)).str.split(',')

#Geopy librery using to find distance to the city center

def geodesic_func (coordinates):
    hotel = (coordinates[0], coordinates[1])
    city_center = (coordinates[2], coordinates[3])
    return geodesic(hotel,city_center).kilometers
   
    
data_distance['distance'] = data_distance['coordinates'].apply(lambda x: geodesic_func(x))

In [None]:
#Find outliers in the value of 'distance'.

# The spread of the values us determined to find out outliers.
# Due to missing data, some columns have a huge distance to the center of their cities 
data_distance['distance'].describe()

count    515431.000000
mean         37.955103
std         435.972996
min           0.030637
25%           1.317636
50%           2.574862
75%           4.165234
max        5570.491043
Name: distance, dtype: float64

In [None]:
#Since the data are not distributed normally,
# a z-deviation method with preliminary logarithm is needed to determine outliers

def outliers_z_score(data, feature, log_scale=False, left=3, right=3):
    if log_scale:
        x = np.log(data[feature]+1)
    else:
        x = data[feature]
     
    mu = x.mean()
    sigma = x.std()
    lower_bound = mu - left * sigma
    upper_bound = mu + right * sigma
    outliers = data[(x < lower_bound) | (x > upper_bound)]
    cleaned = data[(x > lower_bound) & (x < upper_bound)]
    return outliers, cleaned

outliers, cleaned = outliers_z_score(data_distance, 'distance', log_scale=True,)
print(f'The number of outliers by the z-deviation method: {outliers.shape[0]}')
print(f'The resulting number of records: {cleaned.shape[0]}')

The number of outliers by the z-deviation method: 3268
The resulting number of records: 512163


In [None]:
# Replace the outliers with the median

data_distance.loc[outliers.index,['distance']] = data_distance['distance'].median()
data_distance_enc = data_distance['distance'].apply(lambda x:round(x,1))

In [None]:
# Let's save the data set separately
data_distance_enc.to_csv('intermediate_calculations/data_distance_enc.csv', index=False, sep=',')

### Tags

Tags can carry a lot of rating information
We highlight the tags and their number.

In [None]:
data_tags = data.copy()

In [None]:
# Let's create a special function that creates a column with a tag and marks a feature in it, if feature is in the cell.

def encounter_funk_columns (data,column,key_list,prefix):
    def encounter_funk (x, tag):
        if tag in x:
            return 1
        else:
            return 0
    tmp = pd.DataFrame()
    for tag in key_list:
        tmp[prefix+tag] = data[column].apply(lambda x: encounter_funk(x,tag))
    return tmp

In [None]:
# This function creates a list words in tags whose length greater than 2

def tag_key_words_funk(x):
    list_key_words = []
    for i in x:
        if len(str(i)) >= 3:
            list_key_words.append(i.strip())    
    return list_key_words

In [None]:
# Let's explore the feature
# tags have a similar structure

display(data_tags['tags'][0])
display(data_tags['tags'][1])
display(data_tags['tags'][2])
       

"[' Leisure trip ', ' Couple ', ' Double Room ', ' Stayed 2 nights ']"

"[' Leisure trip ', ' Group ', ' Triple Room ', ' Stayed 1 night ']"

"[' Business trip ', ' Solo traveler ', ' Twin Room ', ' Stayed 1 night ', ' Submitted from a mobile device ']"

In [None]:
# let's create a function that splits the tag column

def get_tags(tags_string):
    
    # we clear the tag string, split it and put on the tag list 
    tags_string = tags_string.replace('[', '').replace(']', '')
    tags_string = tags_string.replace("' ", '').replace(" '", '')
    tags_list = tags_string.split(',')
    tags_list = list(map(lambda x: x.strip(), tags_list))

    # The dictionary is needed for encoding
    reviewer_types = {'Solo traveler': 1,
                      'Couple': 2,
                      'Travelers with friends': 3,
                      'Family with young children': 4,
                      'Family with older children': 5,
                      'Group': 6}

    # Creating default values
    tag_from_mobile = 0
    tag_with_pet = 0
    tag_leisure_trip = 1
    tag_reviewer_type = 0
    tag_count_nights = 0
    tag_room_type = ''
    
    # We define and encode the tag columns
    for teg in list(tags_list):
        if teg == 'Submitted from a mobile device':
            tag_from_mobile = 1
            tags_list.remove(teg)

        if teg == 'With a pet':
            tag_with_pet = 1
            tags_list.remove(teg)

        if teg == 'Business trip':
            tag_leisure_trip = 0
            tags_list.remove(teg)

        if teg == 'Leisure trip':
            tags_list.remove(teg)

        if teg in reviewer_types.keys():
            tag_reviewer_type = reviewer_types[teg]
            tags_list.remove(teg)

        if re.fullmatch(r'Stayed\s\d+\snight\w?', teg):
            tag_count_nights = [float(s) for s in re.findall(r'-?\d+\.?\d*', teg)][0]           
            tags_list.remove(teg)
            
        if len(tags_list) > 0:
            tag_room_type = tags_list[0]

    return tag_from_mobile, tag_with_pet, tag_leisure_trip, \
        tag_reviewer_type, tag_count_nights, tag_room_type

# We create DataFrame object and add values on it
new_data_tags = pd.DataFrame()

new_data_tags[['tag_from_mobile',
               'tag_with_pet',
               'tag_leisure_trip',
               'tag_reviewer_type',
               'tag_count_nights',
               'tag_room_type']
              ] = pd.DataFrame(data_tags['tags'].apply(get_tags).to_list(), index=data_tags.index)

# let's see how many unique values are in each columns
new_data_tags[['tag_from_mobile', 'tag_with_pet', 'tag_leisure_trip', 
        'tag_reviewer_type', 'tag_count_nights', 'tag_room_type']].nunique()

tag_from_mobile         2
tag_with_pet            2
tag_leisure_trip        2
tag_reviewer_type       6
tag_count_nights       32
tag_room_type        2394
dtype: int64

In [None]:
# Let's try explore room types
new_data_tags.tag_room_type.value_counts()[:20]

Double Room                     35207
Standard Double Room            32247
Superior Double Room            31361
Deluxe Double Room              24812
Double or Twin Room             22393
Standard Double or Twin Room    17479
Classic Double Room             16877
Superior Double or Twin Room    13570
2 rooms                         12384
Standard Twin Room               9745
Single Room                      9668
Twin Room                        8320
Executive Double Room            6425
Classic Double or Twin Room      6100
Superior Twin Room               6054
Deluxe Double or Twin Room       5996
Club Double Room                 5908
Queen Room                       5472
Deluxe King Room                 5357
Superior Queen Room              4897
Name: tag_room_type, dtype: int64

In [None]:
new_data_tags.tag_room_type.apply(lambda x: x.split()).explode().value_counts()[:20]

Room         466822
Double       304539
Twin         134933
or            91768
Standard      87328
Superior      86082
Deluxe        65867
with          45292
King          36876
Classic       35837
Queen         30158
Single        27138
2             24846
Executive     19536
View          17539
Suite         17257
Guest         13813
rooms         12926
1             11812
Bed           11190
Name: tag_room_type, dtype: int64

In [None]:
# creating the 'tag_view_room' column
new_data_tags['tag_view_room'] = new_data_tags['tag_room_type'].apply(
    lambda x: 1 if ('View' in x)|('Panoramic' in x) else 0)

# creating a function that determines the type of room
def get_room_type(tag):
    room_types = ['without Window', 'Guestroom', 'Classic', 'Single', 
                  'Standard', 'Superior', 'Comfort', 'Club', 'Suite', 
                  'Deluxe', 'King', 'Premier']

    for rt in room_types:
        if rt in tag:
            return rt
    return 'Other'

# creating a function that determines the double room
def get_double_type(tag):
    if 'Double' in tag or 'Twin' in tag:
        return 1
    else:
        return 0 
    
new_data_tags['doble_type'] = new_data_tags['tag_room_type'].apply(get_double_type)
new_data_tags['room_type'] = new_data_tags['tag_room_type'].apply(get_room_type)

# Encoding columns
encoder_r = ce.OrdinalEncoder(cols=['room_type'])
tag_rooms_bin = encoder_r.fit_transform(new_data_tags['room_type']).rename(columns={'room_type':'room_type_'})
new_data_tags = pd.concat([new_data_tags, tag_rooms_bin], axis=1).drop(['tag_room_type','room_type'],axis=1)
new_data_tags['tag_count_nights'] = new_data_tags['tag_count_nights'].astype('Int8')

In [None]:
# Saving the feutures

new_data_tags.to_csv('intermediate_calculations/new_data_tags.csv', index=False)

### Tags count

Perhaps the number of tags also affects the rating

In [None]:
data_tags['tags_count'] = data_tags['tags'].apply(lambda x: len(x))
data_tags_count = data_tags['tags_count'] 
# Let's save the data set separately
data_tags_count.to_csv('intermediate_calculations/data_tags_count.csv', index=False, sep=',')

### Phrases polarity

A TextBlob library can determine the emocional polarity of phrases 

In [None]:
phrases_sentiment = data

In [None]:
def get_polarity(text):
    blob = TextBlob(text)
    return blob.sentiment_assessments.polarity

phrases_sentiment['negative_phrases_polarity'] = phrases_sentiment['negative_review'].apply(get_polarity)
phrases_sentiment['positive_phrases_polarity'] = phrases_sentiment['positive_review'].apply(get_polarity)

In [None]:
phrases_sentiment['negative_phrases_polarity'] = phrases_sentiment['negative_phrases_polarity'].apply(lambda x: round(x,2))
phrases_sentiment['positive_phrases_polarity'] = phrases_sentiment['positive_phrases_polarity'].apply(lambda x: round(x,2))

In [None]:
data_phrases_sentiment = phrases_sentiment[['negative_phrases_polarity','positive_phrases_polarity']]
data_phrases_sentiment.to_csv('intermediate_calculations/data_phrases_sentiment.csv', index=False, sep=',')

### Reviews words count

Let's try adding the functions "total number of words" and "positive proportion of words".

In [None]:
reviews_words_count = data.copy()

reviews_words_count['review_total_word_counts'] = \
    reviews_words_count['review_total_positive_word_counts'] + \
    reviews_words_count['review_total_negative_word_counts']


reviews_words_count['review_positive_word_proportion'] = reviews_words_count.apply(
    lambda row: 0.5 if row['review_total_word_counts'] == 0 else \
    row['review_total_positive_word_counts'] / row['review_total_word_counts'], axis=1)


reviews_words_count['review_positive_word_proportion'] = \
    reviews_words_count['review_positive_word_proportion'].apply(lambda x: round(x,2))
reviews_words_count = reviews_words_count[['review_positive_word_proportion','review_total_word_counts']]

reviews_words_count.to_csv('intermediate_calculations/reviews_words_count.csv', index=False)

### Reviews words

We are looking the most frequently encountered nouns and adjectives and creating blinc signs

In [None]:
data_reviews_words = data.copy()

In [None]:
# We use textblob library to find nouns and adjective and make key words list
from textblob import TextBlob
def review_key_words_funk_noun(x):
    list_key_words = []
    for i in x:
        blob = TextBlob(str(i))
        blob_text = blob.tags
        if (blob_text[0][1] == 'NN') and (len(blob_text[0][0]) > 2):
            list_key_words.append(blob_text[0][0].lower())
    return list_key_words

def review_key_words_funk_JJ(x):
    list_key_words = []
    for i in x:
        blob = TextBlob(str(i))
        blob_text = blob.tags
        if (blob_text[0][1]  == 'JJ') and (len(blob_text[0][0]) > 2):
            list_key_words.append(blob_text[0][0].lower())
    return list_key_words

In [None]:
# let's do the words of the review separately 
data_reviews_words['positive_review'] = \
    data_reviews_words['positive_review'].apply(lambda x: x.lower().split())
data_reviews_words['negative_review'] = \
    data_reviews_words['negative_review'].apply(lambda x: x.lower().split())

In [None]:
# create features with positive and negative words in the review cell
list_p_key_words_noun =\
    review_key_words_funk_noun(data_reviews_words['positive_review'].explode().value_counts().keys())
list_n_key_words_noun = \
    review_key_words_funk_noun(data_reviews_words['negative_review'].explode().value_counts().keys())

In [None]:
list_p_key_words_jj = \
    review_key_words_funk_JJ(data_reviews_words['positive_review'].explode().value_counts().keys())
list_n_key_words_jj = \
    review_key_words_funk_JJ(data_reviews_words['negative_review'].explode().value_counts().keys())

#### Perhaps the lines with the highest and lowest average rating may contain unique and important words in the reviews

In [None]:
data_reviews_words = data.copy()

In [None]:
# the first quantile contain lines with the highest and lowest average rating.
quantile_025 = data_reviews_words['average_score'].value_counts().quantile(0.25)
high_and_low_ratings = data_reviews_words['average_score'].value_counts()[data_reviews_words['average_score'].value_counts() < quantile_025].keys()

# filter out the values and split them
mask = data_reviews_words['average_score'].isin(list(high_and_low_ratings))
rare_positive_review = data_reviews_words[mask]['positive_review']
rare_negative_review = data_reviews_words[mask]['negative_review']

rare_positive_review = rare_positive_review.apply(lambda x: x.lower().split())
rare_negative_review = rare_negative_review.apply(lambda x: x.lower().split())

In [None]:
# creating a lists of nouns and adjectives

rare_p_words_nn = review_key_words_funk_noun(rare_positive_review.explode().value_counts().keys())
rare_n_words_nn = review_key_words_funk_noun(rare_negative_review.explode().value_counts().keys())
rare_p_words_jj = review_key_words_funk_JJ(rare_positive_review.explode().value_counts().keys())
rare_n_words_jj = review_key_words_funk_JJ(rare_negative_review.explode().value_counts().keys())

In [None]:
# We select the first twenty values and add them to the set. 
# This is necessary to exclude duplicates.

set_of_all_words = set()
set_of_all_words = set_of_all_words.union(set(list_p_key_words_noun[:20]))
set_of_all_words = set_of_all_words.union(set(list_n_key_words_noun[:20]))
set_of_all_words = set_of_all_words.union(set(list_p_key_words_jj[:20]))
set_of_all_words = set_of_all_words.union(set(list_n_key_words_jj[:20]))

set_of_all_words = set_of_all_words.union(set(rare_p_words_nn[:20]))
set_of_all_words = set_of_all_words.union(set(rare_n_words_nn[:20]))
set_of_all_words = set_of_all_words.union(set(rare_p_words_jj[:20]))
set_of_all_words = set_of_all_words.union(set(rare_n_words_jj[:20]))

In [None]:
# Creating binary columns

data_p_words_enc = \
    encounter_funk_columns(data_reviews_words,'positive_review',list(set_of_all_words),'pw_')
data_n_words_enc = \
    encounter_funk_columns(data_reviews_words,'negative_review',list(set_of_all_words),'nw_')

In [None]:
# Let's save the features separately
data_p_words_enc.to_csv('intermediate_calculations/data_p_words_enc.csv', index=False, sep=',')
data_n_words_enc.to_csv('intermediate_calculations/data_n_words_enc.csv', index=False, sep=',')

### Hotel name

Let's try to fins the names of hotels

In [None]:
data_hotel_name= data.copy()

In [None]:
# We take the top 10 most popular hotels, and rename the other like 'other'

unique_list = data_hotel_name['hotel_name'].value_counts().index[:10]
data_hotel_name['hotel_name'] = data_hotel_name['hotel_name'].apply(lambda x: x if x in unique_list else 'other')

# encoding them
encoder = ce.OrdinalEncoder(data_hotel_name['hotel_name'])
data_hotel_name_enc = encoder.fit_transform(data_hotel_name['hotel_name'])

In [None]:
data_hotel_name_enc.to_csv('intermediate_calculations/data_hotel_name_enc.csv', index=False)

### Nationality

Binary encode the attribute 'nationality'

In [None]:
data_nationality=data.copy()

In [None]:
bin_encoder = ce.OrdinalEncoder(cols=['reviewer_nationality']) 
data_nationality_enc = bin_encoder.fit_transform(data_nationality['reviewer_nationality'])


In [None]:
# Let's save the features separately
data_nationality_enc.to_csv('intermediate_calculations/data_nationality_enc.csv', index=False, sep=',')

### Day since review category

Perhaps the date of the review may matter

In [None]:
data_dete_scince_review = data.copy()
data_dete_scince_review['days_since_review'] = \
    data_dete_scince_review['days_since_review'].str.findall(r'\d+').str[0].astype(int)
days_since_review = data_dete_scince_review['days_since_review']

In [None]:
days_since_review.to_csv('intermediate_calculations/data_dete_scince_review_enc.csv', index=False, sep=',')

# Fuetere selection 

Here we experiment, add and modify functions to train the model.

In [None]:
data_for_concat = data.copy()
                                                        
data_for_concat = data_for_concat[[
                        'average_score',
                        'review_total_negative_word_counts',
                        'review_total_positive_word_counts',
                        'total_number_of_reviews_reviewer_has_given',
                        'sample',
                        'reviewer_score',
                        'additional_number_of_scoring'
                        ]]

data_for_concat.to_csv('intermediate_calculations/data_for_concat.csv', index=False, sep=',')

In [None]:
data_for_concat = pd.read_csv('intermediate_calculations/data_for_concat.csv', sep=',')
data_seasons_enc = pd.read_csv('intermediate_calculations/data_seasons_enc.csv', sep=',')
data_months_enc = pd.read_csv('intermediate_calculations/data_months_enc.csv', sep=',')
data_distance_enc = pd.read_csv('intermediate_calculations/data_distance_enc.csv', sep=',')
data_tags_count = pd.read_csv('intermediate_calculations/data_tags_count.csv', sep=',')
data_tags_enc = pd.read_csv('intermediate_calculations/data_tags_enc.csv', sep=',')
data_pr_noun_enc = pd.read_csv('intermediate_calculations/data_pr_noun_enc.csv', sep=',')
data_nr_noun_enc = pd.read_csv('intermediate_calculations/data_nr_noun_enc.csv', sep=',')
data_p_words_enc = pd.read_csv('intermediate_calculations/data_p_words_enc.csv', sep=',')
data_n_words_enc = pd.read_csv('intermediate_calculations/data_n_words_enc.csv', sep=',')
data_h_name_enc = pd.read_csv('intermediate_calculations/data_h_name_enc.csv', sep=',')
data_phrases_sentiment = pd.read_csv('intermediate_calculations/data_phrases_sentiment.csv', sep=',')
data_nationality_enc = pd.read_csv('intermediate_calculations/data_nationality_enc.csv', sep=',')
data_dete_scince_review_enc = pd.read_csv('intermediate_calculations/data_dete_scince_review_enc.csv', sep=',')
data_hotel_name_enc = pd.read_csv('intermediate_calculations/data_hotel_name_enc.csv', sep=',')
new_data_tags = pd.read_csv('intermediate_calculations/new_data_tags.csv', sep=',')
reviews_words_count = pd.read_csv('intermediate_calculations/new_data_tags.csv', sep=',')

In [None]:
concat_list = [
    data_for_concat,
    data_months_enc.astype('int8'),
    #data_distance_enc.astype('int16'), the "distance to the city center" feature worsens prediction
    data_tags_count.astype('int16'),
    data_p_words_enc.astype('int8'),
    data_n_words_enc.astype('int8'), 
    data_phrases_sentiment.astype('float16'),
    data_nationality_enc.astype('int16'),
    data_hotel_name_enc,
    new_data_tags.astype('int8'),
    reviews_words_count.astype('int16')
]
     
data_concatenation = pd.concat(concat_list,axis=1)

In [None]:
data_concatenation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515431 entries, 0 to 515430
Columns: 177 entries, average_score to room_type_
dtypes: float16(2), float64(2), int16(10), int64(6), int8(157)
memory usage: 120.4 MB


# Multicollinearity

In [None]:
data_for_corr = data_concatenation.copy()

In [None]:
import plotly.express as px

fig = px.imshow(data_for_corr.drop(['sample'], axis=1).corr(),\
    width=800, height=800,color_continuous_scale='RdBu_r')

fig.show()

Picture for git

<img src="data/corr1.png">

# Removing features

In [None]:
# We remove some of the features with multicorrelation

list_for_drop = ['pw_hot','nw_comfortable','tag_from_mobile','nw_hot']

data_for_corr = data_for_corr.drop(list_for_drop, axis=1)


In [None]:
fig = px.imshow(data_for_corr.drop(['sample'], axis=1).corr(),width=800, height=800,color_continuous_scale='RdBu_r')

fig.show()

Picture for git

<img src="data/corr2.png">

# Modeling

In [None]:
data_for_train = data_for_corr

In [None]:
# let's select the test part
train_data = data_for_train.query('sample == 1').drop(['sample'], axis=1)
test_data = data_for_train.query('sample == 0').drop(['sample'], axis=1)

y = train_data.reviewer_score.values
X = train_data.drop(['reviewer_score'], axis=1)

# Let's use the special function train_test_split to split the test data
# allocate 20% of the data for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

### Definition of hyperparameters

In [None]:
def optuna_rf(trial, cv=2, X=X_train, y=y_train, random_state=RANDOM_SEED):

  n_estimators = trial.suggest_int('n_estimators', 40, 200, 20)
  max_depth = trial.suggest_int('max_depth', 16, 30, 1)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 6, 16, 1)

  model = RandomForestRegressor(n_estimators=n_estimators,
                                max_depth=max_depth,
                                min_samples_leaf=min_samples_leaf,
                                random_state=random_state,
                                verbose=1, 
                                n_jobs=-1 )

  model.fit(X_train, y_train)
  score = cross_val_score(model, X, y, cv=cv, scoring="neg_mean_absolute_error", n_jobs=-1).mean()
  return score

In [None]:
study = optuna.create_study(study_name="RandomForestRegressor", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_rf, n_trials=20)

In [None]:
study.best_params

{'n_estimators': 180, 'max_depth': 20, 'min_samples_leaf': 6}

### Creating a pipeline and saving the model

In [None]:
pipeline = Pipeline([('rf', RandomForestRegressor())])

In [None]:
pipeline.set_params(rf__n_estimators=180, 
                    rf__max_depth=20,
                    rf__min_samples_leaf=6,
                    rf__verbose=1, 
                    rf__n_jobs=-1
                    )

In [None]:
pipeline.fit(X_train,y_train)
y_pred = pipeline.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

joblib.dump(pipeline, 'pipeline_hotel.pkl')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  8.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    2.3s finished


MAE: 0.853884650238378


['pipeline_hotel.pkl']

In [None]:
pipeline = joblib.load('pipeline_hotel.pkl')

In [None]:
metrics.mean_absolute_percentage_error(y_test, y_pred)

0.12348731350674301