In [339]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [340]:
hotels = pd.read_csv('data/hotels.csv')
print(hotels['tags'][1])

[' Business trip ', ' Couple ', ' Standard Double Room ', ' Stayed 1 night ']


In [341]:
hotels['review_words_proportion'] =\
    hotels['review_total_positive_word_counts']\
        /((hotels['review_total_positive_word_counts']\
            +hotels['review_total_negative_word_counts'])/100)
hotels['review_words_proportion'].fillna(0)

0          57.142857
1          40.000000
2           0.000000
3         100.000000
4          83.333333
             ...    
386798    100.000000
386799    100.000000
386800     42.857143
386801    100.000000
386802     50.000000
Name: review_words_proportion, Length: 386803, dtype: float64

In [342]:
unique_list = []
# пробегаемся по именам столбцов в таблице
for col in hotels.columns:
    # создаём кортеж (имя столбца, число уникальных значений)
    item = (col, hotels[col].nunique()) 
    # добавляем кортеж в список
    unique_list.append(item)
# создаём вспомогательную таблицу и сортируем её
unique_counts = pd.DataFrame(
    unique_list,
    columns=['Column_Name', 'Num_Unique']
).sort_values(by='Num_Unique',  ignore_index=True)
# выводим её на экран
display(unique_counts)

Unnamed: 0,Column_Name,Num_Unique
0,average_score,34
1,reviewer_score,37
2,total_number_of_reviews_reviewer_has_given,194
3,reviewer_nationality,225
4,review_total_positive_word_counts,354
5,review_total_negative_word_counts,402
6,additional_number_of_scoring,480
7,review_date,731
8,days_since_review,731
9,total_number_of_reviews,1142


In [343]:
hotels = hotels.fillna(0)

Превращаем **average_score** в категорию.

In [344]:
counter = 0

average_score_preserved_mapper = dict()

for i in sorted(set(hotels['average_score'])):
    average_score_preserved_mapper.update({i:counter})
    counter += 1

print(average_score_preserved_mapper)

{5.2: 0, 6.4: 1, 6.6: 2, 6.7: 3, 6.8: 4, 6.9: 5, 7.0: 6, 7.1: 7, 7.2: 8, 7.3: 9, 7.4: 10, 7.5: 11, 7.6: 12, 7.7: 13, 7.8: 14, 7.9: 15, 8.0: 16, 8.1: 17, 8.2: 18, 8.3: 19, 8.4: 20, 8.5: 21, 8.6: 22, 8.7: 23, 8.8: 24, 8.9: 25, 9.0: 26, 9.1: 27, 9.2: 28, 9.3: 29, 9.4: 30, 9.5: 31, 9.6: 32, 9.8: 33}


In [345]:
hotels['average_score'] = hotels['average_score'].replace(average_score_preserved_mapper)
hotels['average_score'] = hotels['average_score'].astype(int)

In [346]:
#print(hotels['average_score'])
#hotels['average_score'] = hotels['average_score'].astype('category')

In [347]:
#max_unique_count = 50 # задаём максимальное число уникальных категорий
#for col in hotels.columns: # цикл по именам столбцов
#    if hotels[col].nunique() < max_unique_count: # проверяем условие
#        hotels[col] = hotels[col].astype('category') # преобразуем тип столбца
#display(hotels.info())

**Создаём признак hotel_country: страна, где находится отель**

In [348]:
countrified = pd.DataFrame()

def countrifier(string):
    cut = string.split()

    country = cut[-1:]

    country = country[0]

    if country == 'Kingdom':
        country = 'United Kingdom'
    return country

hotels['hotel_country'] = hotels['hotel_address'].apply(countrifier)

display(hotels['hotel_country'].value_counts())

United Kingdom    196774
Spain              45132
France             44830
Netherlands        43006
Austria            29178
Italy              27883
Name: hotel_country, dtype: int64

In [349]:
# В reviewer_nationality лишние пробелы
# и эта функция убирает их
def ws_cutter(string):
    string = string[1:-1]
    return(string)

hotels['reviewer_nationality'] =\
    hotels['reviewer_nationality'].apply(ws_cutter)

**Получаем признак is_homeland – сравнение национальности и страны отеля**

In [350]:
hotels['is_homeland'] = (hotels['hotel_country']==hotels['reviewer_nationality'])
comp_pd = pd.DataFrame()
comp_pd['nat'], comp_pd['htl'], comp_pd['=='] =\
    hotels['reviewer_nationality'], hotels['hotel_country'], hotels['is_homeland']
display(hotels[['hotel_country', 'reviewer_nationality', 'is_homeland']].head(3))

Unnamed: 0,hotel_country,reviewer_nationality,is_homeland
0,United Kingdom,United Kingdom,True
1,United Kingdom,United Kingdom,True
2,France,China,False


In [351]:
print(hotels['hotel_country'].describe(),hotels['reviewer_nationality'].info())

<class 'pandas.core.series.Series'>
RangeIndex: 386803 entries, 0 to 386802
Series name: reviewer_nationality
Non-Null Count   Dtype 
--------------   ----- 
386803 non-null  object
dtypes: object(1)
memory usage: 3.0+ MB
count             386803
unique                 6
top       United Kingdom
freq              196774
Name: hotel_country, dtype: object None


In [353]:
counter = 0

average_score_preserved_mapper = dict()

for i in sorted(set(hotels['hotel_country'])):
    average_score_preserved_mapper.update({i:counter})
    counter += 1

print(average_score_preserved_mapper)

{'Austria': 0, 'France': 1, 'Italy': 2, 'Netherlands': 3, 'Spain': 4, 'United Kingdom': 5}


In [352]:
hotels['hotel_country'] = hotels['hotel_country'].astype('category')
hotels['hotel_country'].describe()

count             386803
unique                 6
top       United Kingdom
freq              196774
Name: hotel_country, dtype: object

In [354]:
counter = 0

hotel_country_preserved_mapper = dict()

for i in sorted(set(hotels['hotel_country'])):
    hotel_country_preserved_mapper.update({i:counter})
    counter += 1

print(hotel_country_preserved_mapper)

{'Austria': 0, 'France': 1, 'Italy': 2, 'Netherlands': 3, 'Spain': 4, 'United Kingdom': 5}


In [355]:
hotels['average_score'] = hotels['average_score'].replace(average_score_preserved_mapper)
hotels['average_score'] = hotels['average_score'].astype(int)

In [316]:
hotels['days_since_review'] = ((pd.to_datetime('2017-08-04')\
    -pd.to_datetime(hotels['review_date']))/ np.timedelta64(1, 'D'))
print(hotels['days_since_review'][0])

532.0


In [317]:
def rev_func(num_tags):
    num_tags = num_tags[2:-2]
    res = num_tags.strip().split(' \', \' ')
    return res

hotels['tags_n'] = hotels['tags'].apply(rev_func)
enum = hotels.explode('tags_n')

res = enum['tags_n'].value_counts()
print(res)

Leisure trip                         313593
Submitted from a mobile device       230778
Couple                               189212
Stayed 1 night                       145373
Stayed 2 nights                      100263
                                      ...  
Studio with Spa Access                    1
Comfort Family Room                       1
Junior Suite Free Wifi                    1
Design Suite                              1
Executive Double Room Non Smoking         1
Name: tags_n, Length: 2368, dtype: int64


In [318]:
pop_tags = enum['tags_n'].value_counts().nlargest(50).index
print(pop_tags)

Index(['Leisure trip', 'Submitted from a mobile device', 'Couple',
       'Stayed 1 night', 'Stayed 2 nights', 'Solo traveler', 'Stayed 3 nights',
       'Business trip', 'Group', 'Family with young children',
       'Stayed 4 nights', 'Double Room', 'Standard Double Room',
       'Superior Double Room', 'Family with older children',
       'Deluxe Double Room', 'Double or Twin Room', 'Stayed 5 nights',
       'Standard Double or Twin Room', 'Classic Double Room',
       'Superior Double or Twin Room', '2 rooms', 'Stayed 6 nights',
       'Standard Twin Room', 'Single Room', 'Twin Room', 'Stayed 7 nights',
       'Executive Double Room', 'Classic Double or Twin Room',
       'Superior Twin Room', 'Club Double Room', 'Deluxe Double or Twin Room',
       'Queen Room', 'Deluxe King Room', 'Superior Queen Room',
       'Standard Single Room', 'Junior Suite', 'Triple Room', 'Classic Room',
       'Superior Room', 'Superior King Room', 'Standard Room', 'Deluxe Room',
       'Double Guest Roo

In [319]:
print(enum['tags_n'][0:1])
len(enum['tags_n'][0:1])

0    Leisure trip
Name: tags_n, dtype: object


1

In [320]:
#for i in enum['tags_n']:
#    for k in len(i):
#        print(i[k])
#        k = k.lower()

In [321]:
display(set(enum['tags_n']))

{'Superior Double or Twin Room with Balcony 1 Adult',
 'Deluxe Queen Guestroom',
 'Travelers with friends',
 'Junior Deluxe Double Room',
 'Executive Triple Room',
 'Ambassadors Room',
 'Classic Twin or Double Room',
 'Comfort Double or Twin Room with Rambla View',
 'Deluxe Room with 1 Double Bed',
 'Wonderful Bank King',
 'Two Adjoining Superior Suite',
 'Double Room Large Heritage',
 'Executive Deluxe Double Room with Spa Access',
 'Premium Double or Twin Room',
 'Deluxe King Room with Garden View',
 'Standard Room with 1 Kingsize Bed',
 'Superior Twin Room with Tea Time Included',
 'Executive King Room Non Smoking with Executive Lounge Access',
 'Executive Vend me Triple Room',
 'Club Room with Club Lounge Access Free Wifi',
 'Park Deluxe Room',
 'Large Triple Room',
 'Sensation Room',
 'Superior Double or Twin Room with Terrace',
 'Club Twin Room',
 'Executive Room Free Executive Lounge Access',
 'Luxury Room Club Sofitel with Spa Access',
 'Premium Double Room with Balcony',
 'Mar

In [322]:
hotels['nights'] = 1
night_counter = pd.Series().astype(int)
counter = 0

for tags_listed in hotels['tags_n']:
    for i in tags_listed:
        if i[0:6] == 'Stayed':
            hotels['nights'][counter] = i[6:9]
    counter += 1

  night_counter = pd.Series().astype(int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotels['nights'][counter] = i[6:9]


In [323]:
#hotels['tags_filtered'] = enum['tags_n'].apply(lambda x: x if x in pop_tags else 'other')
#display(hotels['tags_filtered'])

In [325]:
hotels['nights'] = hotels['nights'].astype(int)
hotels['nights'].value_counts()

1    147262
2    100348
3     72010
4     35748
5     15611
6      7399
7      5549
8      1910
9       966
Name: nights, dtype: int64

In [326]:
hotels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386803 entries, 0 to 386802
Data columns (total 22 columns):
 #   Column                                      Non-Null Count   Dtype   
---  ------                                      --------------   -----   
 0   hotel_address                               386803 non-null  object  
 1   additional_number_of_scoring                386803 non-null  int64   
 2   review_date                                 386803 non-null  object  
 3   average_score                               386803 non-null  int32   
 4   hotel_name                                  386803 non-null  object  
 5   reviewer_nationality                        386803 non-null  object  
 6   negative_review                             386803 non-null  object  
 7   review_total_negative_word_counts           386803 non-null  int64   
 8   total_number_of_reviews                     386803 non-null  int64   
 9   positive_review                             386803 non-null

In [327]:
hotels.drop(['lat','lng', 'days_since_review', 'review_date','hotel_country', 'review_words_proportion', 'is_homeland', 'nights'], axis=1, inplace=True)
# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели  
# Х - данные с информацией об отелях, у - целевая переменная (рейтинги отелей)
X = hotels.drop(['reviewer_score'], axis = 1)  
Y = hotels['reviewer_score']

In [328]:
num_cols = ['total_numbers_of_reviews', 'review_total_negative_word_count','additional_number_of_scoring']

cat_cols = ['average_score']

In [329]:
Y = Y.astype('int')


from sklearn.feature_selection import chi2 # хи-квадрат

#imp_cat = pd.Series(chi2(X[cat_cols], Y)[0], index=cat_cols)
#imp_cat.sort_values(inplace = True)
#imp_cat.plot(kind = 'barh')

In [330]:
# Загружаем специальный инструмент для разбивки:  
from sklearn.model_selection import train_test_split  

In [331]:
object_columns = [s for s in X.columns if X[s].dtypes == 'object']
print(object_columns)
X.drop(object_columns, axis = 1, inplace=True)
#X.drop('review_date',axis=1, inplace=True)

['hotel_address', 'hotel_name', 'reviewer_nationality', 'negative_review', 'positive_review', 'tags', 'tags_n']


In [332]:
# Наборы данных с меткой "train" будут использоваться для обучения модели, "test" - для тестирования.  
# Для тестирования мы будем использовать 25% от исходного датасета.  
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [333]:
X_train.head(5)

Unnamed: 0,additional_number_of_scoring,average_score,review_total_negative_word_counts,total_number_of_reviews,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given
22852,488,18,43,4645,55,13
67350,234,5,6,1884,9,1
95937,950,22,2,3486,2,1
352230,285,23,7,3181,5,4
73517,103,28,6,786,0,2


In [334]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 290102 entries, 22852 to 121958
Data columns (total 6 columns):
 #   Column                                      Non-Null Count   Dtype
---  ------                                      --------------   -----
 0   additional_number_of_scoring                290102 non-null  int64
 1   average_score                               290102 non-null  int32
 2   review_total_negative_word_counts           290102 non-null  int64
 3   total_number_of_reviews                     290102 non-null  int64
 4   review_total_positive_word_counts           290102 non-null  int64
 5   total_number_of_reviews_reviewer_has_given  290102 non-null  int64
dtypes: int32(1), int64(5)
memory usage: 14.4 MB


 0   additional_number_of_scoring                96701 non-null  int64

 1   average_score                               96701 non-null  float64

 2   review_total_negative_word_counts           96701 non-null  int64

 3   total_number_of_reviews                     96701 non-null  int64

 4   review_total_positive_word_counts           96701 non-null  int64
 
 5   total_number_of_reviews_reviewer_has_given  96701 non-null  int64

 MAPE: 0.17100510492941573

MAPE: 0.1714944134645002

MAPE: 0.16508613530687957

In [335]:
# Импортируем необходимые библиотеки:  
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели  
from sklearn import metrics # инструменты для оценки точности модели  
  
# Создаём модель  
regr = RandomForestRegressor(n_estimators=100)  
      
# Обучаем модель на тестовом наборе данных  
regr.fit(X_train, y_train)  
      
# Используем обученную модель для предсказания рейтинга отелей в тестовой выборке.  
# Предсказанные значения записываем в переменную y_pred  
y_pred = regr.predict(X_test)  


In [336]:
# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они отличаются  
# Метрика называется Mean Absolute Percentage Error (MAPE) и показывает среднюю абсолютную процентную ошибку предсказанных значений от фактических.  
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_pred))

MAPE: 0.168007387897149


Небольшой бонус:


In [337]:
# # убираем признаки которые еще не успели обработать, 
# # модель на признаках с dtypes "object" обучаться не будет, просто выберим их и удалим
# object_columns = [s for s in hotels.columns if hotels[s].dtypes == 'object']
# hotels.drop(object_columns, axis = 1, inplace=True)

# # заполняем пропуски самым простым способом
# hotels = hotels.fillna(0)