In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

Загрузим данные и посмотрим на них.

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Тинькофф.Поколение/train.csv', encoding='ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0


In [4]:
np.unique(df['Hotel_name']).shape # название отеля - категориальный признак

(243,)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351 entries, 0 to 2350
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            2351 non-null   int64  
 1   Hotel_name    2351 non-null   object 
 2   Review_Title  2136 non-null   object 
 3   Review_Text   2351 non-null   object 
 4   Rating        2351 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 92.0+ KB


Мы решаем задачу регрессии. Метрика - MSE/RMSE.

# Preprocessing.

Разберемся с NaN.

In [6]:
df.isna().sum()

Id                0
Hotel_name        0
Review_Title    215
Review_Text       0
Rating            0
dtype: int64

In [7]:
df['Review_Title'].fillna('no description', inplace=True) # заполним Nan'ы

Выкинем столбец Id, он бесполезен.

In [8]:
df = df.drop('Id', axis=1)

## Обработаем текстовые признаки, feature engineering.

### Первичная обработка: lower-case и убираем знаки препинания.

In [9]:
import re
df['Review_Text'] = df['Review_Text'].apply(lambda x: ' '.join(re.findall(r'\w+', x.lower())))
df['Review_Title'] = df['Review_Title'].apply(lambda x: ' '.join(re.findall(r'\w+', x.lower())))

In [10]:
df.head()

Unnamed: 0,Hotel_name,Review_Title,Review_Text,Rating
0,Park Hyatt,refuge in chennai,excellent room and exercise facility all aroun...,80.0
1,Hilton Chennai,hilton chennai,very comfortable and felt safe staff were very...,100.0
2,The Royal Regency,no worth the rating shown in websites pricing ...,not worth the rating shown service is not good...,71.0
3,Rivera,good stay,first of all nice courteous staff only one con...,86.0
4,Park Hyatt,needs improvement,overall ambience of the hotel is very good in ...,86.0


### Обработка слов в текстовых признаках.

Нам предоставлены текстовые признаки на английском языке. С ними хорошо справится лемматизатор.

In [11]:
# Позаимствуем функции для лемматизации текста из семинарского ноутбука;)
from nltk import wordnet, pos_tag

def get_wordnet_pos(treebank_tag):
    my_switch = {
        'J': wordnet.wordnet.ADJ,
        'V': wordnet.wordnet.VERB,
        'N': wordnet.wordnet.NOUN,
        'R': wordnet.wordnet.ADV,
    }
    for key, item in my_switch.items():
        if treebank_tag.startswith(key):
            return item
    return wordnet.wordnet.NOUN

In [12]:
import nltk

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [13]:
from nltk import WordNetLemmatizer

def my_lemmatizer(sent):
    lemmatizer = WordNetLemmatizer()
    tokenized_sent = sent.split()
    pos_tagged = [(word, get_wordnet_pos(tag))
                 for word, tag in pos_tag(tokenized_sent)]
    return ' '.join([lemmatizer.lemmatize(word, tag)
                    for word, tag in pos_tagged])

Лемматизация.

In [14]:
df['Review_Text'] = df['Review_Text'].apply(my_lemmatizer) # лемматизируем и тексты, и заголовки
df['Review_Title'] = df['Review_Title'].apply(my_lemmatizer)

In [15]:
df.head()

Unnamed: 0,Hotel_name,Review_Title,Review_Text,Rating
0,Park Hyatt,refuge in chennai,excellent room and exercise facility all aroun...,80.0
1,Hilton Chennai,hilton chennai,very comfortable and felt safe staff be very h...,100.0
2,The Royal Regency,no worth the rating show in website price be ok,not worth the rating show service be not good ...,71.0
3,Rivera,good stay,first of all nice courteous staff only one con...,86.0
4,Park Hyatt,need improvement,overall ambience of the hotel be very good in ...,86.0


### Очищаем данные от stopwords.

Удалим филлерные слова, они не дают нам никакой информации и только мешают.

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
from nltk.corpus import stopwords
sw_eng = set(stopwords.words('english'))

In [18]:
# выкидываем все филлеры
df['Review_Text'] = df['Review_Text'].apply(lambda x: ' '.join([word for word in x.split() if not word in sw_eng]))
df['Review_Title'] = df['Review_Title'].apply(lambda x: ' '.join([word for word in x.split() if not word in sw_eng]))

In [19]:
df.head()

Unnamed: 0,Hotel_name,Review_Title,Review_Text,Rating
0,Park Hyatt,refuge chennai,excellent room exercise facility around atmosp...,80.0
1,Hilton Chennai,hilton chennai,comfortable felt safe staff helpful respectful...,100.0
2,The Royal Regency,worth rating show website price ok,worth rating show service good room well maint...,71.0
3,Rivera,good stay,first nice courteous staff one con stay time c...,86.0
4,Park Hyatt,need improvement,overall ambience hotel good room facility need...,86.0


Well Done! Наконец-то нормальный читаемый текст!

### Подсчет pos/neg слов.

Интересно и полезно было бы посчитать количество слов с позитивной/негативной окраской в каждом тексте и получить новые признаки. Тем более нам предоставлены файлы - словари.

In [20]:
# Напишем функции для подсчета количества положительных и негативных слов
# ищем вхождения в предоставленные файлы
def count_pos(text):
  cnt_pos = 0
  for word in re.findall(r'[^,.;\s]+', text.lower()): # парсим данные регулярными выражениями
    with open('/content/drive/MyDrive/Тинькофф.Поколение/positive-words.txt') as f:
      if word in f.read().split():
        cnt_pos += 1
  return cnt_pos

def count_neg(text):
  cnt_neg = 0
  for word in re.findall(r'[^,.;\s]+', text.lower()):
    with open('/content/drive/MyDrive/Тинькофф.Поколение/negative-words.txt', encoding="ISO-8859-1") as g:
      if word in g.read().split():
        cnt_neg += 1

  return cnt_neg

In [21]:
df['Pos_Words'] = df['Review_Text'].apply(count_pos) # Введем новые признаки - количество позитивных и негативных слов
df['Neg_Words'] = df['Review_Text'].apply(count_neg)

In [22]:
df.head()

Unnamed: 0,Hotel_name,Review_Title,Review_Text,Rating,Pos_Words,Neg_Words
0,Park Hyatt,refuge chennai,excellent room exercise facility around atmosp...,80.0,4,0
1,Hilton Chennai,hilton chennai,comfortable felt safe staff helpful respectful...,100.0,4,0
2,The Royal Regency,worth rating show website price ok,worth rating show service good room well maint...,71.0,8,1
3,Rivera,good stay,first nice courteous staff one con stay time c...,86.0,4,2
4,Park Hyatt,need improvement,overall ambience hotel good room facility need...,86.0,2,0


## Представление текста.

In [23]:
y = df['Rating']

### Count Vectorizer.

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
text_cv = cv.fit_transform(df['Review_Text'])
title_cv = cv.fit_transform(df['Review_Title'])

In [25]:
df_cv = df.drop(['Review_Title', 'Review_Text', 'Rating'], axis=1)

In [26]:
df_cv = pd.concat([df_cv, pd.DataFrame(text_cv.toarray())], axis=1) # добавляем эмбеддинги в конец датасета
df_cv = pd.concat([df_cv, pd.DataFrame(title_cv.toarray())], axis=1)
df_cv.columns = np.arange(df_cv.shape[1])

In [27]:
df_cv.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4532,4533,4534,4535,4536,4537,4538,4539,4540,4541,4542,4543,4544,4545,4546,4547,4548,4549,4550,4551,4552,4553,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571
0,Park Hyatt,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Hilton Chennai,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,The Royal Regency,8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,Rivera,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Park Hyatt,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### TF-IDF.

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
text_tf = tfidf.fit_transform(df['Review_Text'])
title_tf = tfidf.fit_transform(df['Review_Title'])

In [29]:
df_tf = df.drop(['Review_Title', 'Review_Text', 'Rating'], axis=1)

In [30]:
df_tf = pd.concat([df_tf, pd.DataFrame(text_tf.toarray())], axis=1) # добавляем эмбеддинги в конец датасета
df_tf = pd.concat([df_tf, pd.DataFrame(title_tf.toarray())], axis=1)
df_tf.columns = np.arange(df_tf.shape[1])

In [31]:
df_tf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4532,4533,4534,4535,4536,4537,4538,4539,4540,4541,4542,4543,4544,4545,4546,4547,4548,4549,4550,4551,4552,4553,4554,4555,4556,4557,4558,4559,4560,4561,4562,4563,4564,4565,4566,4567,4568,4569,4570,4571
0,Park Hyatt,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Hilton Chennai,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,The Royal Regency,8,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.478205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Rivera,4,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Park Hyatt,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Word2Vec

In [32]:
import io
from tqdm import tqdm
from itertools import islice

def load_vectors(fname, limit): # воспользуемся функциями из конспектов:)
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in tqdm(islice(fin, limit), total=limit):
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    return data

In [33]:
def text2vec(text):
    words = text.split()
    if len(words) == 0:
      words = ['no', 'description']
      
    return sum(list(map(lambda w: np.array(list(vecs.get(w, zero))), words))) / len(words)

In [34]:
vecs = load_vectors('/content/drive/MyDrive/Тинькофф.Поколение/crawl-300d-2M.vec', 2000000)
zero = sum(vecs.values()) / len(vecs)

100%|█████████▉| 1999995/2000000 [03:39<00:00, 9094.39it/s] 


In [93]:
text_w = np.array([list(text2vec(row[1]['Review_Text'])) for row in df.iterrows()]) # получаем массив эмбеддингов
title_w = np.array([list(text2vec(row[1]['Review_Title'])) for row in df.iterrows()])

In [94]:
df_w = df.drop(['Review_Title', 'Review_Text', 'Rating'], axis=1)

In [95]:
df_w = pd.concat([df_w, pd.DataFrame(text_w)], axis=1) # добавляем эмбеддинги в конец датасета
df_w = pd.concat([df_w, pd.DataFrame(title_w)], axis=1)
df_w.columns = np.arange(df_w.shape[1])

In [38]:
df_w.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602
0,Park Hyatt,4,0,-0.014004,-0.101965,-0.026465,0.025152,-0.1276,-0.007117,-0.062278,0.057674,0.08513,0.00963,-0.090717,0.185596,-0.148783,0.021091,-0.070617,-0.038048,0.154661,-0.024004,-0.103322,0.060804,0.05273,-0.003935,-0.063787,-0.030743,-0.046135,-0.121448,0.058326,0.06217,-0.018583,-0.030626,-0.034652,0.02743,0.009557,-0.20637,0.112361,-0.051543,0.091639,...,0.1024,-0.0039,0.14065,0.21305,0.0795,0.2315,-0.07855,-0.13705,-0.15135,-0.3828,-0.19155,0.31185,-0.06515,0.36125,-0.23885,0.2403,0.0401,0.0408,-0.46865,0.08305,0.2052,0.32855,0.1514,0.23185,-0.2801,-0.0295,-0.2849,0.0245,-0.3082,-0.24665,-0.05605,0.1324,0.0542,0.07285,0.3067,0.115,-0.2058,-0.28315,-0.04335,-0.00345
1,Hilton Chennai,4,0,0.01769,-0.055755,-0.011997,0.011497,-0.171596,-0.016755,-0.16551,0.013907,0.047523,0.052247,-0.025587,0.189842,-0.014791,-0.023137,-0.028009,-0.014579,0.070435,-0.014351,0.056497,0.019177,-0.020679,-0.004656,0.027301,-0.018829,-0.152059,-0.139388,0.047258,-0.056679,-0.044086,0.085676,-0.081306,-0.069479,0.008501,-0.189087,0.079137,-0.056556,0.11393,...,-0.09785,-0.345,-0.0199,-0.0511,-0.27425,0.17775,-0.09225,-0.3523,-0.4223,-0.36325,-0.04845,0.1634,0.235,0.48345,-0.01305,0.3376,0.02325,-0.029,-0.06805,0.5393,0.2432,-0.0064,-0.2806,0.12595,-0.30255,0.0023,-0.0074,0.3668,-0.0231,-0.22955,0.0137,0.50715,-0.05795,0.3113,0.23315,0.15535,-0.17905,-0.10455,-0.1001,-0.10275
2,The Royal Regency,8,1,-0.072474,0.006027,-0.053873,0.04357,-0.162497,-0.028737,-0.089817,0.041387,0.113736,-0.078925,0.014329,0.227682,-0.072712,-0.009276,-0.091727,-0.038962,0.023661,0.010412,0.014744,0.023122,0.060427,-0.047627,0.031286,-0.009167,-0.073775,-0.114346,0.002885,0.088048,-0.00657,-0.026839,-0.074125,0.02582,-0.014724,-0.237949,0.077551,-0.060752,0.108247,...,0.085533,-0.0709,0.063233,-0.059183,0.002367,0.0879,0.058017,0.0197,-0.01965,-0.01725,0.121867,-0.117567,-0.002,-0.1261,0.05495,0.076383,-0.037017,0.011233,0.173283,0.103167,0.067983,-0.00745,-0.041783,-0.106767,-0.094167,-0.073083,0.025517,0.153117,-0.125917,-0.020883,-0.1009,0.017083,-0.107033,-0.015367,-0.169,0.034017,-0.076383,0.0792,-0.042883,0.001283
3,Rivera,4,2,-0.0105,-0.111319,-0.029138,0.02831,-0.13619,-0.000314,-0.045262,0.117424,0.008257,-0.076786,-0.049867,0.170467,-0.032414,0.013138,-0.085476,-0.025219,0.10891,-0.04749,0.024938,0.05309,0.100224,0.037019,-0.041071,-0.109467,-0.104995,-0.090395,0.07279,-0.018248,-0.016167,0.107676,-0.021324,-0.012443,0.050124,-0.207795,0.101719,-0.102695,0.057981,...,0.04905,-0.04435,-0.01915,0.00985,-0.1566,0.0274,-0.1172,-0.0847,-0.00435,-0.0611,0.0157,-0.1206,-0.08115,0.01595,0.08765,0.06055,0.1637,-0.1206,-0.10275,-0.06115,0.1424,0.07835,-0.0072,0.1039,-0.1955,-0.0551,0.0786,0.09135,0.1778,-0.12485,0.0483,0.0262,-0.0848,-0.0686,0.09405,-0.15275,0.0167,0.08585,-0.14585,0.0253
4,Park Hyatt,2,0,0.001365,0.005165,-0.023358,0.081638,-0.091715,-0.025708,-0.125777,0.028123,0.106769,0.014408,-0.079131,0.178465,-0.087431,-0.004258,-0.051412,-0.012996,0.074431,-0.002158,-0.055008,0.068381,0.044135,-0.039885,-0.000215,-0.014677,-0.081842,-0.034381,0.000346,0.073688,0.012531,-0.079831,-0.080938,-0.07645,-0.076227,-0.145415,0.048608,-0.025023,0.015785,...,0.06325,-0.1345,0.2919,-0.02565,0.1183,-0.20855,-0.1015,-0.0729,-0.00645,0.0802,0.03215,-0.2208,-0.05015,0.27655,0.1509,0.0415,-0.1073,0.2573,-0.02315,0.1687,-0.0252,-0.01365,-0.079,-0.2207,0.1534,0.0146,0.0057,0.24945,0.05045,-0.08395,-0.0405,0.0241,0.0076,-0.13775,-0.0851,-0.1439,-0.04285,-0.19565,0.1941,-0.18525


# Модель, обучение и предсказание.

In [39]:
!pip install catboost # установим катбуст

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/47/80/8e9c57ec32dfed6ba2922bc5c96462cbf8596ce1a6f5de532ad1e43e53fe/catboost-0.25.1-cp37-none-manylinux1_x86_64.whl (67.3MB)
[K     |████████████████████████████████| 67.3MB 101kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.25.1


In [40]:
from sklearn.model_selection import train_test_split
 # все 3 полученных (для каждого представления текстов) датасета сплитим
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(df_cv, y, test_size=0.2, random_state=42)
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(df_tf, y, test_size=0.2, random_state=42)
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(df_w, y, test_size=0.2, random_state=42)

## CatBoost.

Count Vectorizer.

In [41]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

cbr = CatBoostRegressor(cat_features=[0], random_seed=42, verbose=False)
cbr.fit(X_train_cv, y_train_cv)
y_pred_cv = cbr.predict(X_test_cv)
print('MSE: {}'.format(mean_squared_error(y_test_cv, y_pred_cv)))
print('RMSE: {}'.format(mean_squared_error(y_test_cv, y_pred_cv, squared=False)))

MSE: 199.21628520848265
RMSE: 14.114399923782896


TF-IDF.

In [42]:
cbr = CatBoostRegressor(cat_features=[0], random_seed=42, verbose=False)
cbr.fit(X_train_tf, y_train_tf)
y_pred_tf = cbr.predict(X_test_tf)
print('MSE: {}'.format(mean_squared_error(y_test_tf, y_pred_tf)))
print('RMSE: {}'.format(mean_squared_error(y_test_tf, y_pred_tf, squared=False)))

MSE: 183.27807499645013
RMSE: 13.538023304620587


Word2Vec.

In [43]:
cbr = CatBoostRegressor(cat_features=[0], random_seed=42, verbose=False)
cbr.fit(X_train_w, y_train_w)
y_pred_w = cbr.predict(X_test_w)
print('MSE: {}'.format(mean_squared_error(y_test_w, y_pred_w)))
print('RMSE: {}'.format(mean_squared_error(y_test_w, y_pred_w, squared=False)))

MSE: 175.51253614423837
RMSE: 13.24811443731667


В статьях на Медиуме (например, в [этой](https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34)) для nlp classification активно используются такие модели, как Naive Bayes и SVM. Попробуем для задачи регрессии использовать соответствующие регрессоры.

In [45]:
df_cv_enc = pd.get_dummies(df_cv)
df_tf_enc = pd.get_dummies(df_tf)
df_w_enc = pd.get_dummies(df_w)

In [46]:
X_train_cv_enc, X_test_cv_enc, y_train_cv_enc, y_test_cv_enc = train_test_split(df_cv_enc, y, test_size=0.2, random_state=42)
X_train_tf_enc, X_test_tf_enc, y_train_tf_enc, y_test_tf_enc = train_test_split(df_tf_enc, y, test_size=0.2, random_state=42)
X_train_w_enc, X_test_w_enc, y_train_w_enc, y_test_w_enc = train_test_split(df_w_enc, y, test_size=0.2, random_state=42)

## Bayesian Ridge, Support Vector Regression, SGD Classifier.

Count Vectorizer.

In [56]:
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.linear_model import SGDClassifier

for name in [BayesianRidge, SVR, SGDClassifier]:
  model = name()
  print(model)
  model.fit(X_train_cv_enc, y_train_cv_enc)
  y_pred_cv_enc = model.predict(X_test_cv_enc)
  print('\nMSE: {}'.format(mean_squared_error(y_test_cv_enc, y_pred_cv_enc)))
  print('RMSE: {}\n'.format(mean_squared_error(y_test_cv_enc, y_pred_cv_enc, squared=False)))

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)

MSE: 216.89620894870026
RMSE: 14.727396543472993

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

MSE: 314.76528867742667
RMSE: 17.741625874688786

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

MSE: 279.86411889596604
RMSE: 16.729139813390468



TF-IDF.

In [54]:
for name in [BayesianRidge, SVR, SGDClassifier]:
  model = name()
  print(model)
  model.fit(X_train_tf_enc, y_train_tf_enc)
  y_pred_tf_enc = model.predict(X_test_tf_enc)
  print('\nMSE: {}'.format(mean_squared_error(y_test_tf_enc, y_pred_tf_enc)))
  print('RMSE: {}\n'.format(mean_squared_error(y_test_tf_enc, y_pred_tf_enc, squared=False)))

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)

MSE: 196.209815868169
RMSE: 14.007491419528659

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

MSE: 324.92088437047005
RMSE: 18.025561971003015

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

MSE: 270.7515923566879
RMSE: 16.454531058547



Word2Vec.

In [57]:
for name in [BayesianRidge, SVR, SGDClassifier]:
  model = name()
  print(model)
  model.fit(X_train_w_enc, y_train_w_enc)
  y_pred_w_enc = model.predict(X_test_w_enc)
  print('\nMSE: {}'.format(mean_squared_error(y_test_w_enc, y_pred_w_enc)))
  print('RMSE: {}\n'.format(mean_squared_error(y_test_w_enc, y_pred_w_enc, squared=False)))

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)

MSE: 223.71053429252595
RMSE: 14.95695605036419

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

MSE: 309.2319401542456
RMSE: 17.58499190088655

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

MSE: 315.9447983014862
RMSE: 17.774836097739023



# Заключительная часть.

## Модель.

После всех проведенных испытаний стало ясно, что лучше всех справилась модель `CatBoostRegressor` со всеми видами представления данных. Немного хуже сработал `BayesianRidge`, о `SVR` и `SGDClassifier` лучше умолчать:)

## Представление текстовых данных.

`CatBoostRegressor` показал лучший результат при работе с `Word2Vec`. `BayesianRidge` лучше всего сработал, наоборот, с `TF-IDF`, а с `Word2Vec` показал худший результат. Остальные модели далеко позади.

## Вывод.

Таким образом, фаворит сегодняшнего вечера - `CatBoostRegressor` с представлением текстовых данных с помощью `Word2Vec`.

Результат:

MSE: 175.51253614423837

RMSE: 13.24811443731667

## Предсказание на тестовой части.

In [103]:
df_test = pd.read_csv('/content/drive/MyDrive/Тинькофф.Поколение/test.csv', encoding='ISO-8859-1')

In [104]:
id_col = df_test['Id']

In [105]:
df_test.isna().sum() # проверим на Nan'ы

Id                0
Hotel_name        0
Review_Title    209
Review_Text       0
dtype: int64

In [78]:
df_test['Review_Title'].fillna('no description', inplace=True) # заполним Nan'ы

In [79]:
df_test['Review_Text'] = df_test['Review_Text'].apply(lambda x: ' '.join(re.findall(r'\w+', x.lower()))) # lower-case, очищаем от знаков
df_test['Review_Title'] = df_test['Review_Title'].apply(lambda x: ' '.join(re.findall(r'\w+', x.lower())))

In [80]:
df_test['Review_Text'] = df_test['Review_Text'].apply(my_lemmatizer) # лемматизируем и тексты, и заголовки
df_test['Review_Title'] = df_test['Review_Title'].apply(my_lemmatizer)

In [81]:
# выкидываем все филлеры
df_test['Review_Text'] = df_test['Review_Text'].apply(lambda x: ' '.join([word for word in x.split() if not word in sw_eng]))
df_test['Review_Title'] = df_test['Review_Title'].apply(lambda x: ' '.join([word for word in x.split() if not word in sw_eng]))

In [82]:
df_test['Pos_Words'] = df_test['Review_Text'].apply(count_pos) # Введем новые признаки - количество позитивных и негативных слов
df_test['Neg_Words'] = df_test['Review_Text'].apply(count_neg)

In [83]:
text_w = np.array([list(text2vec(row[1]['Review_Text'])) for row in df_test.iterrows()]) # получаем массив эмбеддингов
title_w = np.array([list(text2vec(row[1]['Review_Title'])) for row in df_test.iterrows()])

In [84]:
df_test = df_test.drop(['Review_Title', 'Review_Text', 'Id', 'Hotel_name'], axis=1) # названия отелей нам, впринципе, и не нужны
df_w = df_w.drop([0], axis=1)

In [85]:
df_test = pd.concat([df_test, pd.DataFrame(text_w)], axis=1) # добавляем эмбеддинги в конец датасета
df_test = pd.concat([df_test, pd.DataFrame(title_w)], axis=1)
df_test.columns = np.arange(df_test.shape[1])
df_w.columns = np.arange(df_w.shape[1])

In [86]:
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,562,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601
0,5,3,-0.085957,-0.0883,-0.025782,-0.0186,-0.057546,0.032532,-0.029811,-0.013036,-0.093857,-0.064461,-0.029739,0.226093,-0.046457,-0.030007,-0.046825,-0.050961,0.049625,-0.001243,0.006089,0.00225,0.002264,-0.041511,-0.014182,-0.053882,-0.089282,-0.036746,0.045082,0.121554,-0.025454,0.085254,-0.059668,-0.008289,0.018918,-0.101136,0.029146,-0.05325,0.072996,-0.119446,...,-0.23405,-0.22525,0.08885,-0.22845,-0.20435,-0.26395,-0.0485,0.0621,-0.4841,0.10095,0.21775,0.1764,0.14145,0.39795,0.06685,0.0805,-0.2576,0.4475,-0.2646,0.2998,0.1112,-0.11115,0.1345,-0.058,-0.1048,0.0805,0.0303,0.2035,0.1484,0.1304,0.09835,0.3766,-0.28645,0.28195,0.19745,0.1261,-0.1071,-0.2151,-0.2478,0.41985
1,2,0,0.021571,0.018029,-0.0928,-0.125529,-0.138771,-0.042043,-0.226329,0.222857,0.120986,-0.099057,0.011271,0.066971,0.115329,0.003014,-0.056357,0.013529,0.094714,-0.054971,0.1065,0.098829,0.012086,-0.0157,0.109914,0.018757,-0.035286,-0.140814,0.097057,0.037371,-0.050657,0.103271,-0.118914,0.049114,0.055314,-0.2401,0.119286,-0.125043,0.114043,0.019057,...,-0.1483,0.0418,-0.1664,0.1346,0.6086,0.1988,-0.0404,-0.1073,0.1719,0.1904,0.0034,-0.1169,0.0777,0.0088,0.1079,0.1031,0.0419,0.3609,0.0821,0.0957,-0.5945,0.1201,-0.06,0.1217,-0.0158,-0.2193,0.172,-0.2029,0.0287,-0.2593,0.0252,-0.1689,-0.1366,-0.4182,0.168,0.0267,-0.0217,0.3358,-0.0696,0.1499
2,5,0,-0.015844,-0.110565,-0.022041,0.040091,-0.148826,-0.008109,-0.026632,0.080856,0.131488,0.017588,0.048562,0.171194,0.018112,-0.034329,-0.0222,-0.028235,0.042747,0.050412,-0.039671,-0.005576,0.046241,-0.032721,0.030553,-0.008741,-0.087935,-0.058018,0.064259,0.038982,-0.011335,-0.003462,-0.09405,-0.019882,0.037409,-0.162844,0.046691,-0.062844,0.027715,0.024765,...,0.0206,-0.018075,0.026825,-0.019525,-0.0594,-0.0691,-0.036675,-0.158275,-0.11285,-0.05705,0.011175,0.1725,-0.0907,0.047425,0.1661,0.08165,0.030775,-0.028175,-0.099975,-0.07075,0.0208,0.10385,-0.018875,0.013925,-0.0893,-0.084075,-0.078525,-0.028075,0.01565,-0.070775,-0.10315,-0.04415,0.03955,0.054175,0.087875,-0.1239,-0.08835,0.02435,-0.08605,0.037475
3,1,1,0.071125,0.078262,-0.075513,-0.09105,-0.248125,-0.016412,-0.138887,0.279375,0.205263,-0.2076,-0.027463,0.195462,-0.008175,-0.118462,0.028563,-0.052725,0.00755,-0.086525,0.06925,-0.008213,0.039675,-0.072413,0.019575,-0.059987,-0.039337,-0.000525,0.001637,0.055962,-0.072337,0.143375,-0.017675,0.004963,-0.034438,-0.273525,0.087038,0.029075,-0.0383,-0.067575,...,0.1041,-0.15385,-0.0067,-0.1261,-0.07365,0.03085,-0.032,-0.18785,-0.05725,0.0276,0.22835,0.24125,0.17575,0.19435,0.14865,0.1277,-0.0638,0.0587,0.09745,0.16105,0.35305,0.0436,-0.19135,-0.28135,-0.002,0.0619,0.15625,0.08115,-0.22435,-0.19885,-0.13305,0.18075,-0.08995,-0.025,-0.18995,0.16465,0.0467,0.59165,-0.1084,0.0384
4,10,0,0.022224,0.01951,-0.002221,0.01051,-0.140479,-0.003862,-0.135876,0.070652,0.059483,-0.047155,-0.059648,0.213624,-0.032821,0.027679,-0.044538,-0.009407,0.014397,-0.035172,-0.011345,0.057462,0.066017,-0.033655,-0.018583,0.009924,-0.14031,-0.155038,0.071472,0.058821,-0.02551,-0.010383,-0.134469,-0.014331,0.048431,-0.187793,0.026666,-0.129172,0.062845,-0.097545,...,0.01902,-0.26228,-0.01084,0.07594,-0.04174,0.03624,0.04912,-0.0198,-0.2041,0.0251,-0.08064,0.14792,-0.033,0.05746,0.1113,0.02492,-0.17808,-0.04504,-0.00166,0.0552,0.06596,0.18246,0.05426,-0.04836,0.01518,-0.11624,-0.07998,0.01242,-0.16728,0.00234,0.00236,0.15584,-0.1058,-0.03882,-0.11984,0.06764,-0.10144,0.2142,-0.06212,-0.03928


In [102]:
cbr = CatBoostRegressor(random_seed=42, verbose=False) # обучение на лучшей модели
cbr.fit(df_w, y)
y_pred = cbr.predict(df_test)

In [106]:
submit = pd.DataFrame() # предсказание на тесте
submit['Id'] = id_col
submit['Rating'] = y_pred
submit.to_csv('submit.csv', index = False)