In [1]:
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,data-story-long,rating,meta-rating,author_id,comments,dtm,views,saves,author_name,title,tags,text_lem,text_stem
0,0,7337284,True,7727.0,7884:157,1002684,479,2020-04-02 15:12:18+03:00,,181,ksenobianinSanta,Обычная история об обычном негодяе,"['Коронавирус', 'Безответственность', 'Длинноп...",столичный ресторатор илья эстер сказочно прово...,столичн ресторатор ил эстер сказочн провод 8 м...
1,1,7337282,True,15.0,25:10,2008181,6,2020-04-02 15:11:33+03:00,,4,Merey202,Гэбриел Бирн,"['Биография', 'Ирландия', 'Актеры и актрисы', ...",ирландец нести черный пес печаль плечо бе...,ирландец нест черн пес печал плеч бешен нрав г...
2,2,7337254,True,28.0,39:11,2522374,2,2020-04-02 15:01:49+03:00,,11,Cat.Cat,Ещё немного о трибунате в Римской империи,"['[моё]', 'Cat_cat', 'История', 'Рим', 'Римска...",обсуждение пикабу пост особенность заклады...,обсужден пикаб пост особен закладыва цезар авг...
3,3,7337245,True,6.0,18:12,3073815,13,2020-04-02 14:58:46+03:00,,0,vitaminys2,Эд Бакканский. Враг-государство,"['Украина', 'Коронавирус', 'Государство', 'Дли...",сегодня видеть пол нервно озираться сторо...,сегодн видет пол нервн озира сторон торгова до...
4,4,7337246,True,21.0,28:7,3156206,8,2020-04-02 14:58:20+03:00,,5,ZloyElvis,Кнопки для проведения викторины,"['[моё]', 'Arduino', 'Электроника', 'Самоделки...",приветствовать данный длиннопост изготовлени...,приветствова дан длинноп изготовлен оборудован...


### 1. Лемматизация

In [4]:
df_lem = df[['text_lem', 'rating']].dropna()

In [5]:
X, y = df_lem['text_lem'], df_lem['rating']

In [6]:
vectorizer = CountVectorizer(max_features=1000, min_df=5, max_df=0.7)
X_countVectorizer = vectorizer.fit_transform(X).toarray()
X_countVectorizer.shape

(132849, 1000)

In [7]:
tfidfconverter = TfidfTransformer()
X_tfIdf = tfidfconverter.fit_transform(X_countVectorizer).toarray()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_tfIdf, y, test_size=0.2, random_state=0)

#### 1. 1. Линейная регрессия

In [45]:
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [46]:
mean_squared_error(y_test, y_pred)

1160901.8876754607

In [47]:
mean_absolute_error(y_test, y_pred)

407.3855941394469

#### 1. 2. XGBoost

In [48]:
model = XGBRegressor()
model.fit(X_train, y_train, verbose=False)
y_pred_xgboost = model.predict(X_test)

In [49]:
mean_squared_error(y_test, y_pred_xgboost)

1231175.5079510491

In [50]:
mean_absolute_error(y_test, y_pred_xgboost)

406.3181802848014

#### 1.3. CatBoost

In [9]:
model = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=8, verbose=False)
model.fit(X_train, y_train)
y_pred_catboost = model.predict(X_test)

In [10]:
mean_squared_error(y_test, y_pred_catboost)

1170181.4235929842

In [11]:
mean_absolute_error(y_test, y_pred_catboost)

397.4827944395817

In [12]:
model_name = "catBoost.model"
model.save(model_name)

AttributeError: 'CatBoostRegressor' object has no attribute 'save'

#### 1.4. RandomForestRegressor

In [54]:
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train, y_train)
y_pred_forest = regr.predict(X_test)

In [55]:
mean_squared_error(y_test, y_pred_forest)

1191685.9951006488

In [56]:
mean_absolute_error(y_test, y_pred_forest)

408.5088306363602

### 2. Стемминг

In [58]:
df_stem = df[['text_stem', 'rating']].dropna()

In [59]:
X, y = df_stem['text_stem'], df_stem['rating']

In [60]:
vectorizer = CountVectorizer(max_features=1000, min_df=5, max_df=0.7)
X_countVectorizer = vectorizer.fit_transform(X).toarray()
X_countVectorizer.shape

(132849, 1000)

In [61]:
tfidfconverter = TfidfTransformer()
X_tfIdf = tfidfconverter.fit_transform(X_countVectorizer).toarray()

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X_tfIdf, y, test_size=0.2, random_state=0)

#### 2.1.  Линейная регрессия

In [63]:
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [74]:
mean_squared_error(y_test, y_pred)

1158799.2437284607

In [64]:
mean_absolute_error(y_test, y_pred)

407.87784601666476

#### 2. 2. XGBoost

In [65]:
model = XGBRegressor()
model.fit(X_train, y_train, verbose=False)
y_pred_xgboost = model.predict(X_test)

In [66]:
mean_squared_error(y_test, y_pred_xgboost)

1212391.9091024338

In [67]:
mean_absolute_error(y_test, y_pred_xgboost)

403.4024438374712

#### 2.3. CatBoost

In [68]:
model.fit(X_train, y_train)
y_pred_catboost = model.predict(X_test)

In [69]:
mean_squared_error(y_test, y_pred_catboost)

1168818.6541218676

In [70]:
mean_absolute_error(y_test, y_pred_catboost)

397.71569585067414

#### 2.4. RandomForestRegressor

In [71]:
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train, y_train)
y_pred_forest = regr.predict(X_test)

In [72]:
mean_squared_error(y_test, y_pred_forest)

1191177.760735872

In [73]:
mean_absolute_error(y_test, y_pred_forest)

408.2917829928213