## Обучение модели

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# импортируем данные для обучения 
train = pd.read_csv('/Users/anastasiamyskina/Downloads/IT/train_vec.csv')
test = pd.read_csv('/Users/anastasiamyskina/Downloads/IT/test_vec.csv')

In [3]:
train_copy = train.copy()
train_copy=train_copy.drop(columns = ['score'], axis = 1) 

In [4]:
# разделим на train/test
X_train, X_test, y_train, y_test = train_test_split(train_copy, train['score'], test_size=.15, random_state=42)

In [5]:
from sklearn.metrics import accuracy_score, f1_score, ndcg_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from collections import Counter
from sklearn.preprocessing import LabelBinarizer
from sklearn import preprocessing


#### Выбор целевой метрики
Будем смотреть на метрику NDCG (Normalized Discounted Cumulative Gain), она хороша тем, что позволяет оценивать качество персонализированных рекомендаций. Она учитывает не только релевантность рекомендованных элементов, но и их порядок в списке рекомендаций.

### Проведение экспериментов

In [6]:
def modeling(model, X_train, y_train, X_test) -> None:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    lb = preprocessing.LabelBinarizer()
    y_test_bin = lb.fit_transform(y_test)
    y_pred_bin = lb.fit_transform(y_pred)
    print(report)
    print("NDCG метрика:")
    print(ndcg_score(y_test_bin, y_pred_bin))

#### Логистическая регрессия

In [7]:
modeling(LogisticRegression(random_state=42), X_train, y_train, X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

         0.0       0.37      0.45      0.41     13155
         1.0       0.23      0.15      0.18     13244
         2.0       0.22      0.11      0.15     13261
         3.0       0.21      0.03      0.05     13418
         4.0       0.26      0.65      0.37     13003

    accuracy                           0.28     66081
   macro avg       0.26      0.28      0.23     66081
weighted avg       0.26      0.28      0.23     66081

NDCG метрика:
0.6289632730211002


#### Решающее дерево

In [8]:
tree = DecisionTreeClassifier(random_state=42)

In [9]:
modeling(tree, X_train, y_train, X_test)

              precision    recall  f1-score   support

         0.0       0.27      0.27      0.27     13155
         1.0       0.21      0.21      0.21     13244
         2.0       0.20      0.20      0.20     13261
         3.0       0.21      0.21      0.21     13418
         4.0       0.23      0.24      0.24     13003

    accuracy                           0.23     66081
   macro avg       0.23      0.23      0.23     66081
weighted avg       0.23      0.23      0.23     66081

NDCG метрика:
0.6026286298950798


#### Бустинг

In [10]:
boost = GradientBoostingClassifier()

In [11]:
modeling(boost, X_train, y_train, X_test)

              precision    recall  f1-score   support

         0.0       0.34      0.55      0.42     13155
         1.0       0.23      0.16      0.19     13244
         2.0       0.22      0.13      0.16     13261
         3.0       0.23      0.12      0.16     13418
         4.0       0.29      0.46      0.35     13003

    accuracy                           0.28     66081
   macro avg       0.26      0.28      0.26     66081
weighted avg       0.26      0.28      0.26     66081

NDCG метрика:
0.63137708793775


#### Дерево решений

In [12]:
forest = RandomForestClassifier(n_estimators=30)

modeling(forest, X_train, y_train, X_test)

              precision    recall  f1-score   support

         0.0       0.31      0.37      0.34     13155
         1.0       0.21      0.21      0.21     13244
         2.0       0.21      0.20      0.20     13261
         3.0       0.22      0.20      0.21     13418
         4.0       0.25      0.24      0.25     13003

    accuracy                           0.24     66081
   macro avg       0.24      0.24      0.24     66081
weighted avg       0.24      0.24      0.24     66081

NDCG метрика:
0.6117250642433546


#### CatBoost

In [13]:
catboosting = CatBoostClassifier(metric_period=300)

In [14]:
modeling(catboosting, X_train, y_train, X_test)

Learning rate set to 0.107045
0:	learn: 1.5982775	total: 147ms	remaining: 2m 27s
300:	learn: 1.5422045	total: 11.6s	remaining: 27s
600:	learn: 1.5355908	total: 23.4s	remaining: 15.5s
900:	learn: 1.5296791	total: 36.1s	remaining: 3.96s
999:	learn: 1.5277700	total: 40.1s	remaining: 0us
              precision    recall  f1-score   support

         0.0       0.34      0.54      0.42     13155
         1.0       0.22      0.15      0.18     13244
         2.0       0.22      0.14      0.17     13261
         3.0       0.23      0.15      0.18     13418
         4.0       0.29      0.42      0.34     13003

    accuracy                           0.28     66081
   macro avg       0.26      0.28      0.26     66081
weighted avg       0.26      0.28      0.26     66081

NDCG метрика:
0.630399143630683


Лучше всего NDCG метрика оказалась у градиентного бустинга. Будем использовать его. Он позволяет снизить ошибку модели на каждой итерации, комбинируя решения базовых моделей (обычно деревьев решений). 

In [17]:
test=test.drop(columns = ['score'], axis = 1)

In [18]:
y_pred = boost.predict(test)

In [24]:
test['score']=y_pred
test.head()

Unnamed: 0,common_words,len_of_post,len_of_comm,len_of_text,ratio_post,post_text_svd,com_text_svd,score
0,1,8,16,24,0.125,0.003099,0.034419,3.0
1,0,8,10,18,0.0,0.003099,0.018142,4.0
2,1,8,20,28,0.125,0.003099,0.029268,2.0
3,2,8,25,33,0.25,0.003099,0.054525,2.0
4,0,8,4,12,0.0,0.003099,0.013465,4.0


In [22]:
#test.to_csv(r'/Users/anastasiamyskina/Downloads/IT/test_predict.csv', index= False )