# 3. Modeling

`from sklearn.models import sdelat_horoscho`

Сначала загрузим итоговый датасет:

In [34]:
from pathlib import Path

import pandas as pd

SEED = 42
DATA_DIR = Path("../preprocessed_data")

df = pd.read_csv(DATA_DIR / "augmented_merged.csv")

In [35]:
target_names = [col for col in df.columns if col != "rate_name"]
target_names

['class',
 'quality',
 'bathroom',
 'bedding',
 'capacity',
 'club',
 'balcony',
 'view']

Разобъём датасет на обучающую и тестовую выборки:

In [40]:
from sklearn.model_selection import train_test_split


def split_data(
    df_to_split: pd.DataFrame, target_name: str, test_size=0.2, random_state=SEED
):
    X = df_to_split["rate_name"]
    y = df_to_split[target_name]

    return train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

Сделаем sklearn pipeline для векторизации и классификации:

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report


def get_pipeline(classifier):
    return Pipeline(
        [
            (
                "tfidf",
                TfidfVectorizer(
                    min_df=2,
                    sublinear_tf=True,
                    stop_words="english",
                ),
            ),
            ("clf", classifier),
        ]
    )


def train_classifier(classifier, X_train, y_train):
    pipeline = get_pipeline(classifier)
    return pipeline.fit(X_train, y_train)


def evaluate_pipeline(pipeline, X_test, y_test):
    y_pred = pipeline.predict(X_test)
    score = f1_score(y_test, y_pred, average="macro")
    print(f"Metrics report for {pipeline.steps[-1][1]} predicting {y_test.name}")
    print(f"F1 score: {score}")
    print(classification_report(y_test, y_pred, zero_division=0))
    print()
    return score

In [55]:
def get_dummy_classifier():
    return DummyClassifier(strategy="stratified", random_state=SEED)

def get_dummy_classifier_most_frequent():
    return DummyClassifier(strategy="most_frequent", random_state=SEED)

def get_nb_classifier():
    return ComplementNB()

def get_linear_classifier():
    return RidgeClassifier()

def get_linear_svc_classifier():
    return LinearSVC()

In [56]:
X_train, X_test, y_train, y_test = split_data(df, target_names[0])
pipe = train_classifier(get_dummy_classifier(), X_train, y_train)
print("Количество признаков:", len(pipe.steps[0][1].get_feature_names_out()))

Количество признаков: 3109


In [59]:
def check_one_classifier_on_one_target(classifier_getter, target_name, report_dict):
    X_train, X_test, y_train, y_test = split_data(df, target_name)
    pipe = train_classifier(classifier_getter(), X_train, y_train)
    score = evaluate_pipeline(pipe, X_test, y_test)
    report_dict[target_name] = score

def check_one_classifier(classifier_getter, report_dict):
    classifier_dict = {}
    for target_name in target_names:
        check_one_classifier_on_one_target(classifier_getter, target_name, classifier_dict)
    report_dict[str(classifier_getter())] = classifier_dict

Сравним простые модели на классификации н наших данных:

In [61]:
report = {}
for clf_getter in [get_dummy_classifier, get_dummy_classifier_most_frequent, get_nb_classifier, get_linear_classifier, get_linear_svc_classifier]:
    print("-" * 40)
    check_one_classifier(clf_getter, report)

report_df = pd.DataFrame(report)
report_df.head()

----------------------------------------
Metrics report for DummyClassifier(random_state=42, strategy='stratified') predicting class
F1 score: 0.07164090903684332
              precision    recall  f1-score   support

   apartment       0.04      0.04      0.04       661
    bungalow       0.01      0.01      0.01        68
     camping       0.00      0.00      0.00         3
     capsule       0.00      0.00      0.00        28
      chalet       0.00      0.00      0.00        19
     cottage       0.02      0.02      0.02        58
        dorm       0.02      0.02      0.02       262
junior-suite       0.02      0.02      0.02       399
        room       0.76      0.76      0.76     11479
run-of-house       0.00      0.00      0.00        11
      studio       0.04      0.04      0.04       584
       suite       0.08      0.08      0.08      1255
        tent       0.00      0.00      0.00        10
       villa       0.01      0.02      0.01       192

    accuracy             

Unnamed: 0,"DummyClassifier(random_state=42, strategy='stratified')","DummyClassifier(random_state=42, strategy='most_frequent')",ComplementNB(),RidgeClassifier(),LinearSVC()
class,0.071641,0.061863,0.801251,0.96992,0.98657
quality,0.050188,0.031582,0.885078,0.94101,0.980896
bathroom,0.251805,0.245986,0.531797,0.910097,0.992634
bedding,0.198741,0.152548,0.37784,0.514779,0.634893
capacity,0.141057,0.109028,0.594865,0.825725,0.887513


Сохраним две лучшие модели для дальнейших экспериментов:

In [63]:
import joblib
from tqdm import tqdm

MODELS_DIR = Path("../models")

for target_name in tqdm(target_names):
    X_train, X_test, y_train, y_test = split_data(df, target_name)
    ridge_pipe = train_classifier(get_linear_classifier(), X_train, y_train)
    joblib.dump(ridge_pipe, MODELS_DIR / f"ridge_pipeline_{target_name}.pkl")
    
    svc_pipe = train_classifier(get_linear_svc_classifier(), X_train, y_train)
    joblib.dump(svc_pipe, MODELS_DIR / f"svc_pipeline_{target_name}.pkl")

100%|██████████| 8/8 [00:29<00:00,  3.74s/it]
