In [7]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
from colorama import Fore, Style
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from data.encode_data import encode_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score,
                             recall_score,
                             precision_score,
                             f1_score,
                             roc_auc_score)

sns.set_theme(style='white')

In [8]:
current_directory = Path.cwd()
RAW_DATA_PATH = current_directory.parent / 'data/raw/affair_data.csv'

In [9]:
raw_data = pd.read_csv(RAW_DATA_PATH)
raw_data.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,1
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,1
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,1
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,1


In [10]:
data_encoded = encode_dataset(raw_data)
data_encoded.head()

Unnamed: 0,rate_marriage,children,religious,educ,affairs,age_17.5,age_22.0,age_27.0,age_32.0,age_37.0,...,occupation_3.0,occupation_4.0,occupation_5.0,occupation_6.0,occupation_husb_1.0,occupation_husb_2.0,occupation_husb_3.0,occupation_husb_4.0,occupation_husb_5.0,occupation_husb_6.0
0,3.0,3.0,3.0,17.0,1,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3.0,3.0,1.0,14.0,1,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,4.0,0.0,1.0,16.0,1,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4.0,4.0,3.0,16.0,1,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5.0,1.0,1.0,14.0,1,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [13]:
X = data_encoded.drop(columns='affairs')
y = data_encoded['affairs']
    

def train_and_evaluate_models(X, y):
    # Разделение данных на обучающий и тестовый набoры
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Инициализация моделей
    models = {
        "Random Forest": RandomForestClassifier(),
        "Decision Tree": DecisionTreeClassifier(),
        "CatBoost": CatBoostClassifier(iterations=2000, verbose=0),
        "XGBoost": XGBClassifier()

    }

    # Инициализация списка для хранения результатов
    results = []
    
    # Обучение и оценка каждой модели
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        results.append({'Model': name, 'Accuracy': accuracy, 'Recall': recall,
                        'Precision': precision, 'AUC': auc, 'F1-score': f1})
    
    # Преобразование списка результатов в DataFrame
    results_df = pd.DataFrame(results)
    plt.figure(figsize=(10, 6))
    plt.axis('off')
    plt.table(cellText=results_df.values,
              colLabels=results_df.columns,
              cellLoc = 'center', loc='upper left')
    plt.tight_layout()
    plt.savefig('../reports/figures/model_results.png', dpi=300, bbox_inches='tight', pad_inches=0.5)
    plt.close()
    return results_df.sort_values(by='AUC', ascending=False)

In [14]:
train_and_evaluate_models(X, y)

Unnamed: 0,Model,Accuracy,Recall,Precision,AUC,F1-score
2,CatBoost,0.688383,0.403893,0.522013,0.613882,0.455418
3,XGBoost,0.672684,0.416058,0.491379,0.60548,0.450593
0,Random Forest,0.659341,0.403893,0.467606,0.592445,0.43342
1,Decision Tree,0.627943,0.394161,0.418605,0.566721,0.406015


Лучше всего себя показала NeuralNet. Тюнить и использовать будем её.