In [1]:
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
DATA_PATH = "/home/matv864/it/AI_work/data/billionaires_ready.csv"

In [3]:
df = pd.read_csv(DATA_PATH)

In [4]:
print(df.shape)
df = df.dropna(thresh=df.shape[1] - 1)
print(df.shape)

(2397, 21)
(2397, 21)


In [5]:
X_reg = df.drop(columns=['finalWorth'])
y_reg = df['finalWorth']

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=777
)

scaler = StandardScaler()
X_train_reg_scaled = scaler.fit_transform(X_train_reg)
X_test_reg_scaled = scaler.transform(X_test_reg)

regression_models = {
    "Linear Regression": LinearRegression(),  # линейная комбинация
    "Ridge Regression": Ridge(alpha=1.0),  # штрафы -> сжимаем коэфы
    "Lasso Regression": Lasso(alpha=0.1)  # штраф может обнулить признак -> отбор признаков
}

regression_results = {}

for name, model in regression_models.items():
    model.fit(X_train_reg_scaled, y_train_reg)
    y_pred = model.predict(X_test_reg_scaled)
    
    # Mean Squared Error - чувствителен к выбросам
    mse = np.sqrt(mean_squared_error(y_test_reg, y_pred))

    # Mean Absolute Error - менее чувствителен к выбросам, чем MSE
    mae = mean_absolute_error(y_test_reg, y_pred)

    # R^2 - показывает, насколько хорошо модель объясняет дисперсию данных
    # =1 - идеальное
    # =0 - модель работает как простое среднее 
    # <0 - модель хуже, чем предсказание средним
    r2 = r2_score(y_test_reg, y_pred)

    print(name)
    pprint({
        "MSE": mse,
        "MAE": mae,
        "R2": r2
    })
    print("="*30)

Linear Regression
{'MAE': 3331.8156426943024,
 'MSE': np.float64(6814.921631405478),
 'R2': 0.2561849108037604}
Ridge Regression
{'MAE': 3335.6454662085653,
 'MSE': np.float64(6819.079369468108),
 'R2': 0.2552770407330718}
Lasso Regression
{'MAE': 3336.684104602315,
 'MSE': np.float64(6818.578427286166),
 'R2': 0.25538645416588024}


  model = cd_fast.enet_coordinate_descent(


In [6]:
X_cls = df.drop(columns=['is_male'])
y_cls = df['is_male']

X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=777, stratify=y_cls
)

# бинарная классификация 
# выход - вероятность [0, 1]
logistic_regression_model = LogisticRegression(max_iter=1000, random_state=22)



logistic_regression_model.fit(X_train_cls, y_train_cls)
y_pred = logistic_regression_model.predict(X_test_cls)
y_pred_proba = logistic_regression_model.predict_proba(X_test_cls)[:, 1]

# доля правильный предсказаний среди всех
accuracy = accuracy_score(y_test_cls, y_pred)

# сколько из ответов "1" реально правильные
precision = precision_score(y_test_cls, y_pred)

# Полнота - какой процент "1" от реальных 
recall = recall_score(y_test_cls, y_pred)

# Баланс между Precision и Recall
f1 = f1_score(y_test_cls, y_pred)

# =1 - идеал
# =0.5 - как случайные угадывания
roc_auc = roc_auc_score(y_test_cls, y_pred_proba)
print("Logistic Regression")
pprint({
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1 Score": f1,
    "ROC-AUC": roc_auc
})
print("="*30)


Logistic Regression
{'Accuracy': 0.8583333333333333,
 'F1 Score': 0.9225512528473804,
 'Precision': 0.8901098901098901,
 'ROC-AUC': np.float64(0.5280577329849446),
 'Recall': 0.9574468085106383}
