In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.metrics import confusion_matrix, mean_squared_error, f1_score, precision_score, accuracy_score, recall_score, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt

In [3]:
def prepare_adult_data():
    adult = pd.read_csv('../datasets/adult.data',
                        names=['age', 'workclass', 'fnlwgt', 'education',
                               'education-num', 'marital-status', 'occupation',
                               'relationship', 'race', 'sex', 'capital-gain',
                               'capital-loss', 'hours-per-week', 'native-country', 'salary'])
    
    # Избавиться от лишних признаков
    adult.drop(['native-country'], axis=1, inplace=True)
    # Сконвертировать целевой столбец в бинарные значения
    adult['salary'] = (adult['salary'] != ' <=50K').astype('int32')
    # Сделать one-hot encoding для некоторых признаков
    adult = pd.get_dummies(adult, columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex'])
    # Нормализовать нуждающиеся в этом признаки
    a_features = adult[['age', 'education-num', 'hours-per-week', 'fnlwgt', 'capital-gain', 'capital-loss']]
    norm_features = (a_features - a_features.mean(axis=0)) / a_features.std(axis=0)
    adult.loc[:, ['age', 'education-num', 'hours-per-week', 'fnlwgt', 'capital-gain', 'capital-loss']] = norm_features
    
    # Разбить таблицу данных на матрицы X и y
    X = adult[list(set(adult.columns) - set(['salary']))]
    y = adult['salary']

    # Добавить фиктивный столбец единиц (bias линейной модели)
    X = np.hstack([np.ones(X.shape[0])[:, np.newaxis], X])
    
    return X, y

In [4]:
X, y = prepare_adult_data()
X_train, X_val, Y_train, Y_val = train_test_split(X, y, test_size=0.2)

In [5]:
model = LogisticRegression()
model.fit(X, y)
Y_pred = model.predict(X)
print(f1_score(y,Y_pred))
confusion_matrix(y, Y_pred)

0.6621280179674339


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[23030,  1690],
       [ 3124,  4717]])

In [6]:
#model = LogisticRegression()
#model.fit(X_train, Y_train)

#Y_predicted = model.predict(X_val)
#print(accuracy_score(Y_val,Y_predicted))
#print(precision_score(Y_val,Y_predicted))
#print(recall_score(Y_val,Y_predicted))
#print(f1_score(Y_val,Y_predicted))


In [15]:
# Отрисовать ROC кривую
def sigmoid(X, theta):
        return 1. / (1. + np.exp(-X.dot(theta)))

def calc_and_plot_roc(y_true, y_pred_proba):
    # Посчитать значения ROC кривой и значение площади под кривой AUC
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba)
    roc_auc = roc_auc_score(y_true, y_pred_proba)
    
    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
    plt.title('Receiver Operating Characteristic', fontsize=15)
    plt.xlabel('False positive rate (FPR)', fontsize=15)
    plt.ylabel('True positive rate (TPR)', fontsize=15)
    plt.legend(fontsize=15)

In [16]:
Y_pred_proba = sigmoid(X, Y_pred)
calc_and_plot_roc(y, Y_pred_proba)

ValueError: shapes (32561,67) and (32561,) not aligned: 67 (dim 1) != 32561 (dim 0)