# Logistic Regression Practice

In [28]:
import numpy as np
import pandas as pd
import warnings
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
warnings.filterwarnings('ignore')

## データセット作成

In [29]:
df = make_classification(
    n_samples = 100000,
    n_features = 10,
    n_informative = 2,          # 目的変数と相関の強い特徴量の数（特徴量選択で選ばれる確率が高い）
    n_redundant = 0,            # n_informativeの線形結合で作成される特徴量の数（多重共線性を含める）
    n_repeated = 0,             # n_informative,n_redundantのコピー特徴量の数
    n_classes = 2,              # 2の場合2値分類,n>2の場合多値分類
    n_clusters_per_class = 2,
    weights = [0.9999, 0.0001], # 100000のうち1のラベルデータが10個しか生成されない
    flip_y = 0,                 # 0.01で1%の確率でクラスラベルが切り替わる（異常値づくり？）
    class_sep = 1.0,            # 切片の学習用
    hypercube = True,
    shift = 0.0,
    scale = 1.0,
    shuffle = True,
    random_state = 71)          # 生成データのランダム性（Noneにしたら毎回異なるデータ）

In [30]:
df_raw = pd.DataFrame(df[0], columns =
                      ['var1', 'var2', 'var3', 'var4', 'var5',
                       'var6', 'var7', 'var8', 'var9', 'var10'])
df_raw['Class'] = df[1]
X = df_raw.iloc[:, 0:10]
y = df_raw['Class']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

## モデル作成と予測,精度評価

In [32]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

Accuracy: 0.9999666666666667


In [27]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[29999     0]
 [    1     0]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     29999
           1       0.00      0.00      0.00         1

   micro avg       1.00      1.00      1.00     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       1.00      1.00      1.00     30000

