In [3]:
#参考サイト　https://qiita.com/y_itoh/items/6a67fc4a1a6930f0a4b5
#SKLを用いたロジスティック回帰モデル

# データ加工・処理・分析ライブラリ
import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd

# 可視化ライブラリ
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# 機械学習ライブラリ
import sklearn

# 小数第3位まで表示
%precision 3

'%.3f'

In [4]:
# データを取得
url = 'https://raw.githubusercontent.com/yumi-ito/datasets/master/datasets_adult.csv'

# 取得したデータをDataFrameオブジェクトとして読み込み
df = pd.read_csv(url, header=None)

# 列ラベルを設定
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
              'marital-status', 'occupation', 'relationship', 'race', 'sex', 
              'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'flg-50K']

In [5]:
# データの形式と欠損数を出力
print(df.shape)
print('欠損の数：{}'.format(df.isnull().sum().sum()))

# データの先頭5行を出力
df.head()

(32561, 15)
欠損の数：0


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,flg-50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
# 目的変数の各項目の個数を確認
df.groupby('flg-50K').size()

flg-50K
 <=50K    24720
 >50K      7841
dtype: int64

In [7]:
# カラムを追加し、1または0のフラグを設定
df['fin-flg'] = df['flg-50K'].map(lambda x: 1 if x ==' >50K' else 0)
print(df)

# 各項目の個数を再確認
df.groupby('fin-flg').size()

       age          workclass  fnlwgt    education  education-num  \
0       39          State-gov   77516    Bachelors             13   
1       50   Self-emp-not-inc   83311    Bachelors             13   
2       38            Private  215646      HS-grad              9   
3       53            Private  234721         11th              7   
4       28            Private  338409    Bachelors             13   
...    ...                ...     ...          ...            ...   
32556   27            Private  257302   Assoc-acdm             12   
32557   40            Private  154374      HS-grad              9   
32558   58            Private  151910      HS-grad              9   
32559   22            Private  201490      HS-grad              9   
32560   52       Self-emp-inc  287927      HS-grad              9   

            marital-status          occupation    relationship    race  \
0            Never-married        Adm-clerical   Not-in-family   White   
1       Married-civ-spo

fin-flg
0    24720
1     7841
dtype: int64

In [8]:
# ロジスティック回帰のモデル構築のためのインポート
from sklearn.linear_model import LogisticRegression

# データ分割（訓練データとテストデータ）のためのインポート
from sklearn.model_selection import train_test_split

In [9]:
# 説明変数と目的変数の設定
x = df[['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']]
y = df['fin-flg']

In [10]:
# 訓練データとテストデータに分割
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.5, random_state=0)

In [11]:
# LogisticRegressionクラスの初期化
model = LogisticRegression()

# 学習の実行
model.fit(X_train, Y_train)

LogisticRegression()

In [12]:
# 訓練データの正解率
train_score = format(model.score(X_train, Y_train))
print('正解率(train):', train_score)

# テストデータの正解率
test_score = format(model.score(X_test, Y_test))
print('正解率(test):', test_score)

正解率(train): 0.7971130221130222
正解率(test): 0.7979853817333088


In [13]:
# 学習済みモデルの各変数の係数を取得
regression_coefficient = model.coef_

# 行ラベル･列ラベルを付与してDataFrameに変換
pd.DataFrame(regression_coefficient.T,
             index = [x.columns.values],
             columns = ['回帰係数'])

Unnamed: 0,回帰係数
age,-0.011855
fnlwgt,-4e-06
education-num,-0.002774
capital-gain,0.000327
capital-loss,0.000753


In [15]:
# 学習済みモデルの各変数のオッズ比を取得
odds_ratio = np.exp(model.coef_)

# 行ラベル･列ラベルを付与してDataFrameに変換
pd.DataFrame(np.exp(model.coef_).T,
             index = [x.columns.values],
             columns = ['オッズ比'])

Unnamed: 0,オッズ比
age,0.988215
fnlwgt,0.999996
education-num,0.99723
capital-gain,1.000327
capital-loss,1.000754
