# Ⅰ. 女性に対する不倫調査（1974年）

## 1. モジュール読込

In [None]:
import math

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# Render matplotlib figures inline in the notebook
%matplotlib inline

# Machine learning
from sklearn.linear_model import LogisticRegression
# train_test_split lives in sklearn.model_selection on current scikit-learn;
# the old sklearn.cross_validation module was removed in 0.20, so it is only
# used as a fallback on legacy installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Model evaluation
from sklearn import metrics

import statsmodels.api as sm

## 2. データ整形

### データの一部を表示

In [None]:
# Load the "fair" dataset bundled with statsmodels as a pandas DataFrame.
df = sm.datasets.fair.load_pandas().data
# Preview the first three rows.
df.head(3)

Number of observations: 6366. Number of variables: 9. Variable name definitions:
* rate_marriage   : How rate marriage, 1 = very poor, 2 = poor, 3 = fair, 4 = good, 5 = very good
* age             : Age
* yrs_married     : No. years married. Interval approximations. See original paper for detailed explanation.
* children        : No. children
* religious       : How religious, 1 = not, 2 = mildly, 3 = fairly, 4 = strongly
* educ            : Level of education, 9 = grade school, 12 = high school, 14 = some college, 16 = college graduate, 17 = some graduate school, 20 = advanced degree
* occupation      : 1 = student, 2 = farming, agriculture; semi-skilled, or unskilled worker; 3 = white-collar; 4 = teacher, counselor, social worker, nurse; artist, writers; technician, skilled worker, 5 = managerial, administrative, business, 6 = professional with advanced degree
* occupation_husb : Husband's occupation. Same as occupation.
* affairs         : measure of time spent in extramarital affairs

In [None]:
# Show summary statistics of the data (count, mean, standard deviation, max, min, quartiles, etc.)
df.describe()

In [None]:
# Any non-zero value in `affairs` is treated as having had an affair
def affair_check(x):
    """Return 1 if ``x`` is non-zero, otherwise 0.

    Used to turn the raw ``affairs`` measure into a binary flag: any
    non-zero value counts as having had an affair.
    """
    return 1 if x != 0 else 0

# Add a binary 'Had_Affair' column by applying affair_check to every 'affairs' value
df['Had_Affair'] = df['affairs'].apply(affair_check)

# Preview the first three rows, now including the new column
df.head(3)

In [None]:
# Group the rows by affair status (the Had_Affair column) and show the per-group mean of each column
df.groupby('Had_Affair').mean()

### ヒストグラムで可視化

In [None]:
# Age: number of rows per age value, split by Had_Affair.
# The column is passed as `x=` because seaborn >= 0.12 accepts only `data`
# positionally; the keyword form also works on older releases.
sns.countplot(x='age', hue='Had_Affair', data=df.sort_values(by='age'), palette='coolwarm')

In [None]:
# Years married: number of rows per value, split by Had_Affair.
# The column is passed as `x=` because seaborn >= 0.12 accepts only `data`
# positionally; the keyword form also works on older releases.
sns.countplot(x='yrs_married', hue='Had_Affair', data=df.sort_values(by='yrs_married'), palette='coolwarm')

In [None]:
# Number of children: number of rows per value, split by Had_Affair.
# The column is passed as `x=` because seaborn >= 0.12 accepts only `data`
# positionally; the keyword form also works on older releases.
sns.countplot(x='children', hue='Had_Affair', data=df.sort_values(by='children'), palette='coolwarm')

In [None]:
# Education level: number of rows per value, split by Had_Affair.
# The column is passed as `x=` because seaborn >= 0.12 accepts only `data`
# positionally; the keyword form also works on older releases.
sns.countplot(x='educ', hue='Had_Affair', data=df.sort_values(by='educ'), palette='coolwarm')

### ダミー変数に変換

In [None]:
# Convert the wife's occupation category into dummy (indicator) columns
occ_dummies = pd.get_dummies(df['occupation'])
occ_dummies.head()

In [None]:
# Give the dummy columns readable names (occ1 .. occ6)
occ_dummies.columns = ['occ1','occ2','occ3','occ4','occ5','occ6']
occ_dummies.head()

In [None]:
# Apply the same dummy encoding and renaming to the husband's occupation
hus_occ_dummies = pd.get_dummies(df['occupation_husb'])
hus_occ_dummies.columns = ['hocc1','hocc2','hocc3','hocc4','hocc5','hocc6']
hus_occ_dummies.head()

In [None]:
# Drop the now-unneeded occupation columns and the target variable 'Had_Affair'
X = df.drop(['occupation','occupation_husb','Had_Affair'],axis=1)
X.head(3)

In [None]:
# Put the wife's and husband's occupation dummies side by side in one DataFrame
dummies = pd.concat((occ_dummies, hus_occ_dummies), axis='columns')

# Append the dummy columns to the explanatory variables X
X = pd.concat((X, dummies), axis='columns')

X.head(n=3)

In [None]:
# Remove three columns in a single call:
#   - 'occ1' and 'hocc1': one dummy from each occupation group is dropped
#     to avoid multicollinearity
#   - 'affairs': Had_Affair is derived from it, so it must not be a feature
X = X.drop(['occ1', 'hocc1', 'affairs'], axis=1)

X.head(n=3)

In [None]:
# Store the target variable (the Had_Affair column) in Y
Y = df['Had_Affair']

Y.head(n=3)

## 3. モデル作成

In [None]:
# Create the logistic regression model and train it on the full data in one
# step (fit returns the estimator itself)
log_model = LogisticRegression().fit(X, Y)

# Show the fitted estimator as the cell output
log_model

## 4. 評価

In [None]:
# Show the model's accuracy (evaluated on the same X, Y it was fitted on)
log_model.score(X,Y)

## 5. 結果

In [None]:
# Pair each feature name with its fitted coefficient.
# Positive coefficient -> an increase in that variable raises the likelihood of an affair
# Negative coefficient -> an increase in that variable lowers the likelihood of an affair
coeff_df = DataFrame([X.columns, log_model.coef_[0]]).transpose()
coeff_df

## 6. おまけ

In [None]:
# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

# Create a fresh model and train it on the training set only
log_model_extra = LogisticRegression().fit(X_train, Y_train)

# Predict labels for the test set
result = log_model_extra.predict(X_test)

# Accuracy of the predictions against the test labels
metrics.accuracy_score(y_true=Y_test, y_pred=result)