In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Column labels for adult.data.csv, listed in file order. The file carries no
# header row, so the labels are supplied here instead of being read from it.
ADULT_COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'over',
]

# header=None makes pandas treat the first line of the file as data.
data_df = pd.read_csv('adult.data.csv', header=None, names=ADULT_COLUMNS)
data_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,over
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [23]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line to
# the cell output.
print('train 데이터 정보')
data_df.info()

train 데이터 정보
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  over            32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


In [24]:
# Remove the 'fnlwgt' and 'education-num' columns in a single, non-inplace
# call. errors='ignore' keeps the cell idempotent: re-running it after the
# columns are already gone no longer raises KeyError.
data_df = data_df.drop(columns=['fnlwgt', 'education-num'], errors='ignore')

In [27]:
# Print the frequency table of each column of interest, in a fixed order.
value_count_columns = [
    'age', 'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'over',
]
for column in value_count_columns:
    print(f' {column} 값 분포 :\n', data_df[column].value_counts())

 age 값 분포 :
 36    898
31    888
34    886
23    877
35    876
     ... 
83      6
85      3
88      3
86      1
87      1
Name: age, Length: 73, dtype: int64
 workclass 값 분포 :
 4    22696
6     2541
2     2093
0     1836
7     1298
5     1116
1      960
8       14
3        7
Name: workclass, dtype: int64
 education 값 분포 :
 11    10501
15     7291
9      5355
12     1723
8      1382
1      1175
7      1067
0       933
5       646
14      576
6       514
2       433
10      413
4       333
3       168
13       51
Name: education, dtype: int64
 marital-status 값 분포 :
 2    14976
4    10683
0     4443
5     1025
6      993
3      418
1       23
Name: marital-status, dtype: int64
 occupation 값 분포 :
 10    4140
3     4099
4     4066
1     3770
12    3650
8     3295
7     2002
0     1843
14    1597
6     1370
5      994
13     928
11     649
9      149
2        9
Name: occupation, dtype: int64
 relationship 값 분포 :
 0    13193
1     8305
3     5068
4     3446
5     1568
2      981
Name: relati

In [25]:
from sklearn import preprocessing

def encode_features(dataDF, features=None):
    """Label-encode categorical columns and return the encoded frame.

    Every listed column is replaced by the integer codes produced by its own
    ``LabelEncoder``, fitted on that column alone.

    Parameters
    ----------
    dataDF : pandas.DataFrame
        Frame that contains every column named in ``features``.
    features : list of str, optional
        Columns to encode. When omitted, the nine columns this function has
        always encoded are used, so existing callers behave as before.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataDF`` with the listed columns encoded. The frame passed
        in is left unmodified, so callers must use the return value.

    Raises
    ------
    KeyError
        If a name in ``features`` is not a column of ``dataDF``.
    """
    if features is None:
        features = ['workclass', 'education', 'marital-status', 'occupation',
                    'relationship', 'race', 'sex', 'native-country', 'over']

    # Work on a copy so the caller's frame (and any earlier display of it)
    # is not silently changed as a side effect of calling this function.
    encoded_df = dataDF.copy()

    for feature in features:
        # A fresh encoder per column: codes are only meaningful within
        # the column they were fitted on.
        # NOTE(review): integer codes impose an arbitrary ordering on nominal
        # categories; if these columns feed a linear model, consider one-hot
        # encoding instead -- confirm against the modelling cells.
        encoder = preprocessing.LabelEncoder().fit(encoded_df[feature])
        encoded_df[feature] = encoder.transform(encoded_df[feature])

    return encoded_df

# Replace the categorical columns of data_df with their integer codes,
# then preview the first five rows of the encoded frame.
data_df = encode_features(dataDF=data_df)
data_df.head(5)
        

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,over
0,39,7,9,4,1,1,4,1,2174,0,40,39,0
1,50,6,9,2,4,0,4,1,0,0,13,39,0
2,38,4,11,0,6,1,4,1,0,0,40,39,0
3,53,4,1,2,6,0,2,1,0,0,40,39,0
4,28,4,9,2,10,5,2,0,0,0,40,5,0


 age 값 분포 :
 36    898
31    888
34    886
23    877
35    876
     ... 
83      6
85      3
88      3
86      1
87      1
Name: age, Length: 73, dtype: int64
 workclass 값 분포 :
 4    22696
6     2541
2     2093
0     1836
7     1298
5     1116
1      960
8       14
3        7
Name: workclass, dtype: int64
 education 값 분포 :
 11    10501
15     7291
9      5355
12     1723
8      1382
1      1175
7      1067
0       933
5       646
14      576
6       514
2       433
10      413
4       333
3       168
13       51
Name: education, dtype: int64
 marital-status 값 분포 :
 2    14976
4    10683
0     4443
5     1025
6      993
3      418
1       23
Name: marital-status, dtype: int64
 occupation 값 분포 :
 10    4140
3     4099
4     4066
1     3770
12    3650
8     3295
7     2002
0     1843
14    1597
6     1370
5      994
13     928
11     649
9      149
2        9
Name: occupation, dtype: int64
 relationship 값 분포 :
 0    13193
1     8305
3     5068
4     3446
5     1568
2      981
Name: relati

In [16]:
# Number of rows for each (workclass, over) combination, obtained by counting
# the non-null 'over' values inside every group.
by_workclass_and_over = data_df.groupby(['workclass', 'over'])
by_workclass_and_over['over'].count()

workclass  over
0          0        1645
           1         191
1          0         589
           1         371
2          0        1476
           1         617
3          0           7
4          0       17733
           1        4963
5          0         494
           1         622
6          0        1817
           1         724
7          0         945
           1         353
8          0          14
Name: over, dtype: int64

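`workclass`, `occupation`, `native-country` 컬럼에는 값을 알 수 없음을 나타내는 `?` 항목이 포함되어 있음.
(라벨 인코딩 후에는 `?`가 정렬 순서상 가장 앞에 오므로 코드 0으로 표시됨 — 예: workclass 0 = 1836건, occupation 0 = 1843건)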

In [17]:
# Show how often each value occurs, column by column, in a fixed order.
for column_name in ('age', 'workclass', 'education', 'marital-status',
                    'occupation', 'relationship', 'race', 'sex',
                    'native-country', 'over'):
    heading = ' ' + column_name + ' 값 분포 :\n'
    print(heading, data_df[column_name].value_counts())


 age 값 분포 :
 36    898
31    888
34    886
23    877
35    876
     ... 
83      6
85      3
88      3
86      1
87      1
Name: age, Length: 73, dtype: int64
 workclass 값 분포 :
 4    22696
6     2541
2     2093
0     1836
7     1298
5     1116
1      960
8       14
3        7
Name: workclass, dtype: int64
 education 값 분포 :
 11    10501
15     7291
9      5355
12     1723
8      1382
1      1175
7      1067
0       933
5       646
14      576
6       514
2       433
10      413
4       333
3       168
13       51
Name: education, dtype: int64
 marital-status 값 분포 :
 2    14976
4    10683
0     4443
5     1025
6      993
3      418
1       23
Name: marital-status, dtype: int64
 occupation 값 분포 :
 10    4140
3     4099
4     4066
1     3770
12    3650
8     3295
7     2002
0     1843
14    1597
6     1370
5      994
13     928
11     649
9      149
2        9
Name: occupation, dtype: int64
 relationship 값 분포 :
 0    13193
1     8305
3     5068
4     3446
5     1568
2      981
Name: relati