In [440]:
# 导包
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from lightgbm import LGBMClassifier

In [441]:
# 读取数据
# 根据题意，先把header设置好
headers = ['age', 'workclass', 'fnlwgt',
           'education', 'education_num', 'marital_status',
           'occupation', 'relationship', 'race',
           'sex', 'capital_gain', 'capital_loss',
           'hours_per_week', 'native_country', 'income']

# 读取训练集进入内存
train_data = pd.read_csv("data/adult_train.csv",names=headers)

# 对于测试集，他的第一行是一个分隔，不读入
test_data = pd.read_csv("data/adult_test.csv",names=headers,skiprows=[0])

In [442]:
# 看看训练集有多少数据
train_data.shape

(32561, 15)

In [443]:
# 看看数据是怎么样的，平均值；最大最小值，标准差等等（只有连续性的）
train_data.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [444]:
# 数据中有一些值是？，用python的NaN代替方便以后直接当作空值处理
train_data = train_data.replace('[?]', np.NaN, regex=True)
test_data = test_data.replace('[?]', np.NaN, regex=True)
# 看看数据里有NaN的各类有多少
train_data.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
income               0
dtype: int64

In [445]:
# 对于缺失属性的数据直接删除
train_data['native_country'] = train_data['native_country'].fillna(' United-States')
train_data = train_data.dropna()
train_data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [446]:
# 开始处理离散数据
# income我们需要将其映射一下
# 先将>=50K的映射成0，<=50K的设置为1
income_map = {' <=50K':1,' >50K':0}
income_map2 = {' <=50K.':1, ' >50K.':0}
train_data['income'] = train_data['income'].map(income_map).astype(int)
test_data['income'] = test_data['income'].map(income_map2).astype(int)


In [447]:
# 取出所有的离散量属性
discrete = [x for x in train_data.columns if train_data[x].dtype=='object']
discrete

['workclass',
 'education',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native_country']

In [448]:
# 看看这些离散量属性的取值分布
for i in discrete:
    print(train_data[i].value_counts())
    

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
Name: workclass, dtype: int64
 HS-grad         9968
 Some-college    6775
 Bachelors       5182
 Masters         1675
 Assoc-voc       1321
 11th            1056
 Assoc-acdm      1020
 10th             831
 7th-8th          573
 Prof-school      558
 9th              463
 Doctorate        398
 12th             393
 5th-6th          303
 1st-4th          156
 Preschool         46
Name: education, dtype: int64
 Married-civ-spouse       14339
 Never-married             9912
 Divorced                  4258
 Separated                  959
 Widowed                    840
 Married-spouse-absent      389
 Married-AF-spouse           21
Name: marital_status, dtype: int64
 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Othe

In [449]:
# 发现有一些是比较接近重叠的，将这些重叠的归到同一类
# 大致可以把gov有关的一类，Private一类，Self-emp一类，没工作的一类
train_data['workclass'] = train_data['workclass'].replace([' Self-emp-not-inc', ' Self-emp-inc'],' Self-emp')
train_data['workclass'] = train_data['workclass'].replace([' Federal-gov', ' Local-gov', ' State-gov'], ' Gov')
train_data['workclass'] = train_data['workclass'].replace([' Without-pay', ' Never-worked'], ' Un-emp')
train_data['workclass'].value_counts()                                                         

test_data['workclass'] = test_data['workclass'].replace([' Self-emp-not-inc', ' Self-emp-inc'],' Self-emp')
test_data['workclass'] = test_data['workclass'].replace([' Federal-gov', ' Local-gov', ' State-gov'], ' Gov')
test_data['workclass'] = test_data['workclass'].replace([' Without-pay', ' Never-worked'], ' Un-emp')

In [450]:
# 同理对marital_status进行归类
train_data['marital_status'] = train_data['marital_status'].replace([' Divorced',' Married-spouse-absent',' Never-married',' Separated',' Widowed'],' Single')
train_data['marital_status'] = train_data['marital_status'].replace([' Married-civ-spouse', ' Married-AF-spouse'], ' Couple')
train_data['marital_status'].value_counts()

test_data['marital_status'] = test_data['marital_status'].replace([' Divorced',' Married-spouse-absent',' Never-married',' Separated',' Widowed'],' Single')
test_data['marital_status'] = test_data['marital_status'].replace([' Married-civ-spouse', ' Married-AF-spouse'], ' Couple')

In [451]:
# native_country这个分类太多了，而且很多类的人很少，干脆把人少的都归到other里
train_data['native_country'] = train_data['native_country'].replace([' Holand-Netherlands',' Scotland', ' Honduras', 
                                                                     ' Hungary', ' Outlying-US(Guam-USVI-etc)', ' Yugoslavia', 
                                                                     ' Laos', ' Thailand', ' Cambodia', 
                                                                     ' Trinadad&Tobago', ' Hong', ' Ireland', 
                                                                     ' France',' Ecuador', ' Greece', ' Peru', 
                                                                     ' Nicaragua', ' Portugal', ' Iran', 
                                                                     ' Taiwan', ' Haiti'], ' Other')

test_data['native_country'] = test_data['native_country'].replace([' Holand-Netherlands',' Scotland', ' Honduras', 
                                                                     ' Hungary', ' Outlying-US(Guam-USVI-etc)', ' Yugoslavia', 
                                                                     ' Laos', ' Thailand', ' Cambodia', 
                                                                     ' Trinadad&Tobago', ' Hong', ' Ireland', 
                                                                     ' France',' Ecuador', ' Greece', ' Peru', 
                                                                     ' Nicaragua', ' Portugal', ' Iran', 
                                                                     ' Taiwan', ' Haiti'], ' Other')
train_data['native_country'].value_counts()

 United-States         28060
 Mexico                  610
 Other                   486
 Philippines             188
 Germany                 128
 Puerto-Rico             109
 Canada                  107
 El-Salvador             100
 India                   100
 Cuba                     92
 England                  86
 Jamaica                  80
 South                    71
 Italy                    68
 China                    68
 Dominican-Republic       67
 Vietnam                  64
 Guatemala                63
 Japan                    59
 Poland                   56
 Columbia                 56
Name: native_country, dtype: int64

In [452]:
# 最后，看看education_num这个连续量
train_data['education'].value_counts()

 HS-grad         9968
 Some-college    6775
 Bachelors       5182
 Masters         1675
 Assoc-voc       1321
 11th            1056
 Assoc-acdm      1020
 10th             831
 7th-8th          573
 Prof-school      558
 9th              463
 Doctorate        398
 12th             393
 5th-6th          303
 1st-4th          156
 Preschool         46
Name: education, dtype: int64

In [453]:
# 发现跟education一样的，重复了，因为eudcation是离散的，不好处理。去掉这个属性
train_data = train_data.drop(columns=['education'])
test_data = test_data.drop(columns=['education'])

In [454]:
# 看看相关系数矩阵,检查一下连续变量。发现序号属性不太影响最后的收入
train_data.corr()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income
age,1.0,-0.07654,0.043567,0.080392,0.060409,0.101879,-0.242431
fnlwgt,-0.07654,1.0,-0.043509,-0.000189,-0.010011,-0.02281,0.009446
education_num,0.043567,-0.043509,1.0,0.124247,0.079321,0.151241,-0.33464
capital_gain,0.080392,-0.000189,0.124247,1.0,-0.032332,0.079548,-0.221871
capital_loss,0.060409,-0.010011,0.079321,-0.032332,1.0,0.053961,-0.151461
hours_per_week,0.101879,-0.02281,0.151241,0.079548,0.053961,1.0,-0.228547
income,-0.242431,0.009446,-0.33464,-0.221871,-0.151461,-0.228547,1.0


In [455]:
# 把序号属性删掉
train_data = train_data.drop(columns=['fnlwgt'])
test_data = test_data.drop(columns=['fnlwgt'])

In [456]:
# 进行one-hot编码
train_data = pd.get_dummies(train_data, columns=['workclass', 'marital_status', 'occupation',
                                                 'relationship', 'race', 'sex',
                                                 'native_country'])
train_data

test_data = pd.get_dummies(test_data, columns=['workclass', 'marital_status', 'occupation',
                                                 'relationship', 'race', 'sex',
                                                 'native_country'])

In [457]:
# 看看编码后的结果
train_data.columns

Index(['age', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week', 'income', 'workclass_ Gov', 'workclass_ Private',
       'workclass_ Self-emp', 'workclass_ Un-emp', 'marital_status_ Couple',
       'marital_status_ Single', 'occupation_ Adm-clerical',
       'occupation_ Armed-Forces', 'occupation_ Craft-repair',
       'occupation_ Exec-managerial', 'occupation_ Farming-fishing',
       'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct',
       'occupation_ Other-service', 'occupation_ Priv-house-serv',
       'occupation_ Prof-specialty', 'occupation_ Protective-serv',
       'occupation_ Sales', 'occupation_ Tech-support',
       'occupation_ Transport-moving', 'relationship_ Husband',
       'relationship_ Not-in-family', 'relationship_ Other-relative',
       'relationship_ Own-child', 'relationship_ Unmarried',
       'relationship_ Wife', 'race_ Amer-Indian-Eskimo',
       'race_ Asian-Pac-Islander', 'race_ Black', 'race_ Other', 'race_ Whit

In [458]:
# 将非2值类型的数据进行标准化
train_data_need_to_standard = train_data[['age', 'education_num', 'capital_gain',
                                          'capital_loss', 'hours_per_week']]
train_data_need_to_standard

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week
0,39,13,2174,0,40
1,50,13,0,0,13
2,38,9,0,0,40
3,53,7,0,0,40
4,28,13,0,0,40
5,37,14,0,0,40
6,49,5,0,0,16
7,52,9,0,0,45
8,31,14,14084,0,50
9,42,13,5178,0,40


In [459]:
scaler = StandardScaler()
scaler.fit(train_data_need_to_standard)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [460]:
train_data_standard = pd.DataFrame(scaler.transform(train_data_need_to_standard))
train_data_standard.head()

Unnamed: 0,0,1,2,3,4
0,0.042416,1.119909,0.142438,-0.219179,-0.079207
1,0.880958,1.119909,-0.147516,-0.219179,-2.331988
2,-0.033815,-0.441111,-0.147516,-0.219179,-0.079207
3,1.109651,-1.221621,-0.147516,-0.219179,-0.079207
4,-0.796125,1.119909,-0.147516,-0.219179,-0.079207


In [461]:
# 将标准化的数据添回原来的整个表里
# 但是我们之前可以看到他的列属性名字没有了，加回去
column_name = ['age', 'education_num', 'capital_gain', 
               'capital_loss', 'hours_per_week']
train_data_standard.columns = column_name
train_data_standard

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week
0,0.042416,1.119909,0.142438,-0.219179,-0.079207
1,0.880958,1.119909,-0.147516,-0.219179,-2.331988
2,-0.033815,-0.441111,-0.147516,-0.219179,-0.079207
3,1.109651,-1.221621,-0.147516,-0.219179,-0.079207
4,-0.796125,1.119909,-0.147516,-0.219179,-0.079207
5,-0.110046,1.510164,-0.147516,-0.219179,-0.079207
6,0.804727,-2.002131,-0.147516,-0.219179,-2.081679
7,1.033420,-0.441111,-0.147516,-0.219179,0.337974
8,-0.567432,1.510164,1.730916,-0.219179,0.755156
9,0.271109,1.119909,0.543092,-0.219179,-0.079207


In [462]:
# 用标准化数据覆盖原来的数据
for i in train_data_standard.columns:
    train_data[i] = train_data_standard[i]
train_data = train_data.dropna()
train_data


Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week,income,workclass_ Gov,workclass_ Private,workclass_ Self-emp,workclass_ Un-emp,...,native_country_ Jamaica,native_country_ Japan,native_country_ Mexico,native_country_ Other,native_country_ Philippines,native_country_ Poland,native_country_ Puerto-Rico,native_country_ South,native_country_ United-States,native_country_ Vietnam
0,0.042416,1.119909,0.142438,-0.219179,-0.079207,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0.880958,1.119909,-0.147516,-0.219179,-2.331988,1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,-0.033815,-0.441111,-0.147516,-0.219179,-0.079207,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1.109651,-1.221621,-0.147516,-0.219179,-0.079207,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,-0.796125,1.119909,-0.147516,-0.219179,-0.079207,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,-0.110046,1.510164,-0.147516,-0.219179,-0.079207,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
6,0.804727,-2.002131,-0.147516,-0.219179,-2.081679,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.033420,-0.441111,-0.147516,-0.219179,0.337974,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
8,-0.567432,1.510164,1.730916,-0.219179,0.755156,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
9,0.271109,1.119909,0.543092,-0.219179,-0.079207,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [463]:
# 把要预测的值income单独拿出来看看
y_train = train_data.income
x_train = train_data.drop('income', axis=1)

y_test = test_data.income
x_test = test_data.drop('income', axis=1)

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

(28992, 59)
(28992,)
(16281, 59)
(16281,)


In [464]:
lgbm = LGBMClassifier()
lgbm.fit(x_train, y_train)


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [465]:
y_pred = lgbm.predict(x_test)
print(accuracy_score(y_test, y_pred) * 100)

82.34137952214238


In [466]:
lgbm_confusion_matrix = confusion_matrix(y_test, y_pred)
lgbm_confusion_matrix

array([[ 2459,  1387],
       [ 1488, 10947]], dtype=int64)