# 收入等级评估

In [53]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.naive_bayes import GaussianNB

In [50]:
# Load a class-balanced subset of the adult income data set.
# Each kept row is the list of comma-separated fields of one record; the last
# field is the income label ('<=50K' or '>50K').
file_path = './data/adult.csv'
X, y = [], []
count_lessthan50k = 0
count_morethan50k = 0
# Upper bound on the number of rows kept for each of the two income classes.
num_images_threshould = 10000

with open(file_path, 'r') as f:
    # Iterate over the file object directly so the whole file is not held in
    # memory at once (readlines() would build the full list of lines first).
    for line in f:
        # Records containing '?' have a missing value; skip them.
        if '?' in line:
            continue
        # One record. Strip only the line terminator: slicing off the last
        # character unconditionally would truncate the label of a final line
        # that has no trailing newline, and that record would be dropped.
        data = line.rstrip('\r\n').split(', ')
        if data[-1] == '<=50K' and count_lessthan50k < num_images_threshould:
            X.append(data)
            count_lessthan50k += 1
        elif data[-1] == '>50K' and count_morethan50k < num_images_threshould:
            X.append(data)
            count_morethan50k += 1
        # Stop reading as soon as both classes have reached the cap.
        if count_lessthan50k >= num_images_threshould and count_morethan50k >= num_images_threshould:
            break
X = np.array(X)
X.shape

(17508, 15)

### Convert string to numerical data

In [52]:
# Turn the string table into numbers, column by column.
# The first row decides how a column is treated: if its value consists only of
# digits the column is copied as-is, otherwise the column is label-encoded.
# The fitted encoders are collected in column order so they can be reused later.
label_encoder = []
X_encoded = np.empty(X.shape)
for col, sample in enumerate(X[0]):
    column = X[:, col]
    if not sample.isdigit():
        encoder = preprocessing.LabelEncoder()
        label_encoder.append(encoder)
        X_encoded[:, col] = encoder.fit_transform(column)
        continue
    X_encoded[:, col] = column
# Every column except the last is a feature; the last column is the label.
X = X_encoded[:, :-1].astype(int)
y = X_encoded[:, -1].astype(int)
X

array([[    39,      5,  77516, ...,      0,     40,     37],
       [    50,      4,  83311, ...,      0,     13,     37],
       [    38,      2, 215646, ...,      0,     40,     37],
       ...,
       [    53,      2, 321865, ...,      0,     40,     37],
       [    40,      2, 154374, ...,      0,     40,     37],
       [    52,      3, 287927, ...,      0,     40,     37]])

### Split the data into training and testing

In [54]:
# Hold-out split: 25% of the samples are set aside for testing, with a fixed
# random_state so the same split is produced on every run.
split = model_selection.train_test_split(X, y, test_size=0.25, random_state=5)
X_train, X_test, y_train, y_test = split

# Fit a Gaussian naive Bayes classifier on the training part and predict the
# labels of the held-out part.
cf_gb = GaussianNB()
cf_gb.fit(X_train, y_train)
y_test_pred = cf_gb.predict(X_test)
y_test_pred

array([0, 0, 0, ..., 0, 0, 0])

### Extract performance metrics

In [55]:
# Weighted F1 score from 5-fold cross-validation over the full X / y,
# reported as the mean across folds in percent.
f1 = model_selection.cross_val_score(cf_gb, X, y, scoring='f1_weighted', cv=5)

f1_percent = 100 * f1.mean()
print('f1 score = ', f1_percent, '%')

f1 score =  63.05876347570389 %


### 对于单条数据，分类器的工作方式

In [61]:
# A single unlabeled sample, given as the feature fields in column order.
in_data = ['39', 'State-gov', '77516', 'Bachelors', '13', 'Never-married', 'Adm-clerical', 'Not-in-family', 'White', 'Male', '2174', '0', '40', 'United-States']

# Convert the sample to numeric form.
# `count` walks through label_encoder in step with the non-numeric fields, so
# the n-th non-numeric field is transformed by the n-th stored encoder.
count = 0
in_data_encoded = [-1] * len(in_data)
for i, item in enumerate(in_data):
    if item.isdigit():
        in_data_encoded[i] = int(item)
    else:
        # transform() is given a one-element list; take element [0] before
        # converting, because calling int() directly on a 1-D array is
        # deprecated in recent NumPy releases.
        in_data_encoded[i] = int(label_encoder[count].transform([item])[0])
        count += 1
in_data_encoded = np.array(in_data_encoded)

# Classify the sample; the last stored encoder maps the numeric class back to
# its original string label.
output_class = cf_gb.predict([in_data_encoded])
print('分类出的数值形式为', output_class)
print('分类出的原始string为', label_encoder[-1].inverse_transform(output_class)[0])

分类出的数值形式为 [0]
分类出的原始string为 <=50K
