<a href="https://colab.research.google.com/github/ZQRui/ML-Assignment/blob/master/prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# !pip install xgboost



In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import xgboost as xgb
import json
import operator
from collections import OrderedDict
import os,sys

In [0]:
import logging

# Root logger: append every record (DEBUG and above) to debug.log.txt.
logging.basicConfig(
    filename="debug.log.txt",
    filemode='a',
    level=logging.DEBUG,
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',  # output format
    datefmt='%a, %d %b %Y %H:%M:%S',
)
root_logger = logging.getLogger('')

# Console handler: INFO and above, in a shorter format.
console = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s: %(levelname)-5s %(message)s')
console.setFormatter(formatter)
console.setLevel(logging.INFO)
root_logger.addHandler(console)

# info.log.txt: INFO and above, same layout as the debug file.
infolog = logging.FileHandler("info.log.txt")
errformatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
infolog.setFormatter(errformatter)
infolog.setLevel(logging.INFO)
root_logger.addHandler(infolog)

# error.log.txt: WARNING and above.
errlog = logging.FileHandler("error.log.txt")
errformatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
errlog.setFormatter(errformatter)
errlog.setLevel(logging.WARNING)
root_logger.addHandler(errlog)

In [3]:
# Print the TensorFlow version in use for this run.
print(tf.__version__)

1.13.1


# 读数据

In [0]:
# Path of the CSV dataset that is read into `df`.
dataset_file="./bank.csv"

In [102]:
# Load the dataset; fields in the file are separated by ';'.
df=pd.read_csv(dataset_file,sep=";")
# Preview the first rows.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [103]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


# 建立onehot映射表

In [104]:
# Number of distinct values per column.
df.nunique()

age            67
job            12
marital         3
education       4
default         2
balance      2353
housing         2
loan            2
contact         3
day            31
month          12
duration      875
campaign       32
pdays         292
previous       24
poutcome        4
y               2
dtype: int64

In [0]:
# Columns that are expanded into one indicator column per distinct value.
onehot_cols=["job","marital","education","default","housing","loan","contact","month","poutcome"]

In [0]:
# Convert every column listed in `onehot_cols` to pandas' categorical dtype.
for col in onehot_cols:
  df[col]=df[col].astype("category")

In [0]:
# One (new column name, source column, matched value) triple for every distinct
# value of every column listed in `onehot_cols`, in order of first appearance.
onehot_newkey=[
  (f"{col}-{v}",col,v)
  for col in onehot_cols
  for v in df[col].unique()
]

In [0]:
# Two extra indicators derived from `pdays`:
#   "pdays--1"  -> 1.0 where pdays == -1
#   "pdays-yes" -> 1.0 where the "pdays--1" indicator equals 0, i.e. pdays != -1
# Order matters: the second entry reads the column produced by the first.
onehot_newkey.append(("pdays--1","pdays",-1))
onehot_newkey.append(("pdays-yes","pdays--1",0))

In [109]:
# Show the (new column, source column, value) triples collected so far.
onehot_newkey

[('job-unemployed', 'job', 'unemployed'),
 ('job-services', 'job', 'services'),
 ('job-management', 'job', 'management'),
 ('job-blue-collar', 'job', 'blue-collar'),
 ('job-self-employed', 'job', 'self-employed'),
 ('job-technician', 'job', 'technician'),
 ('job-entrepreneur', 'job', 'entrepreneur'),
 ('job-admin.', 'job', 'admin.'),
 ('job-student', 'job', 'student'),
 ('job-housemaid', 'job', 'housemaid'),
 ('job-retired', 'job', 'retired'),
 ('job-unknown', 'job', 'unknown'),
 ('marital-married', 'marital', 'married'),
 ('marital-single', 'marital', 'single'),
 ('marital-divorced', 'marital', 'divorced'),
 ('education-primary', 'education', 'primary'),
 ('education-secondary', 'education', 'secondary'),
 ('education-tertiary', 'education', 'tertiary'),
 ('education-unknown', 'education', 'unknown'),
 ('default-no', 'default', 'no'),
 ('default-yes', 'default', 'yes'),
 ('housing-no', 'housing', 'no'),
 ('housing-yes', 'housing', 'yes'),
 ('loan-no', 'loan', 'no'),
 ('loan-yes', 'loa

In [0]:
# Bundle the encoding definition (source columns and indicator triples) in one dict.
conf={"onehot_cols":onehot_cols,"onehot_newkey":onehot_newkey}

In [0]:
# Write the encoding definition to conf.json in the working directory.
with open("conf.json","w") as fp:
  json.dump(conf,fp,indent=2)

# Onehot编码及归一化

In [0]:
# for key,oldkey,value in onehot_newkey:
#   df[key]=(df[oldkey]==value).astype(int)
# for oldkey in onehot_cols:
#   df.pop(oldkey)

In [0]:
# Encode the target as 1.0 for "yes" and 0.0 otherwise.
# The dtype guard keeps this cell idempotent: once "y" is numeric, comparing it
# with "yes" again would silently turn every label into 0.0.
if not pd.api.types.is_numeric_dtype(df["y"]):
  df["y"]=(df["y"]=="yes").astype(float)

In [0]:
def onehot(df,newkeys,oldkeys):
  """Return a copy of ``df`` with indicator columns added and source columns removed.

  Args:
    df: input frame; it is not modified.
    newkeys: iterable of ``(new_column, source_column, value)`` triples. They are
      applied in order, so a triple may read a column created by an earlier one.
      Each new column is 1.0 where ``source_column == value`` and 0.0 elsewhere.
    oldkeys: names of the columns to remove after all indicators are built.

  Returns:
    The encoded frame.
  """
  encoded=df.copy()
  for new_col,source_col,match_value in newkeys:
    logging.debug(f"{new_col},{source_col},{match_value}")
    is_match=encoded[source_col]==match_value
    encoded[new_col]=is_match.astype(float)
  return encoded.drop(columns=list(oldkeys))

In [115]:
# Build the encoded frame in this cell so the preview does not depend on a
# later cell having been executed first, then show the first rows only.
df2=onehot(df,onehot_newkey,onehot_cols)
df2.head(10)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job-unemployed,job-services,job-management,job-blue-collar,job-self-employed,job-technician,job-entrepreneur,job-admin.,job-student,job-housemaid,job-retired,job-unknown,marital-married,marital-single,marital-divorced,education-primary,education-secondary,education-tertiary,education-unknown,default-no,default-yes,housing-no,housing-yes,loan-no,loan-yes,contact-cellular,contact-unknown,contact-telephone,month-oct,month-may,month-apr,month-jun,month-feb,month-aug,month-jan,month-jul,month-nov,month-sep,month-mar,month-dec,poutcome-unknown,poutcome-failure,poutcome-other,poutcome-success,pdays--1,pdays-yes
0,30,1787,19,79,1,-1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,33,4789,11,220,1,339,4,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2,35,1350,16,185,1,330,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
3,30,1476,3,199,4,-1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,59,0,5,226,1,-1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
5,35,747,23,141,2,176,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1
6,36,307,14,341,1,330,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
7,39,147,6,151,2,-1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
8,41,221,14,57,2,-1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
9,43,-88,17,313,1,147,2,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1


In [0]:
# One-hot encode `df`; `onehot` works on a copy, so `df` itself is left unchanged.
df2=onehot(df,onehot_newkey,onehot_cols)

In [0]:
def split_train_test(source, frac, random_state=0):
    """Randomly split ``source`` into a training and a test frame.

    Args:
        source: frame to split. Rows are assigned by index label, so the index
            is expected to be unique.
        frac: fraction of rows (0..1) that goes into the training frame.
        random_state: seed for the row sampling; the default of 0 reproduces
            the split used previously.

    Returns:
        ``(train_dataset, test_dataset)`` where the test frame holds every row
        that was not sampled for training.
    """
    # Honour the caller's fraction (it used to be hard-coded to 0.8).
    train_dataset = source.sample(frac=frac, random_state=random_state)
    test_dataset = source.drop(train_dataset.index)

    return train_dataset, test_dataset

In [0]:
# 80% of the rows go to the training set, the remainder to the test set.
train_dataset,test_dataset=split_train_test(df2,0.8)

In [0]:
# Separate the target column from the features. `pop` removes "y" from each
# frame in place, so re-running this cell without re-creating the split raises KeyError.
train_labels = train_dataset.pop('y')
test_labels = test_dataset.pop('y')



In [135]:
# Summary statistics of the training features, transposed so that each feature
# is a row; the 'mean' and 'std' columns are what `norm` reads.
stats=train_dataset.describe().transpose()
print(stats)

                      count         mean          std  ...    50%     75%      max
age                  3617.0    41.115842    10.573495  ...   39.0    49.0     87.0
balance              3617.0  1405.922311  2972.465627  ...  444.0  1465.0  71188.0
day                  3617.0    15.848217     8.220174  ...   16.0    21.0     31.0
duration             3617.0   268.094001   265.199283  ...  187.0   333.0   3025.0
campaign             3617.0     2.809511     3.137596  ...    2.0     3.0     50.0
pdays                3617.0    39.948023   100.672342  ...   -1.0    -1.0    871.0
previous             3617.0     0.553221     1.729015  ...    0.0     0.0     25.0
job-unemployed       3617.0     0.029030     0.167913  ...    0.0     0.0      1.0
job-services         3617.0     0.093171     0.290712  ...    0.0     0.0      1.0
job-management       3617.0     0.215648     0.411328  ...    0.0     0.0      1.0
job-blue-collar      3617.0     0.210395     0.407646  ...    0.0     0.0      1.0
job-

In [0]:
def norm(df,stats):
  """Standardise ``df`` column-wise: ``(value - mean) / std``.

  Args:
    df: frame to scale.
    stats: frame indexed by column name with 'mean' and 'std' columns
      (e.g. ``describe().transpose()`` of the training data).

  Returns:
    The scaled frame. Columns whose std is exactly 0 are divided by 1 instead,
    so a constant column becomes 0.0 rather than NaN/inf.
  """
  # A constant column (for instance an indicator that never fires in the
  # training split) has std 0; dividing by it would poison the model input.
  safe_std=stats['std'].replace(0,1)
  return (df - stats['mean'])/safe_std

In [0]:
# Both sets are scaled with the statistics of the training set.
normed_train_data=norm(train_dataset,stats)
normed_test_data = norm(test_dataset, stats)

# 特征权重计算

In [0]:
def feature_importance(df,Xl,yl):
    """Rank features with an XGBoost model trained on ``df``.

    Side effect: writes the feature map file ``xgb.fmap`` to the working
    directory (one ``<index>\\t<name>\\tq`` line per feature).

    Args:
        df: frame holding both the feature columns and the label column.
        Xl: feature column names.
        yl: name of the label column.

    Returns:
        List of ``(feature_name, score)`` tuples as reported by
        ``Booster.get_fscore``, sorted by score in descending order.
    """
    frame=df.copy()

    with open('xgb.fmap',"w") as fmap_file:
        for idx,name in enumerate(Xl):
            fmap_file.write(f"{idx}\t{name}\tq\n")

    booster_params = {
            'min_child_weight': 0,
            'eta': 0.02,
            'colsample_bytree': 0.7,
            'max_depth': 12,
            'subsample': 0.7,
            'alpha': 1,
            'gamma': 1,
            'silent': 1,
            'verbose_eval': True,
            'seed': 12
        }

    labels=frame[yl]
    inputs=frame[Xl]
    train_matrix=xgb.DMatrix(inputs,label=labels)
    booster=xgb.train(booster_params,train_matrix,num_boost_round=100)

    scores=booster.get_fscore(fmap="xgb.fmap")
    # Highest score first.
    return sorted(scores.items(),key=lambda item: item[1],reverse=True)

In [126]:
# Rank the normalised training features with XGBoost.
# The normalised frame is `normed_train_data`; the name previously used here
# (`normed_train_dataset`) is not defined anywhere and fails on a fresh kernel.
# Run this before the feature-selection cell, which narrows `normed_train_data`.
all_cols=list(df.columns)
all_cols.remove("y")
feature_sel_data=normed_train_data.copy()
feature_sel_data["y"]=train_labels
importance=feature_importance(feature_sel_data,normed_train_data.columns,"y")
importance

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


[('duration', 275),
 ('age', 131),
 ('day', 113),
 ('pdays', 87),
 ('balance', 80),
 ('poutcome-success', 71),
 ('month-oct', 51),
 ('month-mar', 49),
 ('previous', 39),
 ('contact-unknown', 30),
 ('month-apr', 28),
 ('housing-no', 25),
 ('marital-married', 23),
 ('month-feb', 22),
 ('education-tertiary', 21),
 ('campaign', 20),
 ('month-jun', 17),
 ('poutcome-other', 13),
 ('contact-cellular', 13),
 ('month-may', 13),
 ('month-nov', 10),
 ('month-sep', 9),
 ('contact-telephone', 7),
 ('month-dec', 7),
 ('loan-no', 7),
 ('job-management', 6),
 ('job-blue-collar', 6),
 ('job-student', 6),
 ('month-jul', 5),
 ('job-retired', 4),
 ('job-technician', 4),
 ('housing-yes', 4),
 ('month-aug', 4),
 ('poutcome-failure', 4),
 ('default-yes', 3),
 ('default-no', 3),
 ('month-jan', 3),
 ('job-entrepreneur', 2),
 ('job-housemaid', 2),
 ('education-primary', 2),
 ('job-unemployed', 2),
 ('marital-divorced', 2),
 ('education-unknown', 2),
 ('job-unknown', 2),
 ('pdays--1', 1),
 ('loan-yes', 1)]

In [0]:
# Run settings read by later cells through `args.get(...)`.
# NOTE(review): "acc" and "run_time" look like results recorded from an earlier
# run rather than inputs, and "train_data_frac" is not read by any visible cell
# (the split above uses 0.8) - confirm.
args={"feature_count": 6, "train_data_frac": 0.85, "layers_count": [6, 5], "epochs": 5000, "acc": 0.9491525292396545, "run_time": 166.15646314620972}


In [0]:
# Keep only the top-ranked features. `feature_count` is read from `args` here
# because the cell that otherwise defines it runs later; without this line the
# cell raises NameError on a fresh kernel.
feature_count = args.get("feature_count", 5)
import_features = [feature for feature, v in importance[:feature_count]]
normed_train_data=normed_train_data[import_features]
normed_test_data = normed_test_data[import_features]

In [0]:
def build_model():
    """Build and compile the binary classifier.

    Reads two module-level names: ``layers_count`` (one entry per hidden Dense
    layer) and ``feature_count`` (width of the input). The network is a stack
    of ReLU Dense layers followed by a single sigmoid output unit.

    Returns:
        A compiled ``keras.Sequential`` model (rmsprop optimizer,
        binary cross-entropy loss, accuracy metric).
    """
    stack = [layers.Dense(layers_count[0], activation=tf.nn.relu, input_shape=[feature_count])]
    for units in layers_count[1:]:
        stack.append(layers.Dense(units, activation=tf.nn.relu))
    stack.append(layers.Dense(1, activation=tf.nn.sigmoid))

    model = keras.Sequential(stack)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [143]:
# Train the network on the selected features and report test accuracy/loss.
logging.info(f"evaluate model with arg {args}")
# `build_model()` reads the globals `feature_count` and `layers_count`.
feature_count = args.get("feature_count", 5)

layers_count = args.get("layers_count", [5])
logging.debug(train_dataset.describe())

model=build_model()
EPOCHS = args.get("epochs", 1000)
# Early stopping watches the validation loss with a patience of 10 epochs,
# so EPOCHS acts as an upper bound.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
logging.debug(normed_train_data.describe())
# validation_split=0.2 supplies the 'val_loss' that early stopping monitors.
history = model.fit(
    normed_train_data, train_labels,
    epochs=EPOCHS, validation_split=0.2, verbose=0, callbacks=[early_stop]
)
test_loss, test_acc = model.evaluate(normed_test_data, test_labels)

logging.info(f"acc: {test_acc} loss:{test_loss} args: {args}")

2019-05-30 22:31:58,507: INFO  evaluate model with arg {'feature_count': 6, 'train_data_frac': 0.85, 'layers_count': [6, 5], 'epochs': 5000, 'acc': 0.9491525292396545, 'run_time': 166.15646314620972}


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.




2019-05-30 22:32:03,508: INFO  acc: 0.9126105904579163 loss:0.23113595727270683 args: {'feature_count': 6, 'train_data_frac': 0.85, 'layers_count': [6, 5], 'epochs': 5000, 'acc': 0.9491525292396545, 'run_time': 166.15646314620972}
