In [1]:
from shared.log_service import LogService
from shared.path_service import PathService
from injector import Injector
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Global injector that performs dependency injection for the services
injector = Injector()
# Obtain the logging service
log_service:LogService = injector.get(LogService)
# Obtain the path service (used below to locate the dataset file)
path_service:PathService = injector.get(PathService)

2018-09-30 19:54:39,827 - INFO - [LogService]:LogService initialized.


In [3]:
# Ask the path service for the "lab1/bank-full.csv" resource
# (presumably returns a filesystem path usable by pandas — confirm against PathService).
rawdf_path = path_service.get_resource("lab1/bank-full.csv")

In [4]:
# The file is semicolon-delimited, so the separator has to be given explicitly.
rawdf = pd.read_csv(rawdf_path,sep=";")
# Preview the first rows of the raw data.
rawdf.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
# Names of the object-dtype columns. `.values` replaces the removed
# `get_values()` accessor and yields the same ndarray. This expression is not
# the last one in the cell, so its value is computed but not displayed.
rawdf.columns.values[rawdf.dtypes.values == "object"]
# Preview the frame with the row labelled 1 dropped; rawdf itself is not modified.
rawdf.drop([1], axis=0).head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no


In [6]:
def get_replacement_parts(rawdf: pd.DataFrame, missing_marker="unknown"):
    """Compute per-column fill values, treating ``missing_marker`` as missing.

    Args:
        rawdf: Input frame. It is not modified.
        missing_marker: Value (or list of values) that stands for "missing"
            in the data. Defaults to the string ``"unknown"``.

    Returns:
        A ``(numeric_means, str_modes)`` tuple:

        * ``numeric_means`` - one-row DataFrame (index ``0``) holding the mean
          of every numeric column.
        * ``str_modes`` - DataFrame holding the mode(s) of every non-numeric
          column; row ``0`` is the first mode of each column.
    """
    # Partition on the dtypes of the *input* frame so that the replacement
    # below cannot move a column from one group to the other.
    numeric_cols = rawdf.select_dtypes(include="number").columns
    str_cols = rawdf.select_dtypes(exclude="number").columns

    # Replace the marker once and reuse the result for both statistics;
    # mean() and mode() both ignore NaN by default.
    cleaned = rawdf.replace(missing_marker, np.nan)

    # mean() returns a Series; wrap and transpose it into a single row so it
    # lines up with row 0 of the mode frame.
    numeric_means = pd.DataFrame(cleaned[numeric_cols].mean()).transpose()
    str_modes = cleaned[str_cols].mode()
    return numeric_means, str_modes


def get_replacements(df: pd.DataFrame):
    """Build a ``{column name: fill value}`` mapping for ``df``.

    The mapping combines the results of ``get_replacement_parts``: the mean of
    each numeric column followed by the first mode of each remaining column.

    Args:
        df: Frame to derive the fill values from.

    Returns:
        dict keyed by column name, numeric columns first, suitable for
        ``DataFrame.fillna``.
    """
    numeric_means, str_modes = get_replacement_parts(df)
    # Place both frames side by side; row 0 then carries the mean of every
    # numeric column and the first mode of every other column.
    replaced = pd.concat([numeric_means, str_modes], axis=1, sort=True)
    return {column: replaced.loc[0, column] for column in replaced.columns}

In [7]:
# Compute the per-column fill values on the raw data, then preview the numeric means.
numeric_means,str_modes = get_replacement_parts(rawdf)
numeric_means.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323


In [8]:
# Most frequent value of each object-dtype column, with 'unknown' treated as missing.
str_modes.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,blue-collar,married,secondary,no,yes,no,cellular,may,failure,no


In [9]:
# Flatten the means and modes into a single {column: fill value} dict and display it.
replaced = get_replacements(rawdf)
replaced

{'age': 40.93621021432837,
 'balance': 1362.2720576850766,
 'day': 15.80641879188693,
 'duration': 258.1630797814691,
 'campaign': 2.763840658246887,
 'pdays': 40.19782796222158,
 'previous': 0.5803233726305546,
 'job': 'blue-collar',
 'marital': 'married',
 'education': 'secondary',
 'default': 'no',
 'housing': 'yes',
 'loan': 'no',
 'contact': 'cellular',
 'month': 'may',
 'poutcome': 'failure',
 'y': 'no'}

In [10]:
from typing import Collection, Any


def remove_missing(df: pd.DataFrame, targets: Collection[Any]):
    """Return a copy of ``df`` with placeholder values imputed.

    Every value listed in ``targets`` is first turned into NaN; each NaN is
    then filled with the column's replacement value from ``get_replacements``.

    Args:
        df: Input frame. It is not modified.
        targets: Values that stand for "missing" (e.g. ``["unknown"]``).

    Returns:
        A new DataFrame with the same columns as ``df`` and missing values
        filled in.
    """
    # np.nan is the spelling that works on every NumPy version
    # (the np.NaN alias was removed in NumPy 2.0).
    ret_df: pd.DataFrame = df.replace(targets, np.nan)
    # Return the filled frame directly instead of mutating ret_df in place.
    return ret_df.fillna(get_replacements(ret_df))

In [11]:
# Treat the literal string "unknown" as missing and impute it column by column.
missing_removed_df = remove_missing(rawdf,["unknown"])
missing_removed_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,cellular,5,may,261,1,-1,0,failure,no
1,44,technician,single,secondary,no,29,yes,no,cellular,5,may,151,1,-1,0,failure,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,cellular,5,may,76,1,-1,0,failure,no
3,47,blue-collar,married,secondary,no,1506,yes,no,cellular,5,may,92,1,-1,0,failure,no
4,33,blue-collar,single,secondary,no,1,no,no,cellular,5,may,198,1,-1,0,failure,no


In [12]:
def discretize(df: pd.DataFrame, column: str, qbins: int):
    """Return a copy of ``df`` with ``column`` binned into quantile intervals.

    Args:
        df: Input frame. It is not modified.
        column: Name of the numeric column to discretize.
        qbins: Number of quantile bins passed to ``pd.qcut``.

    Returns:
        A new DataFrame in which ``column`` is a categorical of intervals;
        all other columns are unchanged.
    """
    ret_df = df.copy()
    # Assign the column as a whole. Writing through `.loc[:, column]` tries to
    # store the intervals inside the existing numeric column, which on current
    # pandas fails or upcasts instead of producing a `category` column.
    ret_df[column] = pd.qcut(df[column], qbins)
    return ret_df

In [13]:
# Bin 'age' into 10 quantile-based intervals and preview the result.
age_discretized_df = discretize(missing_removed_df, column="age", qbins=10)
age_discretized_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,"(56.0, 95.0]",management,married,tertiary,no,2143,yes,no,cellular,5,may,261,1,-1,0,failure,no
1,"(42.0, 46.0]",technician,single,secondary,no,29,yes,no,cellular,5,may,151,1,-1,0,failure,no
2,"(32.0, 34.0]",entrepreneur,married,secondary,no,2,yes,yes,cellular,5,may,76,1,-1,0,failure,no
3,"(46.0, 51.0]",blue-collar,married,secondary,no,1506,yes,no,cellular,5,may,92,1,-1,0,failure,no
4,"(32.0, 34.0]",blue-collar,single,secondary,no,1,no,no,cellular,5,may,198,1,-1,0,failure,no


In [14]:
# Bin 'balance' into 10 quantile-based intervals, on top of the binned ages.
balance_discretized_df = discretize(age_discretized_df, column="balance", qbins=10)
balance_discretized_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,"(56.0, 95.0]",management,married,tertiary,no,"(1859.0, 3574.0]",yes,no,cellular,5,may,261,1,-1,0,failure,no
1,"(42.0, 46.0]",technician,single,secondary,no,"(22.0, 131.0]",yes,no,cellular,5,may,151,1,-1,0,failure,no
2,"(32.0, 34.0]",entrepreneur,married,secondary,no,"(0.0, 22.0]",yes,yes,cellular,5,may,76,1,-1,0,failure,no
3,"(46.0, 51.0]",blue-collar,married,secondary,no,"(1126.0, 1859.0]",yes,no,cellular,5,may,92,1,-1,0,failure,no
4,"(32.0, 34.0]",blue-collar,single,secondary,no,"(0.0, 22.0]",no,no,cellular,5,may,198,1,-1,0,failure,no


In [15]:
# Drop fully identical rows; drop_duplicates returns a new frame by default.
deduplicated_df = balance_discretized_df.drop_duplicates()
deduplicated_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,"(56.0, 95.0]",management,married,tertiary,no,"(1859.0, 3574.0]",yes,no,cellular,5,may,261,1,-1,0,failure,no
1,"(42.0, 46.0]",technician,single,secondary,no,"(22.0, 131.0]",yes,no,cellular,5,may,151,1,-1,0,failure,no
2,"(32.0, 34.0]",entrepreneur,married,secondary,no,"(0.0, 22.0]",yes,yes,cellular,5,may,76,1,-1,0,failure,no
3,"(46.0, 51.0]",blue-collar,married,secondary,no,"(1126.0, 1859.0]",yes,no,cellular,5,may,92,1,-1,0,failure,no
4,"(32.0, 34.0]",blue-collar,single,secondary,no,"(0.0, 22.0]",no,no,cellular,5,may,198,1,-1,0,failure,no


In [16]:
# Inspect the column dtypes after binning and de-duplication.
deduplicated_df.dtypes

age          category
job            object
marital        object
education      object
default        object
balance      category
housing        object
loan           object
contact        object
day             int64
month          object
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome       object
y              object
dtype: object

In [17]:
import re


def normalize(df: pd.DataFrame):
    """Min-max scale every integer/float column of ``df`` to ``[0, 1]``.

    Columns of any other dtype (strings, categoricals, ...) are passed
    through unchanged.

    Args:
        df: Input frame. It is not modified.

    Returns:
        A new DataFrame with the same columns. A numeric column whose values
        are all equal maps to ``0.0`` rather than to NaN from a 0/0 division.
    """
    ret_df = df.copy()

    def converter(x: pd.Series):
        dtype = x.dtype
        # Use the pandas dtype predicates rather than pattern-matching the
        # dtype's string form, which also matches names like "interval[int64]".
        if not (pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype)):
            return x
        low = x.min()
        span = x.max() - low
        # Constant column: divide by 1 so the result is 0.0 instead of NaN.
        # The notna guard keeps all-missing columns on the plain division path.
        if pd.notna(span) and span == 0:
            span = 1
        return (x - low) / span

    return ret_df.apply(converter, axis=0)


In [18]:
# Min-max scale the remaining integer/float columns and preview the final frame.
finished_df = normalize(deduplicated_df)
finished_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,"(56.0, 95.0]",management,married,tertiary,no,"(1859.0, 3574.0]",yes,no,cellular,0.133333,may,0.05307,0.0,0.0,0.0,failure,no
1,"(42.0, 46.0]",technician,single,secondary,no,"(22.0, 131.0]",yes,no,cellular,0.133333,may,0.030704,0.0,0.0,0.0,failure,no
2,"(32.0, 34.0]",entrepreneur,married,secondary,no,"(0.0, 22.0]",yes,yes,cellular,0.133333,may,0.015453,0.0,0.0,0.0,failure,no
3,"(46.0, 51.0]",blue-collar,married,secondary,no,"(1126.0, 1859.0]",yes,no,cellular,0.133333,may,0.018707,0.0,0.0,0.0,failure,no
4,"(32.0, 34.0]",blue-collar,single,secondary,no,"(0.0, 22.0]",no,no,cellular,0.133333,may,0.04026,0.0,0.0,0.0,failure,no
