In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew,norm
from tqdm import tqdm  
import re

### 导入数据Y

In [2]:
Y=pd.read_csv('../datasets/tianchi/health/origin_data/meinian_round1_train_20180408.csv',
              engine='python',encoding="gbk")
Y_pred=pd.read_csv('../datasets/tianchi/health/b_round/meinian_round1_test_b_20180505.csv',
                   engine='python',encoding="gbk")

Y.columns=["vid","Systolic",'Diastolic','Glycerin','HDC','LDC']
Y_pred.columns=["vid","Systolic",'Diastolic','Glycerin','HDC','LDC']

### 我们发现Y的前三项数据由于存在字符无法转成数值型数据、用正则表达式提取数据

In [3]:
m_test=Y_pred.shape[0]
m_train=Y.shape[0]
columns=['Systolic','Diastolic','Glycerin']
for col in columns:
    temp=[]
    for i in range(m_train):
        pattern = re.compile(r'\d+\.{0,1}\d+')   ##数值中间最多允许出现一个小数点
        try:
            temp.append(pattern.findall(Y[col][i])[0])
        except:
            temp.append(np.nan)
    Y[col]=temp
    Y[col]=Y[col].astype("float32")
    Y[col]=Y[col].fillna(Y[col].mean())

### distplot Y的列，我们可以发现Diastolic有两个异常,并且HDC有负值(取绝对值处理),

In [4]:
Y=Y[Y["Diastolic"]<200]        ## 删除异常值
Y["LDC"]=np.abs(Y["LDC"])      ## 将负值取绝对值
Y=Y.set_index("vid")
Y_pred=Y_pred.set_index("vid")

### 导入X  并生成数据透视表

In [56]:
# X=[]
# with open("../datasets/tianchi/health/origin_data/meinian_round1_data_part1_20180408.txt","r") as f:
#     for line in f.readlines():
#         x=line.strip().split("$")
#         X.append(x)

# X1=pd.DataFrame(X[1:],columns=["vid","table_id","field_results"])

# X=[]
# with open("../datasets/tianchi/health/origin_data/meinian_round1_data_part2_20180408.txt","r") as f:
#     for line in f.readlines():
#         x=line.strip().split("$")
#         X.append(x)
# X2=pd.DataFrame(X[1:],columns=["vid","table_id","field_results"])

# X=pd.concat([X1,X2],axis=0)

In [57]:
# X1_train=pd.pivot_table(X,index='vid',
#                        columns='table_id', 
#                        values='field_results',
#                        fill_value=np.nan,
#                        aggfunc=lambda x: ' '.join(x))

### 只提取train和test中有的数据并将其合并,一起进行数值处理

In [58]:
# train_list=Y["vid"].values
# X_train=X1_train.loc[train_list]

# ###this is data for prediction
# test_list=Y_pred["vid"].values
# X_test=X1_train.loc[test_list]

# # X_train.to_csv('../datasets/tianchi/health/train.csv',index=True)
# # X_test.to_csv('../datasets/tianchi/health/test.csv',index=True)




In [59]:
X_train=pd.read_csv('../datasets/tianchi/health/train.csv',low_memory=False)
X_test=pd.read_csv('../datasets/tianchi/health/test.csv',low_memory=False)
m_train=X_train.shape[0]
m_test=X_test.shape[0]

all_data=pd.concat([X_train,X_test],axis=0)
print("The shape of all data is {}".format(all_data.shape))

KeyboardInterrupt: 

### 删除缺失值超过97%的feature

In [58]:
to_drop=all_data.isnull().sum().sort_values(ascending=False)/len(all_data)*100
to_drop=to_drop[to_drop>97.0]
all_data.drop(to_drop.index,axis=1,inplace=True)

### 根据其他项信息提取性别Feature

In [59]:
gender=[]
all_data["0101"]=all_data["0101"].astype(str)
all_data["0102"]=all_data["0102"].astype(str)
all_data["0539"]=all_data["0539"].astype(str)
all_data["0120"]=all_data["0120"].astype(str)
all_data["0121"]=all_data["0121"].astype(str)
all_data["0929"]=all_data["0929"].astype(str)
for i in range(all_data.shape[0]):

    if "乳腺" in all_data["0101"].values[i]:
        gender.append("F")
    elif "乳房" in all_data["0101"].values[i]:
        gender.append("F")
    elif "乳腺" in all_data["0102"].values[i]:
        gender.append("F")
    elif "子宫" in all_data["0102"].values[i]:
        gender.append("F")
    elif "乳腺" in all_data["0121"].values[i]:
        gender.append("F")
    elif "子宫" in all_data["0121"].values[i]:
        gender.append("F")
    elif "阴道" in all_data["0539"].values[i]:
        gender.append("F")
    elif "妇科" in all_data["0539"].values[i]:
        gender.append("F")
    elif "宫颈" in all_data["0539"].values[i]:
        gender.append("F")
    elif "乳腺" in all_data["0929"].values[i]:
        gender.append("F")
    elif "小叶增生" in all_data["0929"].values[i]:
        gender.append("F")
    elif "前列腺" in all_data["0102"].values[i]:
        gender.append("M")
    elif "前列腺" in all_data["0120"].values[i]:
        gender.append("M")
    else:
        gender.append("unkown")
all_data["gender"]=gender

In [60]:
age=[]
all_data["3601"]=all_data["3601"].astype(str)
all_data["0102"]=all_data["0102"].astype(str)
all_data["0709"]=all_data["0709"].astype(str)
all_data["0730"]=all_data["0730"].astype(str)
all_data["0120"]=all_data["0120"].astype(str)
all_data["A202"]=all_data["A202"].astype(str)
all_data["0409"]=all_data["0409"].astype(str)
all_data["1102"]=all_data["1102"].astype(str)
all_data["1103"]=all_data["1103"].astype(str)
all_data["1308"]=all_data["1308"].astype(str)
all_data["0546"]=all_data["0546"].astype(str)
all_data["0984"]=all_data["0984"].astype(str)
for i in tqdm(range(all_data.shape[0])):

    if "骨质增生" in all_data["1102"].values[i]:  ###骨质增生与疏松
        age.append("old")
    elif "退行性变" in all_data["1102"].values[i]:
        age.append("old")
    elif "骨质增生" in all_data["1103"].values[i]:
        age.append("old")
    elif "骨质疏松" in all_data["3601"].values[i]:
        age.append("old")
    elif "减少" in all_data["3601"].values[i]:
        age.append("old")
    elif "骨密度降低" in all_data["3601"].values[i]:
        age.append("old")
    elif "绝经" in all_data["0546"].values[i]:    ###绝经情况
        age.append("old")
    elif "闭经" in all_data["0546"].values[i]:
        age.append("old")
    elif "停经" in all_data["0546"].values[i]:
        age.append("old")
    elif "绝经" in all_data["0102"].values[i]:
        age.append("old")
    elif "高血压" in all_data["0409"].values[i]:   ####三高病史
        age.append("old")
    elif "糖尿病" in all_data["0409"].values[i]:
        age.append("old")
    elif "冠心病" in all_data["0409"].values[i]:
        age.append("old")
        
    elif "增大" in all_data["0120"].values[i]:   ###前列腺增大
        age.append("old")
    elif "义齿" in all_data["0709"].values[i]:   ###是否有义齿
        age.append("old")
        
    elif "老年环" in all_data["1308"].values[i]:   ###老年眼科病
        age.append("old")
    elif "白内障" in all_data["1308"].values[i]:   
        age.append("old")
    elif "玻璃体浑浊" in all_data["1308"].values[i]:   
        age.append("old")
    elif "增生" in all_data["0984"].values[i]:   
        age.append("old")
    elif "lmp" in all_data["0546"].values[i].lower():
        age.append("young")
    elif "月经" in all_data["0546"].values[i]:
        age.append("young")
    elif "哺乳" in all_data["0546"].values[i]:
        age.append("young")
    else:
        age.append("unknown")
all_data["age"]=age

100%|██████████| 47735/47735 [00:03<00:00, 14233.28it/s]


In [61]:
all_data["num_items"]=all_data.isnull().sum(axis=1)

In [62]:
##将正常数据（表达不同），均映射为形同的值
#X['field_results'].value_counts().sort_values(ascending=False)[:50]
all_data=all_data.astype(str)
for col in tqdm(all_data.columns):
    all_data[col]=all_data[col].replace({"弃查":np.nan,           ## 将result数据做基本处理.相同意义的数据替换
                                         "正常 正常":'正常',
                                         "未见异常 未见异常":'正常',
                                         "未触及 未触及":"正常",
                                             "未见异常":"正常",
                                             "未见明显异常":"正常",
                                            "未见异常，活动自如":"正常",
                                            "健康":"正常",
                                            "整齐":"正常",
                                            "详见纸质报告":np.nan,
                                            "未查":np.nan,
                                            "未触及":"正常",
                                            "正常心电图":"正常",
                                             "窦性心律正常心电图 ":"正常",
                                            "骨量正常":"正常",
                                            "耳鼻喉检查未见异常":"正常",
                                            "外科检查未发现明显异常":"正常",
                                            "内科检查未发现明显异常":"正常",
                                            "右附件区未见明显异常回声":"正常",
                                            "胰腺大小、形态正常，边缘规整，内部回声均匀，胰管未见扩张。":"正常",
                                            "右肾大小、形态正常，包膜光滑，肾实质回声均匀，集合系统未见明显分离。":"正常",
                                            "左肾大小、形态正常，包膜光滑，肾实质回声均匀，集合系统未见明显分离。":"正常",
                                            "胆囊大小、形态正常，囊壁光整，囊腔内透声好，胆总管无扩张。":"正常",
                                            "膀胱充盈良好，壁光滑，延续性好，其内透声性良好，未见明显占位性病变。":"正常",
                                            "脾脏大小、形态正常，包膜光整，回声均匀。":"正常",
                                            "脾脏大小、形态正常，包膜光整，内光点均匀。":"正常",
                                            "右附件区未见明显异常回声。":"正常",
                                            "左附件区未见明显异常回声。":"正常",
                                            "肝、胆、胰、脾、左肾、右肾未发现明显异常":"正常",
                                            "肝脏大小、形态正常，包膜光整，肝内血管走行较清晰，回声均匀。":"正常",
                                            "前列腺大小、形态正常，包膜光滑完整，两侧对称，内部回声均匀。":"正常",
                                            "甲状腺形态大小正常，边界清晰，内部回声分布均匀，未见明显异常回声。":"正常",
                                            "双侧甲状腺大小形态正常，包膜光整，实质回声均匀，未见明显异常回声。CDFI：血流显示未见异常。":"正常",
                                            "胸廓对称，双肺纹理清晰，走行自然，未见异常实变影，双肺门不大。纵隔窗示纵隔无偏移，心影及大血管形态正常，纵隔内未见肿块及肿大淋巴结。胸腔内未见积液。":"正常",
                                            "脾脏大小测值正常，回声均匀，脾静脉测值正常。":"正常",
                                            "甲状腺彩超未发现明显异常":"正常",
                                            "肝脏大小、形态正常，包膜光整，肝内血管走行较清晰，光点分布尚均匀，其内未见明显异常光团。":"正常",
                                            "无特殊记载":"正常",
                                            "胰腺头、体、尾大小测值正常，内回声均匀。":"正常",
                                            "前列腺未发现明显异常":"正常",
                                            "双侧颈总动脉管径对称，内中膜不增厚,血流速度正常。双侧颈总动脉分叉处管径对称，内中膜不增厚，血流速度正常。双侧颈内、外动脉管径对称，管壁回声正常，血流速度正常。":"正常",
                                            "回声正常，血流速度正常。":"正常",
                                            "胆囊大小正常，壁光滑，腔内暗区清晰，胆总管测值正常范围。":"正常"})

100%|██████████| 431/431 [00:47<00:00,  9.09it/s]


In [63]:
all_data.to_csv("../datasets/tianchi/health/v9/all_data_pivot-v9.csv",index=False)

## Feature enginering 

In [64]:
all_data=pd.read_csv("../datasets/tianchi/health/v9/all_data_pivot-v9.csv",low_memory=False)
print(all_data.shape)

(47735, 431)


### 提取 numeric feature
以下用mean填充

In [65]:
columns=["0424","10004","1117","1321","1322","190","191","192","2403","2404","2405","316","320",
         "1814","1815","1840","1850","2372","31","32","33","34","38","39","37","312","313","315",
         "2406","1127","155","269003","269004","269005","269006","269008","269009","269010",
        "269012","269013","269014","269015","269016","269017","269018","269019","269020","269021",
        "269022","269023","269024","269025","1845"]

for col in tqdm(columns):
    all_data[col]=all_data[col].astype(str)
    temp=[]
    for i in range(len(all_data)):
        pattern = re.compile(r'\d+\.{0,1}\d+')
        try:
            temp.append(pattern.findall(all_data[col][i])[0])
        except:
            temp.append(np.nan)
    all_data[col]=temp
    all_data[col]=all_data[col].astype("float32")
    all_data[col]=all_data.groupby(["gender","age"])[col].transform(lambda x: x.fillna(x.mean()))

100%|██████████| 53/53 [00:54<00:00,  1.02s/it]


In [66]:
columns1=["0424","10004","1117","1321","1322",'459161', '809035', '459156', '319100', '0111', '809016',
         "2410","2165","2411","2421","2413","10013","2168","1842","2412","300028","300048","300113","709001",
         "100008","300044","0104","300067","300125","0107","300009","300014","809003",
         '459155', '0105', '459158', '0106', '300006', '311', '3184', '35', '300129', '0109', '310', 
         "1814","1815","183","1840","1850","31","32","33","34","38","39","37","312",'809030', '36',
         "313","315","316","320","190","191","192","2403","2404","2405",'300069', '459159', '0108', '1124',
         "2406","1127","155","269003","269004","269005","269006","269008","269009","269010", '459154',
        "269012","269013","269014","269015","269016","269017","269018","269019","269020","269021",
        "269022","269023","269024","269025","100012","100013","100014","10009","1106","1107","1112","1325",
        "1326","139","143","1474","2386","2409","269007","300001","300008","300011","300012","300013",
        "300021","300092","669001","669002","669004","669005","669006","669009","669021","809001",
        "809004","809008","809009","809010","809013","809017","809021","809023","809025","809026","979001","979002",
        "979003","979004","979005","979006","979007","979008","979009","004997","1110","1319","1320","1844","1873",
        "189","20002","279006","300007","300068","300070","300074","300076","669003","669007","669008",
        "809013","809018","809019","809022","809027","1125","1331","1845","979011","979012",
        "2390","2407","2986","300035","30006","300078","321","809002","809007","809020","809024","809029",
        "809031","809032","809033","809034","979010","979025","979026","979027","A701","A703",
        ]+[str(i) for i in range(979013,979024,1)]+[str(i) for i in range(809037,809062,1)]
for col in tqdm(set(columns1)-set(columns)):
    all_data[col]=all_data[col].astype(str)
    temp=[]
    for i in range(len(all_data)):
        pattern = re.compile(r'\d+\.{0,1}\d+')
        try:
            temp.append(pattern.findall(all_data[col][i])[0])
        except:
            temp.append(np.nan)
    all_data[col]=temp
    all_data[col]=all_data[col].astype("float32")
    all_data[col]=all_data[col].fillna(all_data[col].mean())

100%|██████████| 179/179 [02:45<00:00,  1.08it/s]


以下数据用0填充

In [67]:
columns=["300005","3429","3193","3730","2177","2376","300017","300018","300019","979024","269026",
         "669024","2371","300036","1363"]
for col in tqdm(columns):
    all_data[col]=all_data[col].astype(str)
    all_data[col].fillna("None",inplace=True)
    temp=[]
    for i,j in enumerate(all_data[col].values):
        
        if "+" in j or "阳性" in j:
            if col=="3730" or col=="300019" or col=="2371":
                temp.append(5)
            elif col=="669004":
                temp.append(1)
            else:
                temp.append(40)
        elif "-" in j or "阴性" in j:
            temp.append(0)
        else:
            pattern = re.compile(r'\d+\.{0,1}\d+')
            try:
                temp.append(pattern.findall(j)[0])
            except:
                temp.append(np.nan)
    all_data[col]=temp
    all_data[col]=all_data[col].astype("float32")
    all_data[col]=all_data[col].fillna(all_data[col].mean())

100%|██████████| 15/15 [00:02<00:00,  5.94it/s]


###夹杂着阳性数据的单独处理，阳性按照较大值处理,缺失值按照阴性0处理

根据后面模型的feature_importance，下面的数据，importance高，但缺失值非常多，我们着重处理   
严重：“193”，“10002”，“0425”，“319”，“2372”，“314”，“100007”，“2174”，“1115”，“2333”，“317”，“10003”，“100006”，“183”  
非常严重：”100005“，“269011”，“2420”，”1345“，

In [68]:
columns=["193","10002","0425","319","2372","314","100007","2174","1115","2333","317","10003",
         "100006","183","100005","269011","2420","1345"]
for col in tqdm(columns):
    all_data[col]=all_data[col].astype(str)
    temp=[]
    for i in range(len(all_data)):
        pattern = re.compile(r'\d+\.{0,1}\d+')
        try:
            temp.append(pattern.findall(all_data[col][i])[0])
        except:
            temp.append(np.nan)
    all_data[col]=temp
    all_data[col]=all_data[col].astype("float32")

100%|██████████| 18/18 [00:16<00:00,  1.08it/s]


In [69]:
# ###all_data.corr()["1345"].sort_values(ascending=False)[:10]

In [70]:
all_data["193"]=all_data.groupby(all_data["192"]>all_data["192"].mean()
                                )["193"].transform(lambda x: x.fillna(x.mean()))

all_data["10002"]=all_data.groupby(all_data["192"]>all_data["192"].mean()
                                )["10002"].transform(lambda x: x.fillna(x.mean()))

all_data["0425"]=all_data.groupby(all_data["0424"]>all_data["0424"].mean()
                                )["0425"].transform(lambda x: x.fillna(x.mean()))

all_data["319"]=all_data.groupby(all_data["312"]>all_data["312"].mean()
                                )["319"].transform(lambda x: x.fillna(x.mean()))

###没有相关性较好的数据，直接用平均值填充
all_data["2372"]=all_data["2372"].fillna(all_data["2372"].mean())

all_data["314"]=all_data.groupby(all_data["37"]>all_data["37"].mean()
                                )["314"].transform(lambda x: x.fillna(x.mean()))
all_data["100007"]=all_data["100007"].fillna(all_data["100007"].mean())

all_data["2174"]=all_data.groupby(all_data["1845"]>all_data["1845"].mean()
                                )["2174"].transform(lambda x: x.fillna(x.mean()))

all_data["1115"]=all_data.groupby(all_data["1117"]>all_data["1117"].median()
                                )["1115"].transform(lambda x: x.fillna(x.mean()))

all_data["2333"]=all_data["2333"].fillna(all_data["2333"].mean())

all_data["317"]=all_data.groupby(all_data["316"]>all_data["316"].mean()
                                )["317"].transform(lambda x: x.fillna(x.mean()))

all_data["10003"]=all_data.groupby(all_data["183"]>all_data["183"].mean()
                                )["10003"].transform(lambda x: x.fillna(x.mean()))

all_data["100006"]=all_data["100006"].fillna(all_data["100006"].mean())

all_data["100005"]=all_data.groupby(all_data["320"]>all_data["320"].mean()
                                )["100005"].transform(lambda x: x.fillna(x.mean()))

all_data["183"]=all_data.groupby(all_data["10003"]>all_data["10003"].mean()
                                )["183"].transform(lambda x: x.fillna(x.mean()))

all_data["269011"]=all_data.groupby(all_data["38"]>all_data["38"].mean()
                                )["269011"].transform(lambda x: x.fillna(x.mean()))

all_data["2420"]=all_data.groupby(all_data["0424"]>all_data["0424"].mean()
                                )["2420"].transform(lambda x: x.fillna(x.mean()))

all_data["1345"]=all_data.groupby(all_data["100012"]>all_data["100012"].mean()
                                )["1345"].transform(lambda x: x.fillna(x.mean()))

 找出现相关性最高的group后进行处理

In [71]:
all_data.to_csv("../datasets/tianchi/health/v9/all_data_pivot_numeric_done-v9.csv",index=False)

In [72]:
all_data=pd.read_csv("../datasets/tianchi/health/v9/all_data_pivot_numeric_done-v9.csv",low_memory=False)
# ##all_data[all_data.dtypes[all_data.dtypes !="object"].index].isnull().sum().sort_values()

In [73]:
# all_data.describe()

### 处理异常值

In [74]:
all_data=all_data.set_index("vid")
X_train=all_data[:-m_test]
X_test=all_data[-m_test:]

In [75]:
df=pd.concat([X_train,Y],axis=1)

In [76]:
# plt.scatter(df["459154"],df["Systolic"])

##### 其实也可以用方差，均值方法，提出异常值

In [77]:
#all_data.dtypes[all_data.dtypes !="object"].index
df=df[df["0104"]<100]
df=df[df["100008"]<10]
df=df[df["709001"]<1]
df=df[df["2412"]<100]
df=df[df["1842"]<100]
df=df[df["10013"]<400]
df=df[df["2411"]<1000]
df=df[df["1363"]<250]
df=df[df["2410"]<60]
df=df[df["A701"]<100]
df=df[df["809031"]<100]
df=df[df["300035"]<80]
df=df[df["2986"]<100]
df=df[df["2407"]<4000]
df=df[df["1331"]<4]
df=df[df["669008"]<100]
df=df[df["300076"]<100]
df=df[df["300074"]<20]
df=df[df["300070"]<15]
df=df[df["300068"]<50]
df=df[df["1873"]<40]
df=df[df["1844"]<4]
df=df[df["1110"]<50]
df=df[df["979014"]<2.0]
df=df[df["979002"]<1.5]
df=df[df["809025"]<15]
df=df[df["809009"]<15]
df=df[df["669021"]<100]
df=df[df["669006"]<60]
df=df[df["669004"]<40]
df=df[df["669002"]<15]
df=df[df["669001"]<40]
df=df[df["300017"]<20]
df=df[df["300012"]<20]
df=df[df["300008"]<4]
df=df[df["300001"]<60]
df=df[df["2376"]<800]
df=df[df["2177"]<200]
df=df[df["1474"]<300]
df=df[df["139"]<6]
df=df[df["1112"]<20]
df=df[df["10009"]<60]
df=df[df["100014"]<60]
df=df[df["100013"]<12]
df=df[df["269023"]<1.5]
df=df[df["269022"]<4]
df=df[df["269014"]<15]
df=df[df["269005"]<3]
df=df[df["155"]<40]
df=df[df["1345"]<400]
df=df[df["1127"]<3000]
df=df[df["34"]<2]
df=df[df["33"]<10]
df=df[df["317"]>220]
df=df[df["312"]<20]
df=df[df["2372"]<20]
df=df[df["2333"]<10]
df=df[df["10003"]<60]
df=df[df["10004"]<500]
df=df[df["1115"]<300]
df=df[df["10002"]<40]
df=df[df["1117"]<400]
df=df[df["1814"]<400]
df=df[df["1815"]<200]
df=df[(df["183"]>50)&(df["183"]<120)]
df=df[df["1850"]<40]
df=df[df["190"]<250]
df=df[df["192"]<100]
df=df[df["193"]<30]
df=df[df["2174"]>30]
df=df[(df["2403"]>20)&(df["2403"]<10000)]
df=df[df["2405"]>10]
df=df[df["300005"]<100]
df=df[df["3429"]<100]
df=df[df["3730"]<10]

In [78]:
df.shape

(37876, 435)

### 绘制heatmap

In [None]:
# fig=plt.figure(figsize=(12,10))
# sns.heatmap(df.corr())

In [79]:
X_train=df.iloc[:,:-5]
Y_train=df.iloc[:,-5:]

all_data=pd.concat([X_train,X_test],axis=0)
num_train=X_train.shape[0]
num_test=X_test.shape[0]
print(num_train,num_test,Y_train.shape)
Y_train.to_csv("../datasets/tianchi/health/v9/Y_train_numeric_done-v9.csv")

37876 9538 (37876, 5)


In [80]:
all_data.to_csv("../datasets/tianchi/health/v9/all_data_outlier_done-v9.csv")

##########################################################################
####数值型数据处理完成

In [5]:
all_data=pd.read_csv("../datasets/tianchi/health/b_round/all_data_outlier_done-v9.csv",low_memory=False)
all_data=all_data.set_index("vid")

### 处理离散型数据

用replace 及map函数处理离散变量

In [6]:
all_data["30007"]=all_data["30007"].replace({"Ⅱ":2,"Ⅲ":3,"Ⅰ":1,"Ⅳ":4,"Ⅱ":2,"Ⅲ":3,"Ⅰ":1,"Ⅳ":4,"Ⅱ度":2,
                                            "II":2,"Ⅲ度":3,"正常":0,"中度":2,"III":3,"ii°":2,"iii°":3,"Ⅰ°":1,
                                            "Ⅱ°":2,"Ⅰ度":1,"Ⅳ度":4,"见TCT":0,"yellow":0,"-":0,"结果见TCT":0,
                                            "Ⅳ°":4,"阴性":0,"微混":0,"Ⅲ°":3,"+":2,"I":1,"见刮片":0,"Ⅱv":2,
                                            "Ⅰ Ⅰ":1,np.nan:0,"iv°":4,"i°":1}).astype("float")

all_data["0431"]=all_data["0431"].replace({"无":"正常","无 无":"正常","无压痛点":"正常","未见异常 未见异常":"正常",np.nan:"未查"})

all_data["0976"]=all_data["0976"].replace({"无":"正常","无 无":"正常",np.nan:"未查"})



all_data["3400"]=all_data["3400"].map({"透明":"透明","浑浊":"浑浊","混浊":"浑浊","微混":"浑浊"})
all_data["3400"].fillna("透明",inplace=True)
all_data["0215"]=all_data["0215"].map({np.nan:"未查","正常":"正常"})
all_data["0215"]=all_data["0215"].fillna("异常")
all_data["0216"]=all_data["0216"].map({np.nan:"未查","正常":"正常","正常 正常":"正常","未见异常 未见异常":"正常"})
all_data["0216"]=all_data["0216"].fillna("异常")
all_data["0217"]=all_data["0217"].map({np.nan:"未查","正常":"正常","正常 正常":"正常","未见异常 未见异常":"正常"})
all_data["0217"]=all_data["0217"].fillna("异常")
all_data["0405"]=all_data["0405"].map({np.nan:"未查","未闻及":"正常","正常":"正常","无":"正常","未见异常 未见异常":"正常"})
all_data["0405"]=all_data["0405"].fillna("异常")
all_data["0406"]=all_data["0406"].map({np.nan:"未查","未触及 未触及":"正常","正常":"正常","未及":"正常",
                                       "未见异常 未见异常":"正常"})
all_data["0406"]=all_data["0406"].fillna("异常")
all_data["0407"]=all_data["0407"].map({np.nan:"未查","未触及 未触及":"正常","未及":"正常","正常":"正常",
                                       "未见异常 未见异常":"正常","未触及":"正常","不大":"正常",})
all_data["0407"]=all_data["0407"].fillna("异常")
all_data["0420"]=all_data["0420"].map({np.nan:"未查","未闻及异常":"正常","正常 正常":"正常","正常":"正常",
                                       "未见异常 未见异常":"正常","有力":"正常",})
all_data["0420"]=all_data["0420"].fillna("异常")


In [7]:

all_data["0409"]=all_data["0409"]+all_data["0434"]  ## 两次病史检查的数据合并处理，避免重复统计
temp=[]
all_data["0409"]=all_data["0409"].astype("str")
for i in range(len(all_data)):
    if "高血压" in all_data["0409"][i]:
        temp.append(1)
    elif "血压偏高" in all_data["0409"][i]:
        temp.append(1)
    else:
        temp.append(0)
all_data["高血压史"]=temp

temp=[]
for i in range(len(all_data)):
    if "高血脂" in all_data["0409"][i]:
        temp.append(1)
    elif "血脂偏高" in all_data["0409"][i]:
        temp.append(1)
    else:
        temp.append(0)
all_data["高血脂史"]=temp
                        
temp=[]
for i in range(len(all_data)):
    if "糖尿病" in all_data["0409"][i]:
        temp.append(1)
    elif "血糖偏高" in all_data["0409"][i]:
        temp.append(1)
    else:
        temp.append(0)
all_data["糖尿病史"]=temp

temp=[]
for i in range(len(all_data)):
    if "冠心病" in all_data["0409"][i]:
        temp.append(1)
    else:
        temp.append(0)
all_data["冠心病史"]=temp
                        
temp=[]
for i in range(len(all_data)):
    if "肝" in all_data["0409"][i]:
        temp.append(1)
    else:
        temp.append(0)
all_data["肝病史"]=temp

                        
temp=[]
all_data["0439"]=all_data["0439"].astype("str")
for i in range(len(all_data)):
    if "冠心病" in all_data["0439"][i]:
        temp.append(1)
    else:
        temp.append(0)
all_data["父母冠心病"]=temp
                        
temp=[]
for i in range(len(all_data)):
    if "高血压" in all_data["0439"][i]:
        temp.append(1)
    else:
        temp.append(0)
all_data["父母高血压"]=temp
                        
temp=[]
for i in range(len(all_data)):
    if "糖尿病" in all_data["0439"][i]:
        temp.append(1)
    else:
        temp.append(0)
all_data["父母糖尿病"]=temp

In [8]:
columns=["4001"]
for col in columns:
    temp=[]
    all_data[col]=all_data[col].astype("str")
    for i in range(len(all_data)):
        if "重度减弱" in all_data[col][i]:
            temp.append(3)
        elif "中度减弱" in all_data[col][i]:
            temp.append(2)
        elif "减弱" in all_data[col][i]:
            temp.append(1)
        elif "轻度硬化" in all_data[col][i]:
            temp.append(1)
        elif "硬化" in all_data[col][i]:
            temp.append(2)
        elif "钙化" in all_data[col][i]:
            temp.append(3)
        else:
            temp.append(0)
    all_data[col]=temp

columns=["0436"]
for col in columns:
    temp=[]
    all_data[col]=all_data[col].astype("str")
    for i in range(len(all_data)):
        if "无过敏" in all_data[col][i]:
            temp.append("正常")
        elif "过敏史不详" in all_data[col][i]:
            temp.append("正常")
        elif "过敏" in all_data[col][i]:
            temp.append("过敏")
        elif all_data[col][i]=='nan':
            temp.append("未查")
        else:
            temp.append("正常")
    all_data[col]=temp
    
columns=["1402"]
for col in columns:
    temp=[]
    all_data[col]=all_data[col].astype("str")
    for i in range(len(all_data)):
        if "增快" in all_data[col][i]:
            temp.append("增快")
        elif "减慢" in all_data[col][i]:
            temp.append("减慢")
        elif "弹性降低" in all_data[col][i]:
            temp.append("弹性降低")
        elif "顺应性降低" in all_data[col][i]:
            temp.append("顺应性降低")
        elif all_data[col][i]=='nan':
            temp.append("未查")
        else:
            temp.append("正常")
    all_data[col]=temp

    
columns=["A705"]
for col in columns:
    temp=[]
    all_data[col]=all_data[col].astype("str")
    for i in range(len(all_data)):
        if "脂肪肝" in all_data[col][i]:
            temp.append("脂肪肝")
        elif "脂肪含量超过正常值" in all_data[col][i]:
            temp.append("脂肪肝")
        elif "硬度值偏高" in all_data[col][i]:
            temp.append("肝硬化")
        elif all_data[col][i]=='nan':
            temp.append("未查")
        else:
            temp.append("正常")
    all_data[col]=temp
    
    
columns=["0987"]
for col in columns:
    temp=[]
    all_data[col]=all_data[col].astype("str")
    for i in range(len(all_data)):
        if "术后" in all_data[col][i]:
            temp.append("术后")
        elif all_data[col][i]=='nan':
            temp.append("未查")
        else:
            temp.append("正常")
    all_data[col]=temp
    
columns=["0984"]
for col in columns:
    temp=[]
    all_data[col]=all_data[col].astype("str")
    for i in range(len(all_data)):
        if "增生" in all_data[col][i]:
            temp.append("增生")
        elif all_data[col][i]=='nan':
            temp.append("未查")
        else:
            temp.append("正常")
    all_data[col]=temp
    
columns=["1308","1316","1330"]
for col in columns:
    temp=[]
    all_data[col]=all_data[col].astype("str")
    for i in range(len(all_data)):
        if "动脉硬化" in all_data[col][i]:
            temp.append("病变")
        elif "黄斑" in all_data[col][i]:
            temp.append("病变")
        elif "弧形斑" in all_data[col][i]:
            temp.append("病变")
        elif "色素斑" in all_data[col][i]:
            temp.append("病变")
        elif "病变" in all_data[col][i]:
            temp.append("病变")
        elif "豹纹状眼底" in all_data[col][i]:
            temp.append("病变")
        elif "结膜炎" in all_data[col][i]:
            temp.append("病变")
        elif all_data[col][i]=='nan':
            temp.append("未查")
        else:
            temp.append("正常")
    all_data[col]=temp




columns=["0113","0114","0115","0117","0118","0120","0121","0122","0123","0124"]
for col in columns:
    temp=[]
    all_data[col]=all_data[col].astype("str")
    for i in range(len(all_data)):
        if "高回声" in all_data[col][i]:
            temp.append("高回声")
        elif "强回声" in all_data[col][i]:
            temp.append("高回声")
        elif "低回声" in all_data[col][i]:
            temp.append("低回声")
        elif "弱回声" in all_data[col][i]:
            temp.append("低回声")
        elif "无回声" in all_data[col][i]:
            temp.append("无回声")
        elif "弥漫性" in all_data[col][i]:
            temp.append("弥漫性")
        elif "欠清晰" in all_data[col][i]:
            temp.append("欠清晰")
        elif all_data[col][i]=='nan':
            temp.append("未查")
        else:
            temp.append("正常")
    all_data[col]=temp



temp=[]
all_data["0421"]=all_data["0421"].astype("str")
for i in range(len(all_data)):
    if "早搏" in all_data["0421"][i]:
        temp.append("早搏")
    elif "房颤" in all_data["0421"][i]:
        temp.append("房颤")
    elif "过速" in all_data["0421"][i]:
        temp.append("过速")
    elif "过缓" in all_data["0421"][i]:
        temp.append("过缓")
    elif "不齐" in all_data["0421"][i]:
        temp.append("不齐")
    elif all_data["0421"][i]=='nan':
        temp.append("未查")
    else:
        temp.append("正常")
all_data["0421"]=temp

temp=[]
all_data["3601"]=all_data["3601"].astype("str")
for i in range(len(all_data)):
    if "严重骨质疏松" in all_data["3601"][i]:
        temp.append("严重骨质疏松")
    elif "疏松" in all_data["3601"][i]:
        temp.append("疏松")
    elif "减少" in all_data["3601"][i]:
        temp.append("减少")
    elif "降低" in all_data["3601"][i]:
        temp.append("降低")
    elif all_data["3601"][i]=='nan':
        temp.append("未查")
    else:
        temp.append("正常")
all_data["3601"]=temp



temp=[]
all_data["0426"]=all_data["0426"].astype("str")
for i in range(len(all_data)):
    if "收缩期杂音" in all_data["0426"][i]:
        temp.append("收缩期杂音")
    elif "舒张期杂音" in all_data["0426"][i]:
        temp.append("舒张期杂音")
    elif all_data["0426"][i]=='nan':
        temp.append("未查")
    else:
        temp.append("正常")
all_data["0426"]=temp

temp=[]
all_data["0435"]=all_data["0435"].astype("str")
for i in range(len(all_data)):
    if "腹部有压痛" in all_data["0435"][i]:
        temp.append("腹部有压痛")
    elif all_data["0435"][i]=='nan':
        temp.append("未查")
    else:
        temp.append("正常")
all_data["0435"]=temp

temp=[]
all_data["0730"]=all_data["0730"].astype("str")
for i in range(len(all_data)):
    if "义齿" in all_data["0730"][i]:
        temp.append("义齿")
    elif "有" in all_data["0730"][i]:
        temp.append("义齿")
    elif all_data["0730"][i]=='nan':
        temp.append("未查")
    else:
        temp.append("正常")
all_data["0730"]=temp

temp=[]
all_data["1328"]=all_data["1328"].astype("str")
for i in range(len(all_data)):
    if "色弱" in all_data["1328"][i]:
        temp.append("色弱")
    elif "色盲" in all_data["1328"][i]:
        temp.append("色盲")
    elif all_data["1328"][i]=='nan':
        temp.append("未查")
    else:
        temp.append("正常")
all_data["1328"]=temp

temp=[]
all_data["0210"]=all_data["0210"].astype("str")
for i in range(len(all_data)):
    if "鼻炎" in all_data["0210"][i]:
        temp.append("鼻炎")
    elif "鼻窦炎" in all_data["0210"][i]:
        temp.append("鼻窦炎")
    elif "息肉" in all_data["0210"][i]:
        temp.append("息肉")
    elif "大" in all_data["0210"][i]:
        temp.append("大")
    elif all_data["0210"][i]=='nan':
        temp.append("未查")
    else:
        temp.append("正常")
all_data["0210"]=temp

temp=[]
all_data["0423"]=all_data["0423"].astype("str")
for i in range(len(all_data)):
    if "粗" in all_data["0423"][i]:
        temp.append("粗")
    elif "弱" in all_data["0423"][i]:
        temp.append("弱")
    elif "消失" in all_data["0423"][i]:
        temp.append("消失")
    elif all_data["0423"][i]=='nan':
        temp.append("未查")
    else:
        temp.append("正常")
all_data["0423"]=temp

temp=[]
all_data["0911"]=all_data["0911"].astype("str")
for i in range(len(all_data)):
    if "淋巴结肿大" in all_data["0911"][i]:
        temp.append("淋巴结肿大")
    elif "淋巴结大" in all_data["0911"][i]:
        temp.append("淋巴结肿大")
    elif all_data["0911"][i]=='nan':
        temp.append("未查")
    else:
        temp.append("正常")
all_data["0911"]=temp

temp=[]
all_data["0912"]=all_data["0912"].astype("str")
for i in range(len(all_data)):
    if "不肿大" in all_data["0912"][i]:
        temp.append("正常")
    elif "无肿大" in all_data["0912"][i]:
        temp.append("正常")
    elif "结节" in all_data["0912"][i]:
        temp.append("结节")
    elif "肿大" in all_data["0912"][i]:
        temp.append("肿大")
    elif "欠光滑" in all_data["0912"][i]:
        temp.append("欠光滑")
    elif all_data["0912"][i]=='nan':
        temp.append("未查")
    else:
        temp.append("正常")
all_data["0912"]=temp


temp=[]
all_data["0973"]=all_data["0973"].astype("str")
for i in range(len(all_data)):
    if "已手术" in all_data["0973"][i]:
        temp.append("已手术")
    elif "疝" in all_data["0973"][i]:
        temp.append("疝")
    elif all_data["0973"][i]=='nan':
        temp.append("未查")
    else:
        temp.append("正常")
all_data["0973"]=temp

temp=[]
all_data["0974"]=all_data["0974"].astype("str")
for i in range(len(all_data)):
    if "皮炎" in all_data["0974"][i]:
        temp.append("皮炎")
    elif "癣" in all_data["0974"][i]:
        temp.append("癣")
    elif "疹" in all_data["0974"][i]:
        temp.append("疹")
    elif "银屑病" in all_data["0974"][i]:
        temp.append("银屑病")
    elif "白癜风" in all_data["0974"][i]:
        temp.append("白癜风")
    else:
        temp.append("正常")
all_data["0974"]=temp


In [9]:
columns=["100010","3190","3191","3192","3195","3196","3197","3207","3430","2228","2229","2230",
        "2233","2231","360","3301","3189","3194","3485","3486","2282","30002"]
for col in tqdm(columns):
    temp=[]
    all_data[col]=all_data[col].astype("str")
    for i in range(len(all_data)):
        if "++++" in all_data[col][i]:
            temp.append(4)
        elif "+++" in all_data[col][i]:
            temp.append(3)
        elif "++" in all_data[col][i]:
            temp.append(2)
        elif "+-" in all_data[col][i]:
            temp.append(0.5)
        elif "+" in all_data[col][i]:
            temp.append(1)
        elif "阳性" in all_data[col][i]:
            temp.append(1)
        elif all_data[col][i]=="nan":
            temp.append(np.nan)
        else:
            temp.append(0)
    all_data[col]=temp
    all_data[col]=all_data[col].fillna(all_data[col].mean())
    

100%|██████████| 22/22 [03:31<00:00,  9.60s/it]


In [10]:
columns=["2302"]
for col in columns:
    all_data[col]=all_data[col].replace({"正常":"健康"})
    temp=[]
    for i in range(len(all_data)):
        
        pattern = re.compile(r'[\u4e00-\u9fa5]+')
        try:
            temp.append(pattern.findall(all_data[col][i])[0])
        except:
            temp.append(np.nan)
    all_data[col]=temp
    all_data[col]=all_data[col].fillna("未查")
    all_data[col]=all_data[col].replace({"肥健康":"健康","正常疲劳反应":"健康"})

In [11]:
## 删除结构过于单一和无用features
all_data.drop(["1102","0116","0119","0201","0202","0731","0732","300131","3731","0409","0434","0439",
               "0203","0206","0207","0208","0209","0222","0403","0413","0429","0715","0726","0728","0702",
               "0501","0503","0509","0516","0537","0539","0541","0703","0705","0706","0707","0709","3813",
               "0901","0947","0949","0954","0972","0975","0977","0978","0979","0980","0985","1001",
               "1103","1301","1302","1303","1304","1305","1313","1314","1315","3399","A201","A202",
               "0212","0430","0432","0433","0976","0422","0427","1329","2501","979027","0986","1104",
               "A601","0225","0414","0415","0428","0440","0546","0981","0982","0983","0213","0929",
              "A301","A302","3101","0218","1335","3725","3738","1337","1002","0224","0441","0220",
              "439032"],axis=1,inplace=True)


In [12]:
# all_data.to_csv("../datasets/tianchi/health/v9/all_data_pivot_all_done-v94.csv") #### 除去0101,0102全部处理完成

### 检查是否仍有缺失值

In [13]:
# all_data=pd.read_csv("../datasets/tianchi/health/v9/all_data_pivot_all_done-v94.csv",low_memory=False)
# all_data.set_index("vid",inplace=True)

In [14]:
all_data.drop(["0101","0102","num_items"],axis=1,inplace=True)  ###0101检查项目0102中都有，因此drop

## box cox sknewed data

In [15]:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
# Check the skew of all numerical features
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})

skewness = skewness[abs(skewness.Skew) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)


Skew in numerical features: 

There are 247 skewed numerical features to Box Cox transform


### getdummy之后生成最终数据

In [16]:
all_data=pd.get_dummies(all_data,drop_first=True)

X_train=all_data.iloc[:-m_test,:]
X_test=all_data.iloc[-m_test:,:]

### 由于目标函数是log1p的平方差，所以我们对y进行log1p转换

In [17]:
Y_train=pd.read_csv("../datasets/tianchi/health/b_round/Y_train_numeric_done-v9.csv")
Y_train.set_index("vid",inplace=True)
Y_train=np.log(Y_train+1)

In [18]:
assert np.sum(X_train.index!=Y_train.index)==0
assert np.sum(X_test.index!=Y_pred.index)==0
print(X_train.shape,X_test.shape)

(37877, 430) (9532, 430)


### 建立模型 交叉验证性能

In [19]:
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC,LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error as mse
import xgboost as xgb
import lightgbm as lgb

In [20]:
n_folds=5
def rmse_cv(model,i):
    mse= -cross_val_score(model, X_train.values, Y_train.values[:,i], 
                                   scoring="neg_mean_squared_error", cv = n_folds)
    return(mse)

In [21]:
reg_lasso=make_pipeline(RobustScaler(), Lasso(alpha =0.00015, random_state=1,max_iter=10000))
# for i in range(5):
#     scores=rmse_cv(reg_lasso,i)
#     print("lasso scores {:.4f}(with std: {:.4f})".format(scores.mean(),scores.std()))

In [22]:
reg_ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0015, l1_ratio=.1, max_iter=10000,random_state=3))
# for i in range(5):
#     scores=rmse_cv(reg_ENet,i)
#     print("ENet scores {:.4f}(with std: {:.4f})".format(scores.mean(),scores.std()))

In [23]:
reg_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=20,   
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,n_jobs=4,
                              min_data_in_leaf =16, min_sum_hessian_in_leaf = 11)
# for i in range(3,4):
#     scores=rmse_cv(reg_lgb,i)
#     print("Lightgbm scores {:.5f}(with std: {:.5f})".format(scores.mean(),scores.std()))

Lightgbm scores 0.01278

In [24]:
reg_GDBT = GradientBoostingRegressor(n_estimators=1174, learning_rate=0.015,
                                   max_depth=9, max_features='sqrt',
                                   min_samples_leaf=46, min_samples_split=8,
                                   loss='huber', random_state =10) 
# for i in range(3,4):
#     scores=rmse_cv(reg_GDBT,i)
#     print("GDBT scores {:.5f}(with std: {:.5f})".format(scores.mean(),scores.std()))

GDBT scores 0.0142(with std: 0.0004)  
GDBT scores 0.0182(with std: 0.0005)  
GDBT scores 0.0736(with std: 0.0024)  
GDBT scores 0.0131(with std: 0.0014)  
GDBT scores 0.0369(with std: 0.0023)  

### 查看feature importance 

In [25]:
# def get_importance(model):   
    
#     columns=["Systolic","Diastolic","Glycerin","HDC","LDC"]
#     importance=pd.DataFrame()
#     for col in columns:
#         model.fit(X_train.values,Y_train[col])
#         importance["feature"]=X_train.columns.values
#         importance["importance_"+col]=model.feature_importances_
#     importance=importance.set_index("feature")
    
#     return importance

In [26]:
#importance=get_importance(reg_GDBT)
#importance.to_csv("../datasets/tianchi/health/importance of GDBT-97%.csv",)
#importance.head()

In [27]:
reg_xgb = xgb.XGBRegressor(colsample_bytree=0.7184, 
                           gamma=0.1253,n_estimators=740,n_jobs=4,
                             learning_rate=0.02, max_depth=8,
                             min_child_weight=16.154, reg_alpha=0.2695,
                             subsample=0.8171, silent=1,reg_lambda=0.1855,
                             )
# for i in range(5):
#     scores=rmse_cv(reg_xgb,i)
#     print("XGBoost scores {:.4f}(with std: {:.4f})".format(scores.mean(),scores.std()))

XGboost scores 0.0142(with std: 0.0004)  
XGboost scores 0.0183(with std: 0.0005)  
XGboost scores 0.0728(with std: 0.0025)  
XGboost scores 0.0131(with std: 0.0013)  
XGboost scores 0.0365(with std: 0.0025)  

In [28]:
reg_et=ExtraTreesRegressor(n_estimators=354,max_features=0.3,          
                           max_depth=68,n_jobs=-1,min_samples_split=2,
                             min_samples_leaf=6,random_state=42)
# for i in range(5):
#     scores=rmse_cv(reg_et,i)
#     print("ExtraTrees scores {:.4f}(with std: {:.4f})".format(scores.mean(),scores.std()))

### Try Averaged Model

In [29]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)
            
        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([model.predict(X) for model in self.models_])
        
        return np.mean(predictions, axis=1)

In [30]:
# averaged_models = AveragingModels(models = (reg_GDBT,reg_lgb,reg_xgb))

# for i in range(3,4):
#     scores=rmse_cv(averaged_models,i)
#     print("AveragingModels scores {:.5f}(with std: {:.5f})".format(scores.mean(),scores.std()))

### try Stacking by Stacknet which is coded by java ！ faster

In [31]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                
                self.base_models_[i].append(instance)

                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1) for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [32]:
reg_lasso_stack=make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1,max_iter=10000))

stacked_averaged_models = StackingAveragedModels(base_models = (reg_lgb,reg_GDBT,reg_lasso,reg_ENet,reg_et,reg_xgb),
                                                 meta_model = reg_lasso_stack )
# for i in range(3,4):
#     scores=rmse_cv(stacked_averaged_models,i)
#     print("stacked_averaged_models scores {:.5f}(with std: {:.5f})".format(scores.mean(),scores.std()))

stacked_averaged_models scores 0.0141(with std: 0.0004)  
stacked_averaged_models scores 0.0181(with std: 0.0005)  
stacked_averaged_models scores 0.0723(with std: 0.0026)  
stacked_averaged_models scores 0.0129(with std: 0.0014)  
stacked_averaged_models scores 0.0365(with std: 0.0027)  

### 目前看stacked 和XGB 效果较好
我们直接fit X_train 得出预测结果

In [None]:
stacked_averaged_models.fit(X_train.values,Y_train.values[:,3])
y4_stacked=stacked_averaged_models.predict(X_test.values)
print("y4 done!!")

创建X_train_copy

In [None]:
df_sub=pd.read_csv('../datasets/tianchi/health/b_round/meinian_round1_test_b_20180505.csv',
                       engine='python',encoding="gbk")
df_sub["血清高密度脂蛋白"]=np.exp(y4_stacked)-1

df_sub.to_csv('../datasets/tianchi/health/b_round/4_stacked_xgb.csv',index=False)