In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import math
import scipy as sp
from scipy import stats
from functools import partial
import matplotlib.pyplot as plt
%matplotlib inline

#sklearn生态
#sk辅助
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

#sk模型
from sklearn.linear_model import Ridge
 


#models 
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb

#torch生态
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader as DataLoader

In [None]:
#MAE metric, evaluated on the original (non-log) scale
def get_mae(pred,true):
    """Return the mean absolute error after undoing the log transform.

    Every value of ``pred`` and ``true`` is mapped back with ``e**v - 1``
    (the inverse of ``log1p``) and the two restored sequences are passed to
    ``mae`` (``sklearn.metrics.mean_absolute_error``).
    """
    restored_pred = [np.e**value -1 for value in pred]
    restored_true = [np.e**value -1 for value in true]
    return mae(restored_true, restored_pred)



#Temporary score that uses one fixed sigma for every sample; this function is meant to be removed later
def score_temp(y_true, y_pred, sigma = 200):
    """Mean of ``sqrt(2) * |y_true - y_pred| / sigma + log(sqrt(2) * sigma)``.

    Parameters
    ----------
    y_true, y_pred : array-like of equal length
        Observed and predicted values. They are converted with ``np.asarray``
        so elements are paired by position; a pandas Series with a
        non-default index is therefore handled correctly.
    sigma : float, default 200
        Fixed confidence value applied to every sample.

    Returns
    -------
    float or None
        The mean metric, or ``None`` (after printing ``'size error'``) when
        the two inputs differ in length.
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    if len(true_arr) != len(pred_arr):
        # Kept from the original contract: report the mismatch and return None.
        print('size error')
        return None
    sq2 = np.sqrt(2.)
    gap = np.abs(true_arr - pred_arr)
    return np.mean((gap / sigma)*sq2 + np.log(sigma* sq2))

#Lower score is better
def score (y_true, y_pred, confindience):
    """Mean of ``sqrt(2) * gap / sigma + log(sqrt(2) * sigma)`` with clipping.

    For every sample ``sigma`` is the confidence value floored at 70 and
    ``gap`` is the absolute error capped at 1000.

    Parameters
    ----------
    y_true, y_pred, confindience : array-like of equal length
        Observed values, predicted values and per-sample confidence values.
        They are converted with ``np.asarray`` so elements are paired by
        position; a pandas Series with a non-default index is therefore
        handled correctly.

    Returns
    -------
    float or None
        The mean metric, or ``None`` (after printing ``'size error'``) when
        the inputs differ in length.
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    conf_arr = np.asarray(confindience, dtype=float)
    if len(true_arr) != len(pred_arr) or len(true_arr) != len(conf_arr):
        # Kept from the original contract: report the mismatch and return None.
        print('size error')
        return None

    sq2 = np.sqrt(2.)
    sigma = np.maximum(70, conf_arr)
    gap = np.minimum(1000, np.abs(true_arr - pred_arr))
    return np.mean((gap / sigma)*sq2 + np.log(sigma* sq2))

In [None]:
# Load the competition's training table, test table and sample-submission template.
train_data = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
test_data = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

In [None]:
# Per the author's note the test set has no female patients and no rows with
# SmokingStatus == 'Currently smokes', so such rows are removed from the training set.
# .copy() detaches the filtered frame from the raw one, so the column edits below
# do not raise SettingWithCopyWarning.
keep_rows = (train_data.Sex == 'Male') & (train_data.SmokingStatus != 'Currently smokes')
train_data = train_data[keep_rows].copy()
# De-duplicate the training set: keep the first row per (Patient, Weeks).
train_data = train_data.drop_duplicates(subset=['Patient','Weeks'], keep='first')
# Sex is constant after the filter, so the column is dropped from both tables.
train_data = train_data.drop(columns='Sex')
test_data = test_data.drop(columns='Sex')


# Rename the columns of both tables.
# NOTE: positional assignment - relies on the remaining columns being in exactly this order.
train_data.columns = ['patient','week','FVC','percent','age','SmokingStatus']
test_data.columns = ['patient','base_week','base_FVC','percent','age','SmokingStatus']
# Patient_Week has the form '<patient>_<week>'; split it into its two parts.
submission['predict_week'] = submission['Patient_Week'].apply(lambda x: int(x.split('_')[1]))
submission['patient'] = submission['Patient_Week'].apply(lambda x: x.split('_')[0])

# Encode the categorical feature: 'Ex-smoker' -> 0, anything else -> 1.
train_data['SmokingStatus'] = (train_data['SmokingStatus'] != 'Ex-smoker').astype('int64')
test_data['SmokingStatus'] = (test_data['SmokingStatus'] != 'Ex-smoker').astype('int64')

In [None]:
# Build the test set: one row per requested Patient_Week, joined with that
# patient's baseline measurement, plus the week offset from the baseline.
test_data = (
    submission
    .drop(columns=['FVC', 'Confidence'])
    .merge(test_data, on='patient')
    .assign(gap_week=lambda frame: frame['predict_week'] - frame['base_week'])
    .drop(columns='predict_week')
)

In [None]:
# Build the training set: for every patient, pair each visit (used as the
# baseline) with all of that patient's visits.
# The per-baseline frames are collected in a list and concatenated once, which
# avoids repeatedly copying a growing DataFrame inside the loops. Row order is
# unchanged: patients in groupby order, then baseline weeks ascending.
pair_frames = []
for _, patient_rows in tqdm(train_data.groupby('patient')):
    for _week, base_rows in patient_rows.groupby('week'):
        # Keep only the baseline's identifying and measurement columns and
        # rename them so they do not clash with the target visit's columns.
        base_rows = base_rows.drop(columns=['age', 'SmokingStatus'])
        base_rows.columns = ['patient','base_week','base_FVC','base_percent']
        pair_frames.append(patient_rows.merge(base_rows, on = 'patient'))
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
train_data = pd.concat(pair_frames, axis = 0) if pair_frames else pd.DataFrame()

In [None]:
# Replace the absolute week columns with the offset between the target visit
# and the baseline visit.
train_data = (
    train_data
    .assign(gap_week=train_data['week'] - train_data['base_week'])
    .drop(columns=['week', 'base_week'])
)

In [None]:
# Remember the Patient_Week key of every test row, in test-row order.
test_patient_info = test_data['Patient_Week'].tolist()

In [None]:
# Drop the columns that are not used as model features.
train_data = train_data.drop(columns=['patient', 'percent'])
test_data = test_data.drop(columns=['Patient_Week', 'patient', 'base_week'])

In [None]:
# Give the test set the same column names as the training set.
# NOTE: positional assignment - relies on test_data holding exactly five
# columns in this order at this point.
test_data.columns = ['base_FVC','base_percent','age','SmokingStatus','gap_week']

In [None]:
# Reorder the training columns so the features come in the same order as in
# the test set, with the target FVC last. Selecting by name states the final
# order explicitly instead of relying on a drop/insert sequence with
# hard-coded positions.
train_data = train_data[['base_FVC','base_percent','age','SmokingStatus','gap_week','FVC']]


In [None]:
# Give both tables a fresh 0..n-1 index (the old index is discarded).
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Remove the training rows whose target visit is the baseline visit itself
# (gap_week == 0), i.e. where the FVC to predict equals base_FVC.
train_data = train_data[~(train_data['gap_week'] == 0)]

In [None]:
# Stack training rows on top of test rows under a fresh 0..n-1 index.
all_data = pd.concat([train_data, test_data], axis = 0, ignore_index = True)

In [None]:
# Combined features: for every ordered pair (i, j) of distinct base features add
#   i * max(j) + j   and   i / j
combo_features = ['base_FVC','base_percent','age','gap_week']
for i in tqdm(combo_features):
    for j in combo_features:
        if i != j:
            column_name = '{}_{}_multi'.format(i,j)
            all_data[column_name] = all_data[i] * np.max(all_data[j]) + all_data[j]
            column_name = '{}_{}divi'.format(i,j)
            # The builtin float replaces the np.float alias, which was removed
            # in NumPy 1.24 (both mean float64 here).
            # NOTE(review): a zero denominator (e.g. gap_week == 0) gives inf/NaN
            # - confirm the downstream models tolerate that.
            all_data[column_name] = (all_data[i] / all_data[j]).astype(float)

In [2]:
#暂时不进行数据归一化，模型如需要再单独做归一化

In [None]:
# Split all_data back into its training and test parts.
# all_data was built as concat([train_data, test_data]), so the first
# len(train_data) rows are the training rows; deriving the boundary from the
# frame avoids a hard-coded row count that breaks when the filtering changes.
n_train = len(train_data)
train_part = all_data.iloc[:n_train]
test_part = all_data.iloc[n_train:]
train_y = train_part['FVC'].values
# drop() returns new frames, so no column is deleted from a slice of all_data.
train_data = train_part.drop(columns='FVC')
train_x = train_data.values
test_data = test_part.drop(columns='FVC')
test_x = test_data.values

In [None]:
# Reduce the skew of the target by moving it to log1p space.
train_y = np.log1p(train_y)

In [3]:
#模型部分
#调参的代码就不放上来了，没啥意义，而且很多调试过的代码我都删掉了。

In [5]:
#lightgbm

In [None]:
# 5-fold cross-validation with LightGBM; one model is kept per fold.
kf = KFold(n_splits=5, shuffle=False)
models = []
# Out-of-fold absolute error (original FVC scale) of every training row.
valid_gap = np.zeros(len(train_y))
models_weight = []
for cnt, (tr_idx, val_idx) in tqdm(enumerate(kf.split(train_x)), total=kf.get_n_splits()):
    print('---------------FOLD:{} START----------------'.format(cnt))
    kfold_train_x = train_x[tr_idx]
    kfold_train_y = train_y[tr_idx]
    kfold_valid_x = train_x[val_idx]
    kfold_valid_y = train_y[val_idx]
    # Train this fold's model.
    model_lgb = lgb.LGBMRegressor(objective='regression',
                                  metric='mse',

                                  learning_rate=0.01,
                                  n_estimators=637,
                                  max_depth =4,
                                  num_leaves  = 10,

                                  feature_fraction = 0.6,
                                  subsample = 0.7,
                                  subsample_freq = 7,

                                  min_child_samples=122,
                                  min_split_gain=0,
                                  min_child_weight=0,

                                  reg_alpha =0.001,
                                  reg_lambda = 2
                                  )
    # NOTE(review): newer LightGBM releases take logging / early stopping via
    # callbacks instead of these fit() keywords - confirm against the
    # installed version.
    model_lgb.fit(X=kfold_train_x,
                  y=kfold_train_y,
                  eval_set=[(kfold_train_x, kfold_train_y), (kfold_valid_x, kfold_valid_y)],
                  eval_names=['train loss', 'valid loss'],
                  eval_metric='mse',
                  verbose=10,
                  early_stopping_rounds=300,
                  )
    valid_predict = np.asarray(model_lgb.predict(kfold_valid_x), dtype=np.float64)

    # Absolute errors on the original scale (expm1 inverts the log1p target transform).
    errors = np.abs(np.expm1(valid_predict) - np.expm1(kfold_valid_y))
    valid_gap[val_idx] = errors
    # Weight the fold by the inverse of ITS OWN validation MAE. Averaging
    # valid_gap here would mix in earlier folds' errors and the zeros of the
    # folds that have not been evaluated yet.
    models_weight.append(1/np.mean(errors))

    models.append(model_lgb)

In [None]:
# Rescale the fold weights so that they sum to 1.
weight_total = np.sum(models_weight)
models_weight = [weight / weight_total for weight in models_weight]

In [None]:
# Blend the fold models with equal weights, in log1p space.
# 1 / len(models) equals the previously hard-coded 0.2 for five folds and stays
# correct if the number of folds changes. (models_weight could be used instead.)
fold_weight = 1 / len(models)
all_predict = np.zeros(len(test_x))
for model in models:
    # float64 keeps the accumulation precision independent of the model's output dtype.
    all_predict += np.asarray(model.predict(test_x), dtype=np.float64) * fold_weight

# Back to the original FVC scale (expm1 inverts log1p).
all_predict = np.expm1(all_predict)

submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')
pred_info = submission['Patient_Week'].tolist()
# Row of each Patient_Week in the submission file (first occurrence, the same
# row list.index() would return), built once instead of searched per prediction.
row_of = {}
for row, key in enumerate(pred_info):
    row_of.setdefault(key, row)

# Put every prediction into the submission row of its Patient_Week key.
result = [0 for _ in range(len(submission))]
for pred, target in zip(all_predict, test_patient_info):
    result[row_of[target]] = pred

submission['FVC'] = result
submission['Confidence'] = 200
submission.to_csv('submission_lgb.csv',index = False)

In [None]:
#xgboost

In [None]:
#这里存在一个小问题，临时模型的名称还是lgb，实际上是xgb。因为这块的代码是我从上面lgb直接copy过来的，临时名称懒得改了。
#如果做工程，需要写个函数，统一一下，会规范很多。

In [None]:
# 5-fold cross-validation with XGBoost; one model is kept per fold.
# The per-fold model variable is still called model_lgb although it holds an
# XGBoost regressor.
kf = KFold(n_splits=5, shuffle=False)
models = []
# Out-of-fold absolute error (original FVC scale) of every training row.
valid_gap = np.zeros(len(train_y))
models_weight = []
for cnt, (tr_idx, val_idx) in tqdm(enumerate(kf.split(train_x)), total=kf.get_n_splits()):
    print('---------------FOLD:{} START----------------'.format(cnt))
    kfold_train_x = train_x[tr_idx]
    kfold_train_y = train_y[tr_idx]
    kfold_valid_x = train_x[val_idx]
    kfold_valid_y = train_y[val_idx]
    # Train this fold's model.
    model_lgb = xgb.XGBRegressor(random_state  = 2020,
                                 booster='gbtree',

                                 learning_rate = 0.01,
                                 n_estimators = 780,

                                 max_depth = 5,

                                 colsample_bytree = 0.5,
                                 colsample_bylevel =0.8,
                                 colsample_bynode= 1.0,
                                 subsample = 0.4,

                                 gamma = 0.09,
                                 min_child_weight = 80,

                                 reg_alpha = 0.01,
                                 reg_lambda = 3
                                 )
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # estimator rather than as a fit() keyword - confirm against the installed
    # version.
    model_lgb.fit(X=kfold_train_x,
                  y=kfold_train_y,
                  eval_set=[(kfold_train_x,kfold_train_y),(kfold_valid_x, kfold_valid_y)],
                  verbose=10,
                  early_stopping_rounds=300,
                  )
    valid_predict = np.asarray(model_lgb.predict(kfold_valid_x), dtype=np.float64)

    # Absolute errors on the original scale (expm1 inverts the log1p target transform).
    errors = np.abs(np.expm1(valid_predict) - np.expm1(kfold_valid_y))
    valid_gap[val_idx] = errors
    # Weight the fold by the inverse of ITS OWN validation MAE. Averaging
    # valid_gap here would mix in earlier folds' errors and the zeros of the
    # folds that have not been evaluated yet.
    models_weight.append(1/np.mean(errors))

    models.append(model_lgb)

In [None]:
# Rescale the fold weights so that they sum to 1.
weight_total = np.sum(models_weight)
models_weight = [weight / weight_total for weight in models_weight]

In [None]:
# Blend the fold models with equal weights, in log1p space.
# 1 / len(models) equals the previously hard-coded 0.2 for five folds and stays
# correct if the number of folds changes. (models_weight could be used instead.)
fold_weight = 1 / len(models)
all_predict = np.zeros(len(test_x))
for model in models:
    # float64 keeps the accumulation precision independent of the model's output dtype.
    all_predict += np.asarray(model.predict(test_x), dtype=np.float64) * fold_weight

# Back to the original FVC scale (expm1 inverts log1p).
all_predict = np.expm1(all_predict)

submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')
pred_info = submission['Patient_Week'].tolist()
# Row of each Patient_Week in the submission file (first occurrence, the same
# row list.index() would return), built once instead of searched per prediction.
row_of = {}
for row, key in enumerate(pred_info):
    row_of.setdefault(key, row)

# Put every prediction into the submission row of its Patient_Week key.
result = [0 for _ in range(len(submission))]
for pred, target in zip(all_predict, test_patient_info):
    result[row_of[target]] = pred

submission['FVC'] = result
submission['Confidence'] = 200
# NOTE(review): 'xtb' looks like a typo for 'xgb'; the file name is left as is
# because other code may read this exact name - confirm before renaming.
submission.to_csv('submission_xtb.csv',index = False)

In [4]:
#catboost

In [None]:
# 5-fold cross-validation with CatBoost; one model is kept per fold.
kf = KFold(n_splits=5, shuffle=False)
models = []
# Out-of-fold absolute error (original FVC scale) of every training row.
valid_gap = np.zeros(len(train_y))
models_weight = []
for cnt, (tr_idx, val_idx) in tqdm(enumerate(kf.split(train_x)), total=kf.get_n_splits()):
    print('---------------FOLD:{} START----------------'.format(cnt))
    kfold_train_x = train_x[tr_idx]
    kfold_train_y = train_y[tr_idx]
    kfold_valid_x = train_x[val_idx]
    kfold_valid_y = train_y[val_idx]
    # Train this fold's model.
    model_ctb = ctb.CatBoostRegressor(random_state = 2020,

                                      learning_rate=0.01,

                                      n_estimators = 576,
                                      depth = 5,

                                      colsample_bylevel = 0.4,
                                      subsample = 0.3,
                                      bagging_temperature = 0.5,

                                      min_child_samples = 0,

                                      reg_lambda = 17.4,
                                      )
    model_ctb.fit(X=kfold_train_x,
                  y=kfold_train_y,
                  eval_set=[(kfold_train_x,kfold_train_y),(kfold_valid_x, kfold_valid_y)],
                  verbose=10,
                  early_stopping_rounds=300,
                  )
    valid_predict = np.asarray(model_ctb.predict(kfold_valid_x), dtype=np.float64)

    # Absolute errors on the original scale (expm1 inverts the log1p target transform).
    errors = np.abs(np.expm1(valid_predict) - np.expm1(kfold_valid_y))
    valid_gap[val_idx] = errors
    # Weight the fold by the inverse of ITS OWN validation MAE. Averaging
    # valid_gap here would mix in earlier folds' errors and the zeros of the
    # folds that have not been evaluated yet.
    models_weight.append(1/np.mean(errors))

    models.append(model_ctb)

In [None]:
# Rescale the fold weights so that they sum to 1.
weight_total = np.sum(models_weight)
models_weight = [weight / weight_total for weight in models_weight]

In [None]:
# Blend the fold models with equal weights, in log1p space.
# 1 / len(models) equals the previously hard-coded 0.2 for five folds and stays
# correct if the number of folds changes. (models_weight could be used instead.)
fold_weight = 1 / len(models)
all_predict = np.zeros(len(test_x))
for model in models:
    # float64 keeps the accumulation precision independent of the model's output dtype.
    all_predict += np.asarray(model.predict(test_x), dtype=np.float64) * fold_weight

# Back to the original FVC scale (expm1 inverts log1p).
all_predict = np.expm1(all_predict)

submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')
pred_info = submission['Patient_Week'].tolist()
# Row of each Patient_Week in the submission file (first occurrence, the same
# row list.index() would return), built once instead of searched per prediction.
row_of = {}
for row, key in enumerate(pred_info):
    row_of.setdefault(key, row)

# Put every prediction into the submission row of its Patient_Week key.
result = [0 for _ in range(len(submission))]
for pred, target in zip(all_predict, test_patient_info):
    result[row_of[target]] = pred

submission['FVC'] = result
submission['Confidence'] = 200
submission.to_csv('submission_ctb.csv',index = False)

In [None]:
#最终的提交结果是几个模型预测结果的线性融合，融合的系数的调试没有技术含量 ，就不放上来了。