In [36]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold

# 模型的包
from sklearn.linear_model import Perceptron,SGDClassifier,LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import BernoulliNB

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)


In [2]:
train=pd.read_csv("./data/train.csv")#(891,12)
test=pd.read_csv("./data/test.csv")#(418,11)
train['is_train']=1
test['is_train']=0
data=pd.concat([train,test])

In [65]:
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,is_train,Cabins,familyNum,has_family,CabinA,CabinB,CabinC,CabinD,CabinE,CabinF,CabinG,CabinT,CabinZ,EmbarkedC,EmbarkedQ,EmbarkedS,TitleMaster,TitleMiss,TitleMr,TitleMrs,TitleRare,Sexfemale,Sexmale,female1,female2,female3,male1,male2,male3,Age_range
0,0.0,3,22.0,1,0,7.25,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,2
1,1.0,1,38.0,1,0,71.2833,1,1,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,2
2,1.0,3,26.0,0,0,7.925,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,2
3,1.0,1,35.0,1,0,53.1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,2
4,0.0,3,35.0,0,0,8.05,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,2


# 数据预处理
## 数据预处理

In [3]:
def getAgeRange(x):
    if 0<=x<16:return 1
    elif 16<=x<40:return 2
    else: return 3
data['Age_range']=data['Age'].apply(getAgeRange)

In [4]:
data['SexPclass']=data['Sex'].str.cat(data['Pclass'].map(lambda x:str(x)))

In [5]:
data['Fare']=data['Fare'].fillna(data['Fare'].median())# 有一个空值，中位数填充

data['Embarked']=data['Embarked'].fillna("S")# 就2个缺失值，填充了众数

# Cabin只有204个非空值
data['Cabins']=data['Cabin'].apply(lambda x:2 if len(str(x))>4 else(1 if pd.notna(x) else 0) )
data['Cabin']=data['Cabin'].str[:1]
data['Cabin']=data['Cabin'].fillna("Z")

# ticket比较复杂，先算了
# train['Ticket'].apply(lambda x:x.split(" ")[0]).value_counts()

data['Title']=data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
data['Title'] = data['Title'].replace(['Lady', 'Countess','Capt','Don','Col','Dr','Rev', 'Major', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
data['Title'] = data['Title'].replace('Mlle', 'Miss')
data['Title'] = data['Title'].replace('Ms', 'Miss')
data['Title'] = data['Title'].replace('Mme', 'Mrs')

data['familyNum']=data['SibSp']+data['Parch']
data['has_family']=data['familyNum'].apply(lambda x:1 if x>0 else 0)

data=data.drop(['Name','Ticket','PassengerId'],axis=1)

In [55]:
# ohe = OneHotEncoder(handle_unknown='ignore')

# ohe.fit(train[['Cabin','Embarked','Title','Sex']])#训练规则
# feature_names=ohe.get_feature_names(['Cabin','Embarked','Title','Sex'])#获取编码后的特征名
# data_train_onehot=pd.DataFrame(ohe.transform(train[['Cabin','Embarked','Title','Sex']]).toarray(),columns=feature_names)#应用规则在训练集上
 
# data_new_onehot=pd.DataFrame(ohe.transform(test[['Cabin','Embarked','Title','Sex']]).toarray(),columns=feature_names)#应用规则在预测集上

In [6]:
for col in ['Cabin','Embarked','Title','Sex','Age_range']:
    df=pd.get_dummies(data[col],sparse=True)
    for c in df.columns:
        df.rename(columns={c:col+str(c)},inplace=True)
    data=pd.concat([data.drop([col],axis=1),df],axis=1)

df=pd.get_dummies(data['SexPclass'],sparse=True)
data=pd.concat([data.drop(['SexPclass'],axis=1),df],axis=1)

# data=data.drop(['CabinA','CabinD','CabinF','CabinG','CabinT','Sexmale'],axis=1)

## 预测测试集是否生存，用来填充缺失值

In [79]:
X_train=np.array(data[data['is_train']==1].drop(['Survived','is_train'],axis=1))
y_train=np.array(data[data['is_train']==1]['Survived'])
X_test=np.array(data[data['is_train']==0].drop(['Survived','is_train'],axis=1))

In [80]:
from lightgbm import LGBMClassifier

best_model=None
folds=5
avg_score=0
avg_ccore_train=0
best_score=0
result=np.zeros(418)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=LGBMClassifier(learning_rate=0.07,max_depth=20,n_estimators=70)
    lgb.fit(X_train[train_index],y_train[train_index])
    
    score_train=lgb.score(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print('{}th fold train：{:.3f}, test：{:.3f}'.format(i,score_train,score))
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score+=score
    avg_ccore_train+=score_train
    result+=lgb.predict(X_test)
avg_score/=folds
avg_ccore_train/=folds
result/=folds

print("avg train score= {:.3f}, avg score={:.3f}".format(avg_ccore_train,avg_score))

0th fold train：0.927, test：0.816
1th fold train：0.933, test：0.770
2th fold train：0.920, test：0.803
3th fold train：0.903, test：0.888
4th fold train：0.917, test：0.848
avg train score= 0.920, avg score=0.825


In [62]:
data[data['is_train']==1].drop(['Survived','is_train'],axis=1).columns# CabinA,CabinD,CabinF,CabinG,CabinT,Sexmale

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabins', 'familyNum',
       'has_family', 'CabinA', 'CabinB', 'CabinC', 'CabinD', 'CabinE',
       'CabinF', 'CabinG', 'CabinT', 'CabinZ', 'EmbarkedC', 'EmbarkedQ',
       'EmbarkedS', 'TitleMaster', 'TitleMiss', 'TitleMr', 'TitleMrs',
       'TitleRare', 'Sexfemale', 'Sexmale', 'female1', 'female2', 'female3',
       'male1', 'male2', 'male3'],
      dtype='object')

In [63]:
lgb.feature_importances_

array([ 47, 711,  31,  30, 676,  22,  57,   1,   0,   6,  18,   0,   4,
         0,   0,   0,   5,  25,  10,  59,  15,  21,  26,  22,   6,  27,
         0,  17,  16,   8,   2,   9,  19])

## age缺失值填充
### 用lgb预测

In [8]:
X_test_pred=pd.read_csv("./results/result_20220522_ss.txt").iloc[:,1]# 历史最好成绩survival预测成绩
# X_test_pred=best_model.predict(X_test)
data.iloc[891:,0]=X_test_pred
data=data.reset_index(drop=True)

In [9]:
data_age=data[data['Age'].notnull()]
data_noAge=data[data['Age'].isnull()]

X_train_age=np.array(data_age.drop(['is_train','Age'],axis=1))
y_train_age=np.array(data_age['Age'])
X_test_age=np.array(data_noAge.drop(['is_train','Age'],axis=1))

In [10]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

best_model=None
folds=5
avg_score=0
best_score=0

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train_age)):
    lgb=LGBMRegressor(learning_rate=0.5,max_depth=5,n_estimators=7)
    lgb.fit(X_train_age[train_index],y_train_age[train_index])
    
    
    pred=lgb.predict(X_train_age[train_index])
    score0=mean_squared_error(pred,y_train_age[train_index])
    print(i,'th fold 0：',score0)
    
    pred=lgb.predict(X_train_age[test_index])
    score=mean_squared_error(pred,y_train_age[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score+=score
avg_score/=folds

print("avg score= ",avg_score)

0 th fold 0： 32.50407328911594
0 th fold： 42.29508833216884
1 th fold 0： 34.33495307804861
1 th fold： 42.189574012097964
2 th fold 0： 33.449934192719425
2 th fold： 45.4152860702446
3 th fold 0： 33.5138219126721
3 th fold： 43.22114768003491
4 th fold 0： 32.72438632536917
4 th fold： 43.36415271695179
avg score=  43.29704976229962


In [11]:
age_pred=best_model.predict(X_test_age)
data.iloc[data[data['Age'].isnull()].index,2]=age_pred

# 模型训练

In [12]:
X_train=np.array(data[data['is_train']==1].drop(['Survived','is_train'],axis=1))
y_train=np.array(data[data['is_train']==1]['Survived'])
X_test=np.array(data[data['is_train']==0].drop(['Survived','is_train'],axis=1))

In [108]:
# 模型函数
def train_model(model,params,X_train,y_train,X_test):
    folds=5
    train_Acc,valid_Acc=0,0
    train_pred,test_pred=np.zeros(891),np.zeros(418)

    kf=StratifiedKFold(n_splits=folds,shuffle=True,random_state=2022)
    model.set_params(**params)
    for i, (train_index,valid_index) in enumerate(kf.split(X_train,y_train)):
        model.fit(X_train[train_index],y_train[train_index])

        train_score=model.score(X_train[train_index],y_train[train_index])
        valid_score=model.score(X_train[valid_index],y_train[valid_index])
        print('{}th fold train：{:.3f}, valid：{:.3f}'.format(i,train_score,valid_score))

        valid_Acc+=valid_score
        train_Acc+=train_score
        try:
            test_pred+=model.predict_proba(X_test)[:,1]
            train_pred[valid_index]=model.predict_proba(X_train[valid_index])[:,1]
        except:
            test_pred+=model.predict(X_test)
            train_pred[valid_index]=model.predict(X_train[valid_index])

    valid_Acc/=folds
    train_Acc/=folds
    test_pred/=folds
    print("avg train score= {:.3f}, avg valid score={:.3f}".format(train_Acc,valid_Acc))
    
    return train_pred,test_pred,valid_Acc

## lgb

In [89]:
model=LGBMClassifier()
params={"learning_rate":0.07,"max_depth":12,"n_estimators":80}
train_pred_lgb,test_pred_lgb,valid_acc_lgb=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.923, test：0.832
1th fold train：0.914, test：0.848
2th fold train：0.916, test：0.843
3th fold train：0.924, test：0.820
4th fold train：0.924, test：0.860
avg train score= 0.920, avg score=0.841


## xgb

In [90]:
model=XGBClassifier()
params={"learning_rate":0.05,"max_depth":16,"n_estimators":73}
train_pred_xgb,test_pred_xgb,valid_acc_xgb=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.937, test：0.838
1th fold train：0.941, test：0.871
2th fold train：0.927, test：0.815
3th fold train：0.935, test：0.820
4th fold train：0.938, test：0.865
avg train score= 0.936, avg score=0.842


## catboost

In [91]:
model=CatBoostClassifier()
params={"learning_rate":0.08,"n_estimators":82}
train_pred_cat,test_pred_cat,valid_acc_cat=train_model(model,params,X_train,y_train,X_test)

0:	learn: 0.6510113	total: 15.6ms	remaining: 1.26s
1:	learn: 0.6133505	total: 17.3ms	remaining: 691ms
2:	learn: 0.5811610	total: 18.6ms	remaining: 489ms
3:	learn: 0.5453595	total: 19.8ms	remaining: 386ms
4:	learn: 0.5209648	total: 21.2ms	remaining: 326ms
5:	learn: 0.5020514	total: 22.4ms	remaining: 284ms
6:	learn: 0.4866192	total: 23.6ms	remaining: 253ms
7:	learn: 0.4697546	total: 24.8ms	remaining: 229ms
8:	learn: 0.4533939	total: 26.1ms	remaining: 211ms
9:	learn: 0.4408463	total: 27.4ms	remaining: 198ms
10:	learn: 0.4302414	total: 28.7ms	remaining: 185ms
11:	learn: 0.4229265	total: 29.9ms	remaining: 174ms
12:	learn: 0.4157579	total: 31ms	remaining: 165ms
13:	learn: 0.4091516	total: 32.1ms	remaining: 156ms
14:	learn: 0.4030553	total: 33.3ms	remaining: 149ms
15:	learn: 0.3978494	total: 34.5ms	remaining: 142ms
16:	learn: 0.3909924	total: 35.7ms	remaining: 136ms
17:	learn: 0.3870642	total: 36.8ms	remaining: 131ms
18:	learn: 0.3821718	total: 38ms	remaining: 126ms
19:	learn: 0.3765212	total

36:	learn: 0.3321895	total: 46.3ms	remaining: 56.3ms
37:	learn: 0.3312400	total: 47.6ms	remaining: 55.1ms
38:	learn: 0.3292916	total: 48.9ms	remaining: 53.9ms
39:	learn: 0.3274829	total: 50ms	remaining: 52.5ms
40:	learn: 0.3257858	total: 51.1ms	remaining: 51.1ms
41:	learn: 0.3247253	total: 52.3ms	remaining: 49.8ms
42:	learn: 0.3232847	total: 53.5ms	remaining: 48.5ms
43:	learn: 0.3220153	total: 54.6ms	remaining: 47.1ms
44:	learn: 0.3209593	total: 55.7ms	remaining: 45.8ms
45:	learn: 0.3196726	total: 57ms	remaining: 44.6ms
46:	learn: 0.3186319	total: 58.1ms	remaining: 43.3ms
47:	learn: 0.3179370	total: 59.3ms	remaining: 42ms
48:	learn: 0.3162861	total: 60.6ms	remaining: 40.8ms
49:	learn: 0.3148681	total: 61.9ms	remaining: 39.6ms
50:	learn: 0.3137347	total: 63.1ms	remaining: 38.4ms
51:	learn: 0.3131910	total: 64.3ms	remaining: 37.1ms
52:	learn: 0.3113711	total: 65.6ms	remaining: 35.9ms
53:	learn: 0.3108866	total: 66.9ms	remaining: 34.7ms
54:	learn: 0.3101254	total: 68.2ms	remaining: 33.5ms

69:	learn: 0.3147317	total: 104ms	remaining: 17.8ms
70:	learn: 0.3135985	total: 105ms	remaining: 16.3ms
71:	learn: 0.3129352	total: 106ms	remaining: 14.8ms
72:	learn: 0.3121112	total: 108ms	remaining: 13.3ms
73:	learn: 0.3110037	total: 109ms	remaining: 11.8ms
74:	learn: 0.3098939	total: 110ms	remaining: 10.3ms
75:	learn: 0.3088949	total: 111ms	remaining: 8.77ms
76:	learn: 0.3081278	total: 112ms	remaining: 7.3ms
77:	learn: 0.3077805	total: 114ms	remaining: 5.82ms
78:	learn: 0.3071031	total: 115ms	remaining: 4.36ms
79:	learn: 0.3056084	total: 116ms	remaining: 2.9ms
80:	learn: 0.3046020	total: 117ms	remaining: 1.45ms
81:	learn: 0.3037455	total: 119ms	remaining: 0us
4th fold train：0.886, test：0.865
avg train score= 0.898, avg score=0.829


## gbt

In [92]:
model=GradientBoostingClassifier()
params={"learning_rate":0.11,"n_estimators":222}
train_pred_gbt,test_pred_gbt,valid_acc_gbt=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.949, test：0.827
1th fold train：0.948, test：0.860
2th fold train：0.955, test：0.826
3th fold train：0.947, test：0.803
4th fold train：0.950, test：0.843
avg train score= 0.950, avg score=0.832


## RF

In [93]:
model=RandomForestClassifier()
params={"n_estimators":150,"max_depth":4,"min_samples_split":5,"min_samples_leaf":3}
train_pred_rf,test_pred_rf,valid_acc_rf=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.834, test：0.844
1th fold train：0.832, test：0.831
2th fold train：0.835, test：0.815
3th fold train：0.836, test：0.770
4th fold train：0.830, test：0.854
avg train score= 0.833, avg score=0.823


## lr

In [94]:
model=LogisticRegression()
params={}
train_pred_lr,test_pred_lr,valid_acc_lr=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.840, test：0.832
1th fold train：0.847, test：0.860
2th fold train：0.849, test：0.820
3th fold train：0.857, test：0.815
4th fold train：0.836, test：0.876
avg train score= 0.846, avg score=0.841


## SVC

In [95]:
model=LinearSVC()
params={"C":0.12,"tol":1e-4,"max_iter":1000}
train_pred_svc,test_pred_svc,valid_acc_svc=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.838, test：0.838
1th fold train：0.837, test：0.854
2th fold train：0.846, test：0.815
3th fold train：0.849, test：0.781
4th fold train：0.826, test：0.876
avg train score= 0.839, avg score=0.833


## knn

In [96]:
model=KNeighborsClassifier()
params={"n_neighbors":15,"p":1}
train_pred_knn,test_pred_knn,valid_acc_knn=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.816, test：0.810
1th fold train：0.811, test：0.815
2th fold train：0.819, test：0.736
3th fold train：0.806, test：0.792
4th fold train：0.804, test：0.831
avg train score= 0.811, avg score=0.797


## perceptron

In [97]:
model=Perceptron()
params={"penalty":'l2',"alpha":1e-5,"max_iter":19,"validation_fraction":0.1,"n_iter_no_change":10,"warm_start":True}
train_pred_pt,test_pred_pt,valid_acc_pt=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.779, test：0.810
1th fold train：0.767, test：0.781
2th fold train：0.809, test：0.787
3th fold train：0.804, test：0.753
4th fold train：0.784, test：0.815
avg train score= 0.789, avg score=0.789


## SGD

In [98]:
model=SGDClassifier()
params={"loss":'log',"alpha":1e-5,"max_iter":20,"validation_fraction":0.1,"n_iter_no_change":10,"warm_start":True}
train_pred_sgd,test_pred_sgd,valid_acc_sgd=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.802, test：0.821
1th fold train：0.798, test：0.809
2th fold train：0.771, test：0.753
3th fold train：0.759, test：0.719
4th fold train：0.721, test：0.770
avg train score= 0.770, avg score=0.774


## gp

In [99]:
model=GaussianProcessClassifier()
params={}
train_pred_gp,test_pred_gp,valid_acc_gp=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.963, test：0.749
1th fold train：0.961, test：0.747
2th fold train：0.965, test：0.708
3th fold train：0.959, test：0.725
4th fold train：0.966, test：0.798
avg train score= 0.963, avg score=0.745


##  bayes

In [100]:
model=BernoulliNB()
params={"alpha":20}
train_pred_by,test_pred_by,valid_acc_by=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.788, test：0.816
1th fold train：0.795, test：0.803
2th fold train：0.801, test：0.781
3th fold train：0.808, test：0.736
4th fold train：0.785, test：0.837
avg train score= 0.795, avg score=0.795


# 模型融合
## 简单平均

In [103]:
result1=(test_pred_lgb+test_pred_xgb+test_pred_cat+test_pred_gbt+test_pred_rf+test_pred_lr+test_pred_svc\
         +test_pred_knn+test_pred_pt+test_pred_sgd+test_pred_gp+test_pred_by)/12

In [198]:
result1_round=np.round(result1)
pd.DataFrame(data=np.c_[list(range(892,1310)),result1_round],columns=["PassengerId","Survived"],dtype=np.int64).to_csv('result_20220522_avg_bestSuvivalAge.txt',index=False,header=True)# 得分：0.73444

In [470]:
np.where(np.round(result2)!=np.round(result1))

(array([234, 280], dtype=int64),)

In [200]:
avg_0519=pd.read_csv("result_20220519_avg.txt")["Survived"].values

In [104]:
np.where(np.round(result2)!=np.round(result1))

(array([], dtype=int64),)

## 加权平均

In [101]:
result2=(valid_acc_lgb*test_pred_lgb\
        +valid_acc_xgb*test_pred_xgb\
        +valid_acc_cat*test_pred_cat\
        +valid_acc_gbt*test_pred_gbt\
        +valid_acc_rf*test_pred_rf\
        +valid_acc_lr*test_pred_lr\
        +valid_acc_svc*test_pred_svc\
        +valid_acc_knn*test_pred_knn\
        +valid_acc_pt*test_pred_pt\
        +valid_acc_sgd*test_pred_sgd\
        +valid_acc_gp*test_pred_gp\
        +valid_acc_by*test_pred_by)/(valid_acc_lgb+valid_acc_xgb+valid_acc_cat+valid_acc_gbt+valid_acc_rf+valid_acc_lr+valid_acc_svc\
                                 +valid_acc_knn+valid_acc_pt+valid_acc_sgd+valid_acc_gp+valid_acc_by)

In [328]:
np.where(result1_round!=result2_round)# 简单平均和加权平均结果一样

(array([], dtype=int64),)

## stacking

In [106]:
train_stacking=np.c_[train_pred_lgb,train_pred_xgb,train_pred_cat,train_pred_gbt,train_pred_rf,train_pred_lr,train_pred_svc,\
      train_pred_knn,train_pred_pt,train_pred_sgd,train_pred_gp,train_pred_by]
y_train_stacking=y_train.copy()
test_stacking=np.c_[test_pred_lgb,test_pred_xgb,test_pred_cat,test_pred_gbt,test_pred_rf,test_pred_lr,test_pred_svc,\
      test_pred_knn,test_pred_pt,test_pred_sgd,test_pred_gp,test_pred_by]

In [333]:
# 带上原数据的stacking
train_stacking=np.c_[X_train,train_result_lgb,train_result_xgb,train_result_cat,train_result_gbt,train_result_rf,train_result_lr,train_result_svc,\
      train_result_knn,train_result_pt,train_result_sgd,train_result_gp,train_result_by]
y_train_stacking=y_train.copy()
test_stacking=np.c_[X_test,result_lgb,result_xgb,result_cat,result_gbt,result_rf,result_lr,result_svc,\
      result_knn,result_pt,result_sgd,result_gp,result_by]

### rf

In [110]:
model=RandomForestClassifier()
params={"n_estimators":70,"max_depth":8,"min_samples_split":4,"min_samples_leaf":3}
train_s_pred_rf,test_s_pred_rf,valid_s_acc_rf=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.879, valid：0.838
1th fold train：0.864, valid：0.843
2th fold train：0.885, valid：0.820
3th fold train：0.881, valid：0.787
4th fold train：0.872, valid：0.876
avg train score= 0.876, avg valid score=0.833


In [143]:
np.where(np.round(result_rf_s)!=np.round(result1))

(array([  4,   6,  19,  21,  32,  33,  36,  37,  86, 125, 138, 158, 159,
        165, 169, 181, 192, 197, 199, 225, 242, 249, 252, 268, 280, 283,
        293, 323, 347, 359, 367, 382, 383, 412], dtype=int64),)

In [167]:
pd.DataFrame(data=np.c_[list(range(892,1310)),result_lr_s],columns=["PassengerId","Survived"],dtype=np.int64).to_csv('result_20220520_rf_s.txt',index=False,header=True)# 得分：0.73444

### lr

In [111]:
model=RandomForestClassifier()
params={}
train_s_pred_lr,test_s_pred_lr,valid_s_acc_lr=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.990, valid：0.821
1th fold train：0.987, valid：0.843
2th fold train：0.992, valid：0.820
3th fold train：0.987, valid：0.770
4th fold train：0.990, valid：0.820
avg train score= 0.989, avg valid score=0.815


In [145]:
np.where(np.round(result_lr_s)!=np.round(result1))

(array([  6,  19,  28,  36,  37,  41, 138, 158, 165, 169, 225, 242, 249,
        252, 268, 280, 283, 323, 347, 412], dtype=int64),)

### lgb

In [122]:
model=LGBMClassifier()
params={"learning_rate":0.1,"max_depth":10,"n_estimators":100}
train_s_pred_lgb,test_s_pred_lgb,valid_s_acc_lgb=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.947, valid：0.838
1th fold train：0.947, valid：0.860
2th fold train：0.952, valid：0.837
3th fold train：0.959, valid：0.809
4th fold train：0.948, valid：0.860
avg train score= 0.951, avg valid score=0.841


In [457]:
pd.DataFrame(data=np.c_[list(range(892,1310)),np.round(result_lgb_s)],columns=["PassengerId","Survived"],dtype=np.int64).to_csv('result_20220522_sso_lgb.txt',index=False,header=True)# 得分：0.73444

### cat

In [139]:
model=CatBoostClassifier()
params={"learning_rate":0.06,"n_estimators":70}
train_s_pred_cat,test_s_pred_cat,valid_s_acc_cat=train_model(model,params,X_train,y_train,X_test)

0:	learn: 0.6610697	total: 21.9ms	remaining: 1.51s
1:	learn: 0.6311421	total: 23.7ms	remaining: 805ms
2:	learn: 0.6033573	total: 37.6ms	remaining: 840ms
3:	learn: 0.5729409	total: 39.4ms	remaining: 651ms
4:	learn: 0.5508827	total: 41.5ms	remaining: 539ms
5:	learn: 0.5327376	total: 42.8ms	remaining: 457ms
6:	learn: 0.5175350	total: 44.1ms	remaining: 397ms
7:	learn: 0.5009654	total: 45.3ms	remaining: 351ms
8:	learn: 0.4847992	total: 46.5ms	remaining: 315ms
9:	learn: 0.4715334	total: 47.7ms	remaining: 286ms
10:	learn: 0.4663076	total: 48.4ms	remaining: 260ms
11:	learn: 0.4566568	total: 49.5ms	remaining: 239ms
12:	learn: 0.4477544	total: 50.6ms	remaining: 222ms
13:	learn: 0.4401259	total: 51.7ms	remaining: 207ms
14:	learn: 0.4333776	total: 52.9ms	remaining: 194ms
15:	learn: 0.4266652	total: 54.1ms	remaining: 183ms
16:	learn: 0.4195312	total: 55.3ms	remaining: 172ms
17:	learn: 0.4153514	total: 56.5ms	remaining: 163ms
18:	learn: 0.4113014	total: 57.7ms	remaining: 155ms
19:	learn: 0.4078607	t

41:	learn: 0.3401382	total: 65.9ms	remaining: 44ms
42:	learn: 0.3395426	total: 67.2ms	remaining: 42.2ms
43:	learn: 0.3382731	total: 68.4ms	remaining: 40.4ms
44:	learn: 0.3369495	total: 69.5ms	remaining: 38.6ms
45:	learn: 0.3358904	total: 70.6ms	remaining: 36.8ms
46:	learn: 0.3348484	total: 71.7ms	remaining: 35.1ms
47:	learn: 0.3343818	total: 72.9ms	remaining: 33.4ms
48:	learn: 0.3334072	total: 74.1ms	remaining: 31.8ms
49:	learn: 0.3325843	total: 75.3ms	remaining: 30.1ms
50:	learn: 0.3317454	total: 76.5ms	remaining: 28.5ms
51:	learn: 0.3303710	total: 77.7ms	remaining: 26.9ms
52:	learn: 0.3289884	total: 78.9ms	remaining: 25.3ms
53:	learn: 0.3277937	total: 80ms	remaining: 23.7ms
54:	learn: 0.3273666	total: 81.2ms	remaining: 22.1ms
55:	learn: 0.3270714	total: 82ms	remaining: 20.5ms
56:	learn: 0.3263741	total: 83.1ms	remaining: 19ms
57:	learn: 0.3251710	total: 84.4ms	remaining: 17.5ms
58:	learn: 0.3243718	total: 85.6ms	remaining: 16ms
59:	learn: 0.3231574	total: 86.9ms	remaining: 14.5ms
60:

### svc

In [141]:
model=LinearSVC()
params={"C":0.008,"tol":1e-4,"max_iter":100}
train_s_pred_svc,test_s_pred_svc,valid_s_acc_svc=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.826, valid：0.832
1th fold train：0.813, valid：0.809
2th fold train：0.830, valid：0.787
3th fold train：0.830, valid：0.753
4th fold train：0.812, valid：0.860
avg train score= 0.822, avg valid score=0.808


### knn

In [148]:
model=KNeighborsClassifier()
params={"n_neighbors":15,"p":1}
train_pred_knn,test_pred_knn,valid_acc_knn=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.816, valid：0.810
1th fold train：0.811, valid：0.815
2th fold train：0.819, valid：0.736
3th fold train：0.806, valid：0.792
4th fold train：0.804, valid：0.831
avg train score= 0.811, avg valid score=0.797


### gbt

In [149]:
model=GradientBoostingClassifier()
params={"learning_rate":0.06,"n_estimators":73}
train_pred_gbt,test_pred_gbt,valid_acc_gbt=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.889, valid：0.832
1th fold train：0.877, valid：0.843
2th fold train：0.884, valid：0.831
3th fold train：0.885, valid：0.792
4th fold train：0.874, valid：0.871
avg train score= 0.882, avg valid score=0.834


In [300]:
pd.DataFrame(data=np.c_[list(range(892,1310)),np.round(result_gbt_s)],columns=["PassengerId","Survived"],dtype=np.int64).to_csv('result_20220522_ss_gbt.txt',index=False,header=True)# 得分：0.73444

In [301]:
np.where(np.round(result_gbt_s)!=np.round(result1))

(array([  4,   6,  19,  21,  32,  33,  36,  37,  72,  86,  90, 125, 153,
        158, 159, 165, 169, 181, 192, 197, 199, 225, 242, 245, 249, 252,
        268, 280, 283, 293, 323, 344, 347, 359, 367, 382, 383, 412],
       dtype=int64),)

In [303]:
np.where(np.round(result_rf_s)!=np.round(result_gbt_s))

(array([ 72,  90, 138, 153, 245, 344], dtype=int64),)

In [304]:
np.where(np.round(result_lgb_s)!=np.round(result_gbt_s))

(array([ 18,  21,  64,  87, 138, 200, 202, 245, 273, 323, 345], dtype=int64),)

### xgb

In [150]:
model=XGBClassifier()
params={"learning_rate":0.07,"max_depth":4,"n_estimators":61}
train_pred_xgb,test_pred_xgb,valid_acc_xgb=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.888, valid：0.844
1th fold train：0.868, valid：0.865
2th fold train：0.888, valid：0.809
3th fold train：0.891, valid：0.792
4th fold train：0.865, valid：0.871
avg train score= 0.880, avg valid score=0.836


### perceptron

In [167]:
model=Perceptron()
params={"penalty":'l2',"alpha":1e-5,"max_iter":18,"validation_fraction":0.1,"n_iter_no_change":20,"warm_start":True}
train_pred_pt,test_pred_pt,valid_acc_pt=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.778, valid：0.804
1th fold train：0.798, valid：0.820
2th fold train：0.808, valid：0.758
3th fold train：0.829, valid：0.781
4th fold train：0.805, valid：0.843
avg train score= 0.804, avg valid score=0.801


### sgd

In [177]:
model=SGDClassifier()
params={"loss":'log',"alpha":1e-5,"max_iter":20,"validation_fraction":0.1,"n_iter_no_change":10,"warm_start":True}
train_s_pred_sgd,test_s_pred_sgd,valid_s_acc_sgd=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.779, valid：0.804
1th fold train：0.783, valid：0.803
2th fold train：0.788, valid：0.758
3th fold train：0.764, valid：0.669
4th fold train：0.788, valid：0.826
avg train score= 0.781, avg valid score=0.772


### bayes

In [178]:
model=BernoulliNB()
params={"alpha":1}
train_s_pred_by,test_s_pred_by,valid_s_acc_by=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.788, valid：0.821
1th fold train：0.797, valid：0.798
2th fold train：0.798, valid：0.764
3th fold train：0.809, valid：0.742
4th fold train：0.781, valid：0.837
avg train score= 0.795, avg valid score=0.792


### gp

In [180]:
model=GaussianProcessClassifier()
params={}
train_s_pred_gp,test_s_pred_gp,valid_s_acc_gp=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.963, valid：0.749
1th fold train：0.961, valid：0.747
2th fold train：0.965, valid：0.708
3th fold train：0.959, valid：0.725
4th fold train：0.966, valid：0.798
avg train score= 0.963, avg valid score=0.745


### 平均

In [324]:
result_feature_1=(result_lgb_s+result_xgb_s+result_cat_s\
          +result_gbt_s+result_rf_s+result_lr_s\
         +result_svc_s+result_knn_s+result_pt_s\
          +result_sgd_s+result_gp_s+result_by_s)/12

result_feature_2=(avg_score_lgb_s*result_lgb_s\
        +avg_score_xgb_s*result_xgb_s\
        +avg_score_cat_s*result_cat_s\
        +avg_score_gbt_s*result_gbt_s\
        +avg_score_rf_s*result_rf_s\
        +avg_score_lr_s*result_lr_s\
        +avg_score_svc_s*result_svc_s\
        +avg_score_knn_s*result_knn_s\
        +avg_score_pt_s*result_pt_s\
        +avg_score_sgd_s*result_sgd_s\
        +avg_score_gp_s*result_gp_s\
        +avg_score_by_s*result_by_s)/(avg_score_lgb_s+avg_score_xgb_s+avg_score_cat_s+avg_score_gbt_s+avg_score_rf_s+avg_score_lr_s\
                                      +avg_score_svc_s+avg_score_knn_s+avg_score_pt_s+avg_score_sgd_s+avg_score_gp_s+avg_score_by_s)

In [325]:
np.where(np.round(result_feature_1)!=np.round(result_feature_2))

(array([], dtype=int64),)

### 取前k个最好的

In [None]:
gbt 0.850   0.842
lgb 0.8485  0.848
cat 0.846   0.847
rf 0.845    0.841
xgb 0.840   0.846
gp 0.836    0.765
lr 0.835    0.825
svc 0.835   0.835
knn 0.833   0.813
bayes 0.808 0.806
sgd 0.802   0.819
perceptron 0.792  0.793

In [471]:
result_1=(np.round(result_lgb_s)+np.round(result_xgb_s)+np.round(result_cat_s)\
          +np.round(result_gbt_s)+np.round(result_rf_s)+np.round(result_lr_s)\
          +np.round(result_svc_s)+np.round(result_knn_s)+np.round(result_pt_s)\
          +np.round(result_sgd_s)+np.round(result_gp_s)+np.round(result_by_s))/12

In [459]:
np.where(np.round(resulto11)!=np.round(resulto22))

(array([359], dtype=int64),)

In [328]:
result111[np.where(np.round(result111)!=np.round(result_9))]

array([0.58333333, 0.58333333, 0.58333333])

In [329]:
result_9[np.where(np.round(result111)!=np.round(result_9))]

array([0.44444444, 0.44444444, 0.44444444])

In [461]:
np.where(np.round(result_lgb_s)!=np.round(resulto22))

(array([ 18,  32,  64,  87,  90, 153, 181, 192, 197, 200, 202, 273, 344,
        345, 367, 382, 383], dtype=int64),)

In [323]:
np.where(np.round(result_gbt_s)!=np.round(result_rf_s))

(array([ 72,  90, 138, 153, 245, 344], dtype=int64),)

In [324]:
np.where(np.round(result_gbt_s)!=np.round(result_xgb_s))

(array([ 34,  36, 245, 268, 273, 316, 323, 331, 412], dtype=int64),)

In [313]:
result111[np.where(np.round(result11)!=np.round(result111))]

array([0.5       , 0.5       , 0.5       , 0.48333333, 0.5       ])

In [297]:
np.where(np.round(result11)!=np.round(result1))

(array([  6,  19,  21,  33,  36,  37, 138, 158, 159, 165, 169, 197, 225,
        242, 249, 252, 268, 280, 283, 293, 323, 347, 367, 382, 412],
       dtype=int64),)

In [298]:
result11[np.where(np.round(result11)!=np.round(result1))]

array([0.18333333, 0.33333333, 0.55      , 0.36666667, 0.3       ,
       0.21666667, 0.43333333, 0.85      , 0.46666667, 0.28333333,
       0.18333333, 0.5       , 0.28333333, 0.21666667, 0.23333333,
       0.2       , 0.3       , 0.21666667, 0.28333333, 0.4       ,
       0.51666667, 0.33333333, 0.46666667, 0.5       , 0.4       ])

In [299]:
result1[np.where(np.round(result11)!=np.round(result1))]

array([0.52840008, 0.52437674, 0.45679954, 0.56448001, 0.55540819,
       0.55393837, 0.56096024, 0.43054273, 0.67248507, 0.59836365,
       0.55539291, 0.65666669, 0.58359148, 0.54862118, 0.58336977,
       0.52958313, 0.55540819, 0.52283612, 0.62816882, 0.52198703,
       0.36962838, 0.54255157, 0.68228422, 0.71778027, 0.52814656])

In [137]:
result11[np.where(np.round(result11)!=np.round(result1))]

array([0.18333333, 0.25      , 0.31666667, 0.25      , 0.25      ,
       0.43333333, 0.86666667, 0.5       , 0.25      , 0.23333333,
       0.45      , 0.25      , 0.18333333, 0.2       , 0.18333333,
       0.25      , 0.2       , 0.25      , 0.4       , 0.36666667,
       0.36666667])

In [138]:
result1[np.where(np.round(result11)!=np.round(result1))]

array([0.52840008, 0.52437674, 0.56448001, 0.55540819, 0.55393837,
       0.56096024, 0.43054273, 0.67248507, 0.59836365, 0.55539291,
       0.65666669, 0.58359148, 0.54862118, 0.58336977, 0.52958313,
       0.55540819, 0.52283612, 0.62816882, 0.52198703, 0.54255157,
       0.52814656])

In [135]:
np.where(np.round(result22)!=np.round(result1))

(array([  6,  19,  33,  36,  37, 138, 158, 159, 165, 169, 197, 225, 242,
        249, 252, 268, 280, 283, 293, 347, 412], dtype=int64),)

In [472]:
pd.DataFrame(data=np.c_[list(range(892,1310)),np.round(result_1)],columns=["PassengerId","Survived"],dtype=np.int64).to_csv('result_20220522_featurePclassSex_avg.txt',index=False,header=True)# 得分：0.73444