In [44]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split,KFold

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)


In [45]:
train=pd.read_csv("./data/train.csv")#(891,12)
test=pd.read_csv("./data/test.csv")#(418,11)
train['is_train']=1
test['is_train']=0
data=pd.concat([train,test])
data_bak=data.copy()

# 数据预处理
## 数据预处理

In [46]:
data['Fare']=data['Fare'].fillna(data['Fare'].median())# 有一个空值，中位数填充

data['Embarked']=data['Embarked'].fillna("S")# 就2个缺失值，填充了众数

# Cabin只有204个非空值
data['Cabins']=data['Cabin'].apply(lambda x:2 if len(str(x))>4 else(1 if pd.notna(x) else 0) )
data['Cabin']=data['Cabin'].str[:1]
data['Cabin']=data['Cabin'].fillna("Z")

# ticket比较复杂，先算了
# train['Ticket'].apply(lambda x:x.split(" ")[0]).value_counts()

data['Title']=data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
data['Title'] = data['Title'].replace(['Lady', 'Countess','Capt','Don','Col','Dr','Rev', 'Major', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
data['Title'] = data['Title'].replace('Mlle', 'Miss')
data['Title'] = data['Title'].replace('Ms', 'Miss')
data['Title'] = data['Title'].replace('Mme', 'Mrs')

data['familyNum']=data['SibSp']+data['Parch']
data['has_family']=data['familyNum'].apply(lambda x:1 if x>0 else 0)

data=data.drop(['Name','Ticket','PassengerId'],axis=1)

In [55]:
# ohe = OneHotEncoder(handle_unknown='ignore')

# ohe.fit(train[['Cabin','Embarked','Title','Sex']])#训练规则
# feature_names=ohe.get_feature_names(['Cabin','Embarked','Title','Sex'])#获取编码后的特征名
# data_train_onehot=pd.DataFrame(ohe.transform(train[['Cabin','Embarked','Title','Sex']]).toarray(),columns=feature_names)#应用规则在训练集上
 
# data_new_onehot=pd.DataFrame(ohe.transform(test[['Cabin','Embarked','Title','Sex']]).toarray(),columns=feature_names)#应用规则在预测集上

In [47]:
for col in ['Cabin','Embarked','Title','Sex']:
    df=pd.get_dummies(data[col],sparse=True)
    for c in df.columns:
        df.rename(columns={c:col+c},inplace=True)
    data=pd.concat([data.drop([col],axis=1),df],axis=1)
# data=data.drop(['CabinA','CabinB','CabinF','CabinG','CabinT','TitleRare'],axis=1)

## 预测测试集是否生存，用来填充缺失值

In [48]:
X_train=np.array(data[data['is_train']==1].drop(['Survived','is_train'],axis=1))
y_train=np.array(data[data['is_train']==1]['Survived'])
X_test=np.array(data[data['is_train']==0].drop(['Survived','is_train'],axis=1))

In [49]:
from lightgbm import LGBMClassifier

best_model=None
folds=5
avg_score=0
best_score=0
result=np.zeros(418)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=LGBMClassifier(learning_rate=0.07,max_depth=32,n_estimators=70)
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score+=score
    result+=lgb.predict(X_test)
avg_score/=folds
result/=folds

print("avg score= ",avg_score)

0 th fold： 0.8044692737430168
1 th fold： 0.7696629213483146
2 th fold： 0.8033707865168539
3 th fold： 0.9101123595505618
4 th fold： 0.8539325842696629
avg score=  0.8283095850856821


## age缺失值填充，用lgb预测

In [50]:
X_test_pred=best_model.predict(X_test)
data.iloc[891:,0]=X_test_pred
data=data.reset_index(drop=True)

In [51]:
data_age=data[data['Age'].notnull()]
data_noAge=data[data['Age'].isnull()]

X_train_age=np.array(data_age.drop(['is_train','Age'],axis=1))
y_train_age=np.array(data_age['Age'])
X_test_age=np.array(data_noAge.drop(['is_train','Age'],axis=1))

In [52]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

best_model=None
folds=5
avg_score=0
best_score=0

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train_age)):
    lgb=LGBMRegressor(learning_rate=0.5,max_depth=5,n_estimators=7)
    lgb.fit(X_train_age[train_index],y_train_age[train_index])
    
    
    pred=lgb.predict(X_train_age[train_index])
    score0=mean_squared_error(pred,y_train_age[train_index])
    print(i,'th fold 0：',score0)
    
    pred=lgb.predict(X_train_age[test_index])
    score=mean_squared_error(pred,y_train_age[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score+=score
avg_score/=folds

print("avg score= ",avg_score)

0 th fold 0： 92.07688785077193
0 th fold： 116.11721678928332
1 th fold 0： 92.07857399680114
1 th fold： 125.98550188876987
2 th fold 0： 93.60144946658201
2 th fold： 123.24232168543234
3 th fold 0： 93.33004877316607
3 th fold： 115.24514689872464
4 th fold 0： 96.41907057424133
4 th fold： 95.65952433752226
avg score=  115.24994231994648


In [53]:
age_pred=best_model.predict(X_test_age)
data.iloc[data[data['Age'].isnull()].index,2]=age_pred

# 模型训练

In [54]:
X_train=np.array(data[data['is_train']==1].drop(['Survived','is_train'],axis=1))
y_train=np.array(data[data['is_train']==1]['Survived'])
X_test=np.array(data[data['is_train']==0].drop(['Survived','is_train'],axis=1))

## lgb

In [58]:
from lightgbm import LGBMClassifier

best_model=None
folds=5
best_score=0
avg_score_lgb=0
result_lgb=np.zeros(418)
train_result_lgb=np.zeros(891)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=LGBMClassifier(learning_rate=0.06,max_depth=32,n_estimators=70)
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
        
    avg_score_lgb+=score
    result_lgb+=lgb.predict_proba(X_test)[:,1]
    train_result_lgb[test_index]=lgb.predict_proba(X_train[test_index])[:,1]
    
avg_score_lgb/=folds
result_lgb/=folds

print("avg score= ",avg_score_lgb)

0 th fold： 0.8379888268156425
1 th fold： 0.7808988764044944
2 th fold： 0.8089887640449438
3 th fold： 0.9101123595505618
4 th fold： 0.8651685393258427
avg score=  0.8406314732282969


## xgb

In [60]:
from xgboost import XGBClassifier

best_model=None
folds=5
best_score=0
avg_score_xgb=0
result_xgb=np.zeros(418)
train_result_xgb=np.zeros(891)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=XGBClassifier(learning_rate=0.07,max_depth=32,n_estimators=70)
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_xgb+=score
    result_xgb+=lgb.predict_proba(X_test)[:,1]
    train_result_xgb[test_index]=lgb.predict_proba(X_train[test_index])[:,1]
    
avg_score_xgb/=folds
result_xgb/=5

print("avg score= ",avg_score_xgb)

0 th fold： 0.8491620111731844
1 th fold： 0.8089887640449438
2 th fold： 0.8033707865168539
3 th fold： 0.898876404494382
4 th fold： 0.8426966292134831
avg score=  0.8406189190885694


## catboost

In [61]:
from catboost import CatBoostClassifier
best_model=None
folds=5
best_score=0
avg_score_cat=0
result_cat=np.zeros(418)
train_result_cat=np.zeros(891)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=CatBoostClassifier(learning_rate=0.2,n_estimators=70)
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_cat+=score
    result_cat+=lgb.predict_proba(X_test)[:,1]
    train_result_cat[test_index]=lgb.predict_proba(X_train[test_index])[:,1]
    
avg_score_cat/=folds
result_cat/=5

print("avg score= ",avg_score_cat)

0:	learn: 0.5704193	total: 1.42ms	remaining: 98.1ms
1:	learn: 0.5191577	total: 2.29ms	remaining: 78ms
2:	learn: 0.4731345	total: 3.38ms	remaining: 75.6ms
3:	learn: 0.4352703	total: 4.29ms	remaining: 70.8ms
4:	learn: 0.4137495	total: 5.2ms	remaining: 67.5ms
5:	learn: 0.3938276	total: 6.13ms	remaining: 65.4ms
6:	learn: 0.3774983	total: 7.04ms	remaining: 63.4ms
7:	learn: 0.3672136	total: 7.96ms	remaining: 61.7ms
8:	learn: 0.3589187	total: 8.96ms	remaining: 60.7ms
9:	learn: 0.3551809	total: 9.79ms	remaining: 58.7ms
10:	learn: 0.3477772	total: 10.8ms	remaining: 57.7ms
11:	learn: 0.3416886	total: 11.7ms	remaining: 56.5ms
12:	learn: 0.3408051	total: 12.2ms	remaining: 53.6ms
13:	learn: 0.3396790	total: 13ms	remaining: 52ms
14:	learn: 0.3354288	total: 14ms	remaining: 51.4ms
15:	learn: 0.3290559	total: 14.9ms	remaining: 50.4ms
16:	learn: 0.3274765	total: 15.9ms	remaining: 49.5ms
17:	learn: 0.3225848	total: 16.8ms	remaining: 48.5ms
18:	learn: 0.3197832	total: 17.7ms	remaining: 47.4ms
19:	learn: 0

18:	learn: 0.3429014	total: 17.8ms	remaining: 47.8ms
19:	learn: 0.3376640	total: 18.8ms	remaining: 47ms
20:	learn: 0.3346616	total: 19.8ms	remaining: 46.2ms
21:	learn: 0.3297585	total: 20.7ms	remaining: 45.2ms
22:	learn: 0.3260646	total: 21.7ms	remaining: 44.3ms
23:	learn: 0.3237255	total: 22.7ms	remaining: 43.5ms
24:	learn: 0.3222398	total: 23.6ms	remaining: 42.6ms
25:	learn: 0.3216728	total: 24.6ms	remaining: 41.6ms
26:	learn: 0.3197405	total: 25.6ms	remaining: 40.8ms
27:	learn: 0.3151398	total: 26.5ms	remaining: 39.8ms
28:	learn: 0.3140420	total: 27.5ms	remaining: 38.8ms
29:	learn: 0.3105485	total: 28.5ms	remaining: 37.9ms
30:	learn: 0.3097569	total: 29.5ms	remaining: 37.1ms
31:	learn: 0.3076757	total: 30.4ms	remaining: 36.1ms
32:	learn: 0.3063105	total: 31.4ms	remaining: 35.2ms
33:	learn: 0.3047101	total: 32.3ms	remaining: 34.2ms
34:	learn: 0.3012479	total: 33.2ms	remaining: 33.2ms
35:	learn: 0.2987600	total: 34.2ms	remaining: 32.3ms
36:	learn: 0.2949842	total: 35.1ms	remaining: 31

## gbt

In [62]:
from sklearn.ensemble import GradientBoostingClassifier
best_model=None
folds=5
best_score=0
avg_score_gbt=0
result_gbt=np.zeros(418)
train_result_gbt=np.zeros(891)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=GradientBoostingClassifier(learning_rate=0.2,n_estimators=150)
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_gbt+=score
    result_gbt+=lgb.predict_proba(X_test)[:,1]
    train_result_gbt[test_index]=lgb.predict_proba(X_train[test_index])[:,1]
    
avg_score_gbt/=folds
result_gbt/=folds

print("avg score= ",avg_score_gbt)

0 th fold： 0.8547486033519553
1 th fold： 0.7808988764044944
2 th fold： 0.8033707865168539
3 th fold： 0.9269662921348315
4 th fold： 0.8539325842696629
avg score=  0.8439834285355596


## RF

In [63]:
from sklearn.ensemble import RandomForestClassifier
best_model=None
folds=5
best_score=0
avg_score_rf=0
result_rf=np.zeros(418)
train_result_rf=np.zeros(891)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=RandomForestClassifier(n_estimators=100,max_depth=5,min_samples_split=4,min_samples_leaf=3)
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_rf+=score
    result_rf+=lgb.predict_proba(X_test)[:,1]
    train_result_rf[test_index]=lgb.predict_proba(X_train[test_index])[:,1]
    
avg_score_rf/=folds
result_rf/=folds

print("avg score= ",avg_score_rf)

0 th fold： 0.8379888268156425
1 th fold： 0.8202247191011236
2 th fold： 0.797752808988764
3 th fold： 0.8820224719101124
4 th fold： 0.8089887640449438
avg score=  0.8293955181721172


## lr

In [64]:
from sklearn.linear_model import LogisticRegression
best_model=None
folds=5
best_score=0
avg_score_lr=0
result_lr=np.zeros(418)
train_result_lr=np.zeros(891)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=LogisticRegression()
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_lr+=score
    result_lr+=lgb.predict_proba(X_test)[:,1]
    train_result_lr[test_index]=lgb.predict_proba(X_train[test_index])[:,1]
    
avg_score_lr/=folds
result_lr/=folds

print("avg score= ",avg_score_lr)

0 th fold： 0.8324022346368715
1 th fold： 0.7808988764044944
2 th fold： 0.797752808988764
3 th fold： 0.8932584269662921
4 th fold： 0.8146067415730337
avg score=  0.8237838177138912


## SVC

In [75]:
from sklearn.svm import LinearSVC

best_model=None
folds=5
best_score=0
avg_score_svc=0
result_svc=np.zeros(418)
train_result_svc=np.zeros(891)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=LinearSVC(C=0.09,tol=1e-4,max_iter=1000)
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_svc+=score
    result_svc+=lgb.predict(X_test)
    train_result_svc[test_index]=lgb.predict(X_train[test_index])
    
avg_score_svc/=folds
result_svc/=folds

print("avg score= ",avg_score_svc)

0 th fold： 0.7877094972067039
1 th fold： 0.8033707865168539
2 th fold： 0.7921348314606742
3 th fold： 0.8595505617977528
4 th fold： 0.8314606741573034
avg score=  0.8148452702278576


## knn

In [244]:
from sklearn.neighbors import KNeighborsClassifier

best_model=None
folds=5
best_score=0
avg_score_knn=0
result_knn=np.zeros(418)
train_result_knn=np.zeros(891)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=KNeighborsClassifier(n_neighbors=9,p=1)
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_knn+=score
    result_knn+=lgb.predict_proba(X_test)[:,1]
    train_result_knn[test_index]=lgb.predict_proba(X_train[test_index])[:,1]
    
avg_score_knn/=folds
result_knn/=folds

print("avg score= ",avg_score_knn)

0 th fold： 0.770949720670391
1 th fold： 0.8033707865168539
2 th fold： 0.7415730337078652
3 th fold： 0.8314606741573034
4 th fold： 0.7921348314606742
avg score=  0.7878978093026175


## perceptron

In [80]:
from sklearn.linear_model import Perceptron

best_model=None
folds=5
best_score=0
avg_score_pt=0
result_pt=np.zeros(418)
train_result_pt=np.zeros(891)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=Perceptron(penalty='l2',alpha=1e-5,max_iter=10,validation_fraction=0.1,n_iter_no_change=10,warm_start=True)
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_pt+=score
    result_pt+=lgb.predict(X_test)
    train_result_pt[test_index]=lgb.predict(X_train[test_index])
    
avg_score_pt/=folds
result_pt/=folds

print("avg score= ",avg_score_pt)

0 th fold： 0.7541899441340782
1 th fold： 0.7865168539325843
2 th fold： 0.7191011235955056
3 th fold： 0.7752808988764045
4 th fold： 0.7584269662921348
avg score=  0.7587031573661415


## SGD

In [91]:
from sklearn.linear_model import SGDClassifier

best_model=None
folds=5
best_score=0
avg_score_sgd=0
result_sgd=np.zeros(418)
train_result_sgd=np.zeros(891)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=SGDClassifier(loss='log',alpha=1e-5,max_iter=20,validation_fraction=0.1,n_iter_no_change=10,warm_start=True)
#     lgb=SGDClassifier(penalty='l2',alpha=1e-5,max_iter=10,validation_fraction=0.1,n_iter_no_change=10,warm_start=True)
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_sgd+=score
    result_sgd+=lgb.predict_proba(X_test)[:,1]
    train_result_sgd[test_index]=lgb.predict_proba(X_train[test_index])[:,1]
    
avg_score_sgd/=folds
result_sgd/=folds

print("avg score= ",avg_score_sgd)

0 th fold： 0.6759776536312849
1 th fold： 0.7134831460674157
2 th fold： 0.7584269662921348
3 th fold： 0.8370786516853933
4 th fold： 0.7247191011235955
avg score=  0.7419371037599649


## gp

In [92]:
from sklearn.gaussian_process import GaussianProcessClassifier

best_model=None
folds=5
best_score=0
avg_score_gp=0
result_gp=np.zeros(418)
train_result_gp=np.zeros(891)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=GaussianProcessClassifier()
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_gp+=score
    result_gp+=lgb.predict_proba(X_test)[:,1]
    train_result_gp[test_index]=lgb.predict_proba(X_train[test_index])[:,1]
    
avg_score_gp/=folds
result_gp/=folds

print("avg score= ",avg_score_gp)

0 th fold： 0.8156424581005587
1 th fold： 0.7078651685393258
2 th fold： 0.6910112359550562
3 th fold： 0.7696629213483146
4 th fold： 0.7134831460674157
avg score=  0.739532986002134


##  bayes

In [93]:
from sklearn.naive_bayes import BernoulliNB

best_model=None
folds=5
best_score=0
avg_score_by=0
result_by=np.zeros(418)
train_result_by=np.zeros(891)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=BernoulliNB(alpha=10)
    lgb.fit(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_by+=score
    result_by+=lgb.predict_proba(X_test)[:,1]
    train_result_by[test_index]=lgb.predict_proba(X_train[test_index])[:,1]
    
avg_score_by/=folds
result_by/=folds

print("avg score= ",avg_score_by)

0 th fold： 0.7988826815642458
1 th fold： 0.7921348314606742
2 th fold： 0.7584269662921348
3 th fold： 0.8651685393258427
4 th fold： 0.7696629213483146
avg score=  0.7968551879982424


# 模型融合
## 简单平均

In [36]:
result1=(result_lgb+result_xgb+result_cat+result_gbt+result_rf+result_lr+result_svc+result_knn+result_pt+result_sgd+result_gp+result_by)/12

In [333]:
# result1_round=np.round(result1)
pd.DataFrame(data=np.c_[list(range(892,1310)),result1_round],columns=["PassengerId","Survived"],dtype=np.int64).to_csv('result_20220519_avg.txt',index=False,header=True)# 得分：0.73444

## 加权平均

In [319]:
result2=(avg_score_lgb*result_lgb\
        +avg_score_xgb*result_xgb\
        +avg_score_cat*result_cat\
        +avg_score_gbt*result_gbt\
        +avg_score_rf*result_rf\
        +avg_score_lr*result_lr\
        +avg_score_svc*result_svc\
        +avg_score_knn*result_knn\
        +avg_score_pt*result_pt\
        +avg_score_sgd*result_sgd\
        +avg_score_gp*result_gp\
        +avg_score_by*result_by)/(avg_score_lgb+avg_score_xgb+avg_score_cat+avg_score_gbt+avg_score_rf+avg_score_lr+avg_score_svc\
                                 +avg_score_knn+avg_score_pt+avg_score_sgd+avg_score_gp+avg_score_by)

In [328]:
np.where(result1_round!=result2_round)# 简单平均和加权平均结果一样

(array([], dtype=int64),)

## stacking

In [94]:
train_stacking=np.c_[train_result_lgb,train_result_xgb,train_result_cat,train_result_gbt,train_result_rf,train_result_lr,train_result_svc,\
      train_result_knn,train_result_pt,train_result_sgd,train_result_gp,train_result_by]
y_train_stacking=y_train.copy()
test_stacking=np.c_[result_lgb,result_xgb,result_cat,result_gbt,result_rf,result_lr,result_svc,\
      result_knn,result_pt,result_sgd,result_gp,result_by]

In [None]:
LogisticRegression
RandomForestClassifier

### rf

In [168]:
best_model=None
folds=5
best_score=0
avg_score_lr=0
result_rf_s=np.zeros(418)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(train_stacking)):
    lgb=RandomForestClassifier(n_estimators=100,max_depth=8,min_samples_split=4,min_samples_leaf=3)
    lgb.fit(train_stacking[train_index],y_train_stacking[train_index])
    score=lgb.score(train_stacking[test_index],y_train_stacking[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_lr+=score
    result_rf_s+=lgb.predict(test_stacking)
    
avg_score_lr/=folds
result_rf_s/=folds

print("avg score= ",avg_score_lr)

0 th fold： 0.8268156424581006
1 th fold： 0.7921348314606742
2 th fold： 0.8202247191011236
3 th fold： 0.9157303370786517
4 th fold： 0.8426966292134831
avg score=  0.8395204318624065


In [112]:
np.where(np.round(result_rf_s)!=np.round(result1))

(array([  4,  21,  32,  33,  34,  36,  37,  41,  73,  75,  87,  98, 125,
        138, 144, 148, 153, 159, 165, 197, 199, 202, 242, 249, 252, 268,
        283, 293, 316, 323, 359, 367, 379, 382], dtype=int64),)

In [167]:
pd.DataFrame(data=np.c_[list(range(892,1310)),result_lr_s],columns=["PassengerId","Survived"],dtype=np.int64).to_csv('result_20220520_rf_s.txt',index=False,header=True)# 得分：0.73444

### lr

In [153]:
best_model=None
folds=5
best_score=0
avg_score_lr=0
result_lr_s=np.zeros(418)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(train_stacking)):
    lgb=LogisticRegression()
    lgb.fit(train_stacking[train_index],y_train_stacking[train_index])
    score=lgb.score(train_stacking[test_index],y_train_stacking[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_lr+=score
    result_lr_s+=lgb.predict(test_stacking)
    
avg_score_lr/=folds
result_lr_s/=folds

print("avg score= ",avg_score_lr)

0 th fold： 0.8547486033519553
1 th fold： 0.7921348314606742
2 th fold： 0.8089887640449438
3 th fold： 0.9157303370786517
4 th fold： 0.848314606741573
avg score=  0.8439834285355596


In [109]:
np.where(np.round(result_lr_s)!=np.round(result1))

(array([ 34,  36,  37,  41,  75, 138, 144, 148, 165, 197, 199, 242, 244,
        249, 252, 268, 316, 323, 367, 405], dtype=int64),)

In [114]:
np.where(np.round(result_lr_s)!=np.round(result_rf_s))

(array([  4,  21,  32,  33,  73,  87,  98, 125, 153, 159, 202, 244, 283,
        293, 359, 379, 382, 405], dtype=int64),)

### lgb

In [169]:
best_model=None
folds=5
best_score=0
avg_score_lr=0
result_lgb_s=np.zeros(418)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(train_stacking)):
    lgb=LGBMClassifier(learning_rate=0.009,max_depth=13,n_estimators=70)
    lgb.fit(train_stacking[train_index],y_train_stacking[train_index])
    score=lgb.score(train_stacking[test_index],y_train_stacking[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_lr+=score
    result_lgb_s+=lgb.predict(test_stacking)
    
avg_score_lr/=folds
result_lgb_s/=folds

print("avg score= ",avg_score_lr)

0 th fold： 0.8212290502793296
1 th fold： 0.7921348314606742
2 th fold： 0.8202247191011236
3 th fold： 0.9101123595505618
4 th fold： 0.8595505617977528
avg score=  0.8406503044378884


### cat

In [179]:
best_model=None
folds=5
best_score=0
avg_score_lr=0
result_lgb_s=np.zeros(418)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(train_stacking)):
    lgb=CatBoostClassifier(learning_rate=0.1,n_estimators=70)
    lgb.fit(train_stacking[train_index],y_train_stacking[train_index])
    score=lgb.score(train_stacking[test_index],y_train_stacking[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_lr+=score
    result_lgb_s+=lgb.predict(test_stacking)
    
avg_score_lr/=folds
result_lgb_s/=folds

print("avg score= ",avg_score_lr)

0:	learn: 0.6465073	total: 1.1ms	remaining: 76ms
1:	learn: 0.6080789	total: 2.1ms	remaining: 71.5ms
2:	learn: 0.5759160	total: 2.9ms	remaining: 64.7ms
3:	learn: 0.5463286	total: 3.73ms	remaining: 61.5ms
4:	learn: 0.5222416	total: 4.49ms	remaining: 58.4ms
5:	learn: 0.5022131	total: 5.27ms	remaining: 56.2ms
6:	learn: 0.4855599	total: 6.03ms	remaining: 54.3ms
7:	learn: 0.4703763	total: 6.82ms	remaining: 52.9ms
8:	learn: 0.4565108	total: 7.74ms	remaining: 52.4ms
9:	learn: 0.4442994	total: 8.6ms	remaining: 51.6ms
10:	learn: 0.4341236	total: 9.39ms	remaining: 50.3ms
11:	learn: 0.4249654	total: 10.2ms	remaining: 49.2ms
12:	learn: 0.4166309	total: 11ms	remaining: 48.3ms
13:	learn: 0.4089197	total: 11.8ms	remaining: 47.4ms
14:	learn: 0.4029824	total: 12.6ms	remaining: 46.3ms
15:	learn: 0.3972800	total: 13.4ms	remaining: 45.2ms
16:	learn: 0.3922267	total: 14.2ms	remaining: 44.2ms
17:	learn: 0.3870358	total: 14.9ms	remaining: 43.1ms
18:	learn: 0.3828713	total: 15.7ms	remaining: 42.1ms
19:	learn: 

0:	learn: 0.6479801	total: 7.95ms	remaining: 549ms
1:	learn: 0.6094118	total: 8.94ms	remaining: 304ms
2:	learn: 0.5770288	total: 9.79ms	remaining: 219ms
3:	learn: 0.5478171	total: 10.6ms	remaining: 175ms
4:	learn: 0.5234638	total: 11.5ms	remaining: 149ms
5:	learn: 0.5028622	total: 12.2ms	remaining: 131ms
6:	learn: 0.4855186	total: 13ms	remaining: 117ms
7:	learn: 0.4705139	total: 13.8ms	remaining: 107ms
8:	learn: 0.4578536	total: 14.6ms	remaining: 98.7ms
9:	learn: 0.4452423	total: 15.3ms	remaining: 92ms
10:	learn: 0.4345067	total: 16.1ms	remaining: 86.1ms
11:	learn: 0.4248581	total: 16.8ms	remaining: 81.2ms
12:	learn: 0.4171389	total: 17.6ms	remaining: 77ms
13:	learn: 0.4099271	total: 18.3ms	remaining: 73.3ms
14:	learn: 0.4031764	total: 19.2ms	remaining: 70.4ms
15:	learn: 0.3970966	total: 20ms	remaining: 67.5ms
16:	learn: 0.3917714	total: 20.8ms	remaining: 64.8ms
17:	learn: 0.3865804	total: 21.5ms	remaining: 62.3ms
18:	learn: 0.3822826	total: 22.3ms	remaining: 59.9ms
19:	learn: 0.378469

### svc

In [187]:
best_model=None
folds=5
best_score=0
avg_score_lr=0
result_svc_s=np.zeros(418)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(train_stacking)):
    lgb=LinearSVC(C=0.09,tol=1e-5,max_iter=100)
    lgb.fit(train_stacking[train_index],y_train_stacking[train_index])
    score=lgb.score(train_stacking[test_index],y_train_stacking[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_lr+=score
    result_svc_s+=lgb.predict(test_stacking)
    
avg_score_lr/=folds
result_svc_s/=folds

print("avg score= ",avg_score_lr)

0 th fold： 0.8379888268156425
1 th fold： 0.7921348314606742
2 th fold： 0.8089887640449438
3 th fold： 0.9157303370786517
4 th fold： 0.8539325842696629
avg score=  0.8417550687339149


### knn

In [251]:
best_model=None
folds=5
best_score=0
avg_score_lr=0
result_knn_s=np.zeros(418)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(train_stacking)):
    lgb=KNeighborsClassifier(n_neighbors=34,p=1)
    lgb.fit(train_stacking[train_index],y_train_stacking[train_index])
    score=lgb.score(train_stacking[test_index],y_train_stacking[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_lr+=score
    result_knn_s+=lgb.predict(test_stacking)
    
avg_score_lr/=folds
result_knn_s/=folds

print("avg score= ",avg_score_lr)

0 th fold： 0.8491620111731844
1 th fold： 0.797752808988764
2 th fold： 0.8202247191011236
3 th fold： 0.9044943820224719
4 th fold： 0.8539325842696629
avg score=  0.8451133011110414


### gbt

In [265]:
best_model=None
folds=5
best_score=0
avg_score_lr=0
result_gbt_s=np.zeros(418)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(train_stacking)):
    lgb=GradientBoostingClassifier(learning_rate=0.1,n_estimators=65)
    lgb.fit(train_stacking[train_index],y_train_stacking[train_index])
    score=lgb.score(train_stacking[test_index],y_train_stacking[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_lr+=score
    result_gbt_s+=lgb.predict(test_stacking)
    
avg_score_lr/=folds
result_gbt_s/=folds

print("avg score= ",avg_score_lr)

0 th fold： 0.8324022346368715
1 th fold： 0.7640449438202247
2 th fold： 0.8202247191011236
3 th fold： 0.9157303370786517
4 th fold： 0.8764044943820225
avg score=  0.8417613458037788


### xgb

In [283]:
best_model=None
folds=5
best_score=0
avg_score_lr=0
result_xbt_s=np.zeros(418)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(train_stacking)):
    lgb=XGBClassifier(learning_rate=0.1,max_depth=1,n_estimators=70)
    lgb.fit(train_stacking[train_index],y_train_stacking[train_index])
    score=lgb.score(train_stacking[test_index],y_train_stacking[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_lr+=score
    result_xbt_s+=lgb.predict(test_stacking)
    
avg_score_lr/=folds
result_xbt_s/=folds

print("avg score= ",avg_score_lr)

0 th fold： 0.8659217877094972
1 th fold： 0.7808988764044944
2 th fold： 0.8202247191011236
3 th fold： 0.9157303370786517
4 th fold： 0.8539325842696629
avg score=  0.8473416609126859


### perceptron

In [302]:
best_model=None
folds=5
best_score=0
avg_score_lr=0
result_pt_s=np.zeros(418)

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(train_stacking)):
    lgb=Perceptron(penalty='l2',alpha=1e-3,max_iter=10,validation_fraction=0.1,n_iter_no_change=10,warm_start=True)
    lgb.fit(train_stacking[train_index],y_train_stacking[train_index])
    score=lgb.score(train_stacking[test_index],y_train_stacking[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score_lr+=score
    result_pt_s+=lgb.predict(test_stacking)
    
avg_score_lr/=folds
result_pt_s/=folds

print("avg score= ",avg_score_lr)

0 th fold： 0.8268156424581006
1 th fold： 0.7584269662921348
2 th fold： 0.797752808988764
3 th fold： 0.9157303370786517
4 th fold： 0.8089887640449438
avg score=  0.8215429037725188


### sgd

In [None]:
SGDClassifier(loss='log',alpha=1e-5,max_iter=20,validation_fraction=0.1,n_iter_no_change=10,warm_start=True)