# 概览

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold

# 模型的包
from sklearn.linear_model import Perceptron,SGDClassifier,LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import BernoulliNB

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows',None)

In [64]:
train=pd.read_csv("./data/train.csv")#(891,12)
test=pd.read_csv("./data/test.csv")#(418,11)
train['is_train']=1
test['is_train']=0
data=pd.concat([train,test],ignore_index=True)

# 属性分析

## 性别
### 只区分男女(仅分析)

In [10]:
sex_data=test[['PassengerId','Sex']]
sex_data['Survived']=sex_data['Sex'].apply(lambda x:1 if x=='female' else 0)

In [13]:
# 0.76555，只使用性别一个特征，这么简单粗暴，都能有这么高的准确率
sex_data[['PassengerId','Survived']].to_csv('result_20220611_sexModel.txt',index=False,header=True)

In [28]:
train[(train['Sex']=='female')&(train['Survived']==0)].head(10)# 死亡的female几乎都是低Pclass

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_train
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S,1
18,19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0,1,0,345763,18.0,,S,1
24,25,0,3,"Palsson, Miss. Torborg Danira",female,8.0,3,1,349909,21.075,,S,1
38,39,0,3,"Vander Planke, Miss. Augusta Maria",female,18.0,2,0,345764,18.0,,S,1
40,41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40.0,1,0,7546,9.475,,S,1
41,42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann ...",female,27.0,1,0,11668,21.0,,S,1
49,50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18.0,1,0,349237,17.8,,S,1
71,72,0,3,"Goodwin, Miss. Lillian Amy",female,16.0,5,2,CA 2144,46.9,,S,1
100,101,0,3,"Petranec, Miss. Matilda",female,28.0,0,0,349245,7.8958,,S,1
111,112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C,1


In [26]:
train.query("Sex=='male'&Survived==1")# 男性的存活好像更难预测一点

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_train
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S,1
21,22,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0,D56,S,1
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S,1
36,37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C,1
55,56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S,1
65,66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C,1
74,75,1,3,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S,1
78,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,,S,1
81,82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29.0,0,0,345779,9.5,,S,1
97,98,1,1,"Greenfield, Mr. William Bertram",male,23.0,0,1,PC 17759,63.3583,D10 D12,C,1


In [27]:
train.query("Sex=='male'&Age<=18")# 15岁以下的男孩确实都叫master

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_train
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,1
16,17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,,Q,1
50,51,0,3,"Panula, Master. Juha Niilo",male,7.0,4,1,3101295,39.6875,,S,1
59,60,0,3,"Goodwin, Master. William Frederick",male,11.0,5,2,CA 2144,46.9,,S,1
63,64,0,3,"Skoog, Master. Harald",male,4.0,3,2,347088,27.9,,S,1
78,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0,,S,1
86,87,0,3,"Ford, Mr. William Neal",male,16.0,1,3,W./C. 6608,34.375,,S,1
125,126,1,3,"Nicola-Yarred, Master. Elias",male,12.0,1,0,2651,11.2417,,C,1
138,139,0,3,"Osen, Mr. Olaf Elon",male,16.0,0,0,7534,9.2167,,S,1
144,145,0,2,"Andrew, Mr. Edgardo Samuel",male,18.0,0,0,231945,11.5,,S,1


### 区分男孩

In [65]:
data['Title']=data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
data.loc[data['Title']=='Master','Sex']='boy'

## 家族
### 不考虑保姆

In [66]:
data['Surname']=data['Name'].apply(lambda x:x.split(',')[0])

data["Ticket_id"]=(data['Pclass'].astype(str) + "-" + data['Ticket'].str[:-1].astype(str) + "-" + data['Fare'].astype(str) + "-" + data['Embarked'].astype(str) )
data["Group_id"] = data['Surname'] + "-" + data["Ticket_id"]

# 后续处理
data.loc[data['Sex'] == "male", "Group_id"] = "noGroup"
data["WC_count"] = (data.loc[data['Sex'] != "male"].groupby("Group_id")["Group_id"].transform("count"))
data.loc[data['WC_count'] <= 1, "Group_id"] = "noGroup"

data["WCSurvived"] = data.loc[data['Group_id'] != "noGroup"].groupby("Group_id").Survived.transform("mean")

In [75]:
data.loc[data['Sex'] != "male"].groupby("Group_id")["Group_id"]

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002C4C318CB80>

In [72]:
data.loc[data['Sex'] != "male"].groupby("Group_id")["Group_id"].transform("count")

1       306
2       306
3       306
7         5
8         3
9       306
10        3
11      306
14      306
15      306
16        6
18        2
19      306
22      306
24        5
25        6
28      306
31      306
32      306
38        2
39        2
40      306
41      306
43        3
44      306
47      306
49      306
50        4
52      306
53      306
56      306
58        3
59        6
61      306
63        5
65        3
66      306
68      306
71        6
78        2
79      306
82      306
84      306
85      306
88        4
98        2
100     306
106     306
109     306
111       2
113       2
114     306
119       6
123     306
125       2
128       3
132     306
133     306
136     306
140       3
141     306
142     306
147       3
151     306
156     306
159       7
161       2
164       4
165       2
166     306
167       5
171       6
172       3
176       5
177     306
180       7
182       6
183       4
184       2
186     306
190     306
192     306
193       2
194 

In [67]:
# female组内生存率为0时判断死亡
# boy组内生存率<=0.5时判断死亡
data.loc[(data['is_train']==0)&(data['Group_id']!="noGroup")&(data['Sex']=='female')&(data['WCSurvived']==0.00),'Survived']=0
data.loc[(data['is_train']==0)&(data['Group_id']!="noGroup")&(data['Sex']=='female')&(data['WCSurvived']!=0.00)&(data['WCSurvived'].notnull()),'Survived']=1
data.loc[(data['is_train']==0)&(data['Group_id']!="noGroup")&(data['Sex']=='boy')&((data['Pclass']!=3)|(data['WCSurvived']>0.5)),'Survived']=1
data.loc[(data['is_train']==0)&(data['Group_id']!="noGroup")&(data['Sex']=='boy')&(data['Pclass']==3)&(data['WCSurvived']<=0.5),'Survived']=0

In [68]:
# 如果家族人员全部在测试集里呢
groups_test = set(data[data['is_train']==0]['Group_id'].unique()) - set(data[data['is_train']==1]['Group_id'].unique())
# data.loc[data['Group_id'].isin(groups_test)].sort_values(by="Surname")
data.loc[data['Group_id'].isin(groups_test),'Survived']=1
data.loc[data['Group_id']=='van Billiard-3-A/5. 85-14.5-S','Survived']=0

In [69]:
data.loc[(data['Group_id']=='noGroup')&(data['Sex'].isin(['male','boy'])),'Survived']=0
data.loc[(data['Group_id']=='noGroup')&(data['Sex']=='female'),'Survived']=1

In [240]:
data.loc[(data['is_train']==0)&(data['Group_id']=='noGroup'),'PassengerId']

891      892
892      893
893      894
894      895
896      897
897      898
898      899
899      900
900      901
901      902
902      903
903      904
904      905
905      906
906      907
907      908
908      909
910      911
911      912
912      913
913      914
914      915
916      917
917      918
918      919
919      920
920      921
921      922
922      923
925      926
926      927
927      928
929      930
930      931
931      932
932      933
933      934
934      935
935      936
936      937
937      938
938      939
939      940
941      942
942      943
945      946
947      948
948      949
949      950
950      951
951      952
952      953
953      954
954      955
956      957
957      958
958      959
959      960
961      962
962      963
963      964
964      965
965      966
966      967
967      968
968      969
969      970
970      971
972      973
973      974
974      975
975      976
976      977
977      978
978      979
979      980
981      982

In [70]:
# 把两个保姆设置为死亡
data.loc[data['PassengerId'].isin([1172,1259]),'Survived']=0
# 更改保姆的group_id
data.loc[data['PassengerId'].isin([1172,1259]),"Group_id"] = data.loc[data['PassengerId'].isin([1172,1259]),'Surname'] + "-" + data.loc[data['PassengerId'].isin([1172,1259]),"Ticket_id"]

data.loc[data['PassengerId'].isin([1172,1259])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_train,Title,Surname,Ticket_id,Group_id,WC_count,WCSurvived
1171,1172,0.0,3,"Oreskovic, Miss. Jelka",female,23.0,0,0,315085,8.6625,,S,0,Miss,Oreskovic,3-31508-8.6625-S,Oreskovic-3-31508-8.6625-S,1.0,
1258,1259,0.0,3,"Riihivouri, Miss. Susanna Juhantytar Sanni""""",female,22.0,0,0,3101295,39.6875,,S,0,Miss,Riihivouri,3-310129-39.6875-S,Riihivouri-3-310129-39.6875-S,1.0,


In [306]:
# 别运行
data['Survived']=data['Survived'].astype('int64')
data.loc[data['is_train']==0,["PassengerId","Survived"]].to_csv('result_20220616_WCGModel_3.txt',index=False,header=True)

### 考虑保姆(仅分析)

In [424]:
single_women_ticketId=set(data[(data['Sex']=='female')&(data['Group_id']=='noGroup')&(data['SibSp']+data['Parch']==0)]['Ticket_id'])
group_ticketId=set(data[data['Group_id']!='noGroup']['Ticket_id'])
babysitter_ticketId=group_ticketId.intersection(single_women_ticketId)

In [425]:
data[(data['Sex']=='female')&(data['Group_id']=='noGroup')&(data['Ticket_id'].isin(babysitter_ticketId))].sort_values(by='Ticket_id')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_train,Title,Surname,Ticket_id,Group_id,WC_count,WCSurvived
708,709,1,1,"Cleaver, Miss. Alice",female,22.0,0,0,113781,151.55,,S,1,Miss,Cleaver,1-11378-151.55-S,noGroup,1.0,
1032,1033,1,1,"Daniels, Miss. Sarah",female,33.0,0,0,113781,151.55,,S,0,Miss,Daniels,1-11378-151.55-S,noGroup,1.0,
337,338,1,1,"Burns, Miss. Elizabeth Margaret",female,41.0,0,0,16966,134.5,E40,C,1,Miss,Burns,1-1696-134.5-C,noGroup,1.0,
1262,1263,1,1,"Wilson, Miss. Helen Alice",female,31.0,0,0,16966,134.5,E39 E41,C,0,Miss,Wilson,1-1696-134.5-C,noGroup,1.0,
1291,1292,1,1,"Bonnell, Miss. Caroline",female,30.0,0,0,36928,164.8667,C7,S,0,Miss,Bonnell,1-3692-164.8667-S,noGroup,1.0,
950,951,1,1,"Chaudanson, Miss. Victorine",female,36.0,0,0,PC 17608,262.375,B61,C,0,Miss,Chaudanson,1-PC 1760-262.375-C,noGroup,1.0,
1266,1267,1,1,"Bowen, Miss. Grace Scott",female,45.0,0,0,PC 17608,262.375,,C,0,Miss,Bowen,1-PC 1760-262.375-C,noGroup,1.0,
1067,1068,1,2,"Sincock, Miss. Maude",female,20.0,0,0,C.A. 33112,36.75,,S,0,Miss,Sincock,2-C.A. 3311-36.75-S,noGroup,1.0,
1258,1259,1,3,"Riihivouri, Miss. Susanna Juhantytar Sanni""""",female,22.0,0,0,3101295,39.6875,,S,0,Miss,Riihivouri,3-310129-39.6875-S,noGroup,1.0,
1171,1172,1,3,"Oreskovic, Miss. Jelka",female,23.0,0,0,315085,8.6625,,S,0,Miss,Oreskovic,3-31508-8.6625-S,noGroup,1.0,


In [410]:
data[(data['Sex']!='male')&(data['Ticket_id'].isin(babysitter_ticketId))].sort_values(by=['Ticket_id','Surname','is_train'])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_train,Title,Surname,Ticket_id,Group_id,WC_count,WCSurvived
297,298,0.0,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,1,Miss,Allison,1-113-151.55-S,Allison-1-113-151.55-S,3.0,0.333333
305,306,1.0,1,"Allison, Master. Hudson Trevor",boy,0.92,1,2,113781,151.55,C22 C26,S,1,Master,Allison,1-113-151.55-S,Allison-1-113-151.55-S,3.0,0.333333
498,499,0.0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,1,Mrs,Allison,1-113-151.55-S,Allison-1-113-151.55-S,3.0,0.333333
708,709,1.0,1,"Cleaver, Miss. Alice",female,22.0,0,0,113781,151.55,,S,1,Miss,Cleaver,1-113-151.55-S,noGroup,1.0,
1032,1033,1.0,1,"Daniels, Miss. Sarah",female,33.0,0,0,113781,151.55,,S,0,Miss,Daniels,1-113-151.55-S,noGroup,1.0,
337,338,1.0,1,"Burns, Miss. Elizabeth Margaret",female,41.0,0,0,16966,134.5,E40,C,1,Miss,Burns,1-16-134.5-C,noGroup,1.0,
1087,1088,1.0,1,"Spedden, Master. Robert Douglas",boy,6.0,0,2,16966,134.5,E34,C,0,Master,Spedden,1-16-134.5-C,Spedden-1-16-134.5-C,2.0,1.0
319,320,1.0,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corn...",female,40.0,1,1,16966,134.5,E34,C,1,Mrs,Spedden,1-16-134.5-C,Spedden-1-16-134.5-C,2.0,1.0
1262,1263,1.0,1,"Wilson, Miss. Helen Alice",female,31.0,0,0,16966,134.5,E39 E41,C,0,Miss,Wilson,1-16-134.5-C,noGroup,1.0,
1291,1292,1.0,1,"Bonnell, Miss. Caroline",female,30.0,0,0,36928,164.8667,C7,S,0,Miss,Bonnell,1-36-164.8667-S,noGroup,1.0,


# 非WCG正常流程

## 数据预处理

In [25]:
def getAgeRange(x):
    if 0<=x<16:return 1
    elif 16<=x<40:return 2
    else: return 3
data['Age_range']=data['Age'].apply(getAgeRange)

data['SexPclass']=data['Sex'].str.cat(data['Pclass'].map(lambda x:str(x)))

In [26]:
data['Fare']=data['Fare'].fillna(data['Fare'].median())# 有一个空值，中位数填充

data['Embarked']=data['Embarked'].fillna("S")# 就2个缺失值，填充了众数

# Cabin只有204个非空值
data['Cabins']=data['Cabin'].apply(lambda x:2 if len(str(x))>4 else(1 if pd.notna(x) else 0) )
data['Cabin']=data['Cabin'].str[:1]
data['Cabin']=data['Cabin'].fillna("Z")

# ticket比较复杂，先算了
# train['Ticket'].apply(lambda x:x.split(" ")[0]).value_counts()

data['Title']=data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
data['Title'] = data['Title'].replace(['Lady', 'Countess','Capt','Don','Col','Dr','Rev', 'Major', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
data['Title'] = data['Title'].replace('Mlle', 'Miss')
data['Title'] = data['Title'].replace('Ms', 'Miss')
data['Title'] = data['Title'].replace('Mme', 'Mrs')

data['familyNum']=data['SibSp']+data['Parch']
data['has_family']=data['familyNum'].apply(lambda x:1 if x>0 else 0)

data=data.drop(['Name','Ticket'],axis=1)

In [30]:
for col in ['Cabin','Embarked','Title','Sex','Age_range']:
    df=pd.get_dummies(data[col],sparse=True)
    for c in df.columns:
        df.rename(columns={c:col+str(c)},inplace=True)
    if col!='Sex':
        data=pd.concat([data.drop([col],axis=1),df],axis=1)
    
df=pd.get_dummies(data['SexPclass'],sparse=True)
data=pd.concat([data.drop(['SexPclass'],axis=1),df],axis=1)

# data=data.drop(['CabinA','CabinD','CabinF','CabinG','CabinT','Sexmale'],axis=1)

In [28]:
data=data.drop(['Surname','Ticket_id'],axis=1)

### 预测测试集是否生存，用来填充缺失值

In [31]:
X_train=np.array(data[(data['is_train']==1)&(data['Sex']=='male')].drop(['Survived','is_train','Group_id','PassengerId','boy1','boy2','boy3','female1','female2','female3','Sex'],axis=1))
y_train=np.array(data[(data['is_train']==1)&(data['Sex']=='male')]['Survived'])
X_test=np.array(data[(data['is_train']==0)&(data['Sex']=='male')].drop(['Survived','is_train','Group_id','PassengerId','boy1','boy2','boy3','female1','female2','female3','Sex'],axis=1))

In [34]:
from lightgbm import LGBMClassifier

best_model=None
folds=5
avg_score=0
avg_ccore_train=0
best_score=0
result=np.zeros(245)# 245是测试集男性数量，418是测试集数量

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train)):
    lgb=LGBMClassifier(learning_rate=0.07,max_depth=20,n_estimators=70)
    lgb.fit(X_train[train_index],y_train[train_index])
    
    score_train=lgb.score(X_train[train_index],y_train[train_index])
    score=lgb.score(X_train[test_index],y_train[test_index])
    print('{}th fold train：{:.3f}, test：{:.3f}'.format(i,score_train,score))
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score+=score
    avg_ccore_train+=score_train
    result+=lgb.predict(X_test)
avg_score/=folds
avg_ccore_train/=folds
result/=folds

print("avg train score= {:.3f}, avg score={:.3f}".format(avg_ccore_train,avg_score))

0th fold train：1.000, test：1.000
1th fold train：1.000, test：1.000
2th fold train：1.000, test：1.000
3th fold train：1.000, test：1.000
4th fold train：1.000, test：1.000
avg train score= 1.000, avg score=1.000


### age缺失值填充
#### 用lgb预测

In [37]:
data_age=data[(data['Age'].notnull())&(data['Sex']=='male')].drop(['Sex'],axis=1)
data_noAge=data[(data['Age'].isnull())&(data['Sex']=='male')].drop(['Sex'],axis=1)

X_train_age=np.array(data_age.drop(['is_train','Age','Group_id'],axis=1))
y_train_age=np.array(data_age['Age'])
X_test_age=np.array(data_noAge.drop(['is_train','Age','Group_id'],axis=1))

In [38]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

best_model=None
folds=5
avg_score=0
best_score=0

kf=KFold(n_splits=folds,shuffle=True,random_state=2022)
for i, (train_index,test_index) in enumerate(kf.split(X_train_age)):
    lgb=LGBMRegressor(learning_rate=0.5,max_depth=5,n_estimators=7)
    lgb.fit(X_train_age[train_index],y_train_age[train_index])
    
    
    pred=lgb.predict(X_train_age[train_index])
    score0=mean_squared_error(pred,y_train_age[train_index])
    print(i,'th fold 0：',score0)
    
    pred=lgb.predict(X_train_age[test_index])
    score=mean_squared_error(pred,y_train_age[test_index])
    print(i,'th fold：',score)
    if best_score<score:
        best_score=score
        best_model=lgb    
    avg_score+=score
avg_score/=folds

print("avg score= ",avg_score)

0 th fold 0： 31.70581316125064
0 th fold： 65.93150284010459
1 th fold 0： 32.790739317523744
1 th fold： 53.54091396135594
2 th fold 0： 35.67452811228919
2 th fold： 47.75250319877265
3 th fold 0： 34.415291439913304
3 th fold： 49.390923942970545
4 th fold 0： 37.20511378054557
4 th fold： 40.10425655256781
avg score=  51.34402009915431


In [39]:
age_pred=best_model.predict(X_test_age)
data.loc[(data['Age'].isnull())&(data['Sex']=='male'),'Age']=age_pred

## 模型训练

In [45]:
# 仅预测非WCG人
X_train=np.array(data[(data['is_train']==1)&(data['Sex']=='male')].drop(['Survived','is_train','Group_id','PassengerId','WC_count','WCSurvived','Sex'],axis=1))# (891, 42)
y_train=np.array(data[(data['is_train']==1)&(data['Sex']=='male')]['Survived'])# (891,)
X_test=np.array(data[(data['is_train']==0)&(data['Sex']=='male')&(data['Group_id']=='noGroup')].drop(['Survived','is_train','Group_id','PassengerId','WC_count','WCSurvived','Sex'],axis=1))# (351, 42)

In [53]:
# 模型函数
def train_model(model,params,X_train,y_train,X_test):
    folds=5
    train_Acc,valid_Acc=0,0
    train_pred,test_pred=np.zeros(537),np.zeros(245)#418所有人。351非wcg.537非wcg训练 891

    kf=StratifiedKFold(n_splits=folds,shuffle=True,random_state=2022)
    model.set_params(**params)
    for i, (train_index,valid_index) in enumerate(kf.split(X_train,y_train)):
        model.fit(X_train[train_index],y_train[train_index])

        train_score=model.score(X_train[train_index],y_train[train_index])
        valid_score=model.score(X_train[valid_index],y_train[valid_index])
        print('{}th fold train：{:.3f}, valid：{:.3f}'.format(i,train_score,valid_score))

        valid_Acc+=valid_score
        train_Acc+=train_score
        try:
            test_pred+=model.predict_proba(X_test)[:,1]
            train_pred[valid_index]=model.predict_proba(X_train[valid_index])[:,1]
        except:
            test_pred+=model.predict(X_test)
            train_pred[valid_index]=model.predict(X_train[valid_index])

    valid_Acc/=folds
    train_Acc/=folds
    test_pred/=folds
    print("avg train score= {:.3f}, avg valid score={:.3f}".format(train_Acc,valid_Acc))
    
    return train_pred,test_pred,valid_Acc

In [54]:
model=LGBMClassifier()
params={"learning_rate":0.07,"max_depth":12,"n_estimators":80}
train_pred_lgb,test_pred_lgb,valid_acc_lgb=train_model(model,params,X_train,y_train,X_test)

0th fold train：1.000, valid：1.000
1th fold train：1.000, valid：1.000
2th fold train：1.000, valid：1.000
3th fold train：1.000, valid：1.000
4th fold train：1.000, valid：1.000
avg train score= 1.000, avg valid score=1.000


In [327]:
model=XGBClassifier()
params={"learning_rate":0.05,"max_depth":16,"n_estimators":73}
train_pred_xgb,test_pred_xgb,valid_acc_xgb=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.994, valid：0.966
1th fold train：0.994, valid：0.961
2th fold train：0.992, valid：0.966
3th fold train：0.992, valid：0.966
4th fold train：0.992, valid：0.983
avg train score= 0.993, avg valid score=0.969


In [328]:
model=CatBoostClassifier()
params={"learning_rate":0.08,"n_estimators":82}
train_pred_cat,test_pred_cat,valid_acc_cat=train_model(model,params,X_train,y_train,X_test)

0:	learn: 0.5432644	total: 165ms	remaining: 13.4s
1:	learn: 0.4244849	total: 166ms	remaining: 6.65s
2:	learn: 0.3403129	total: 167ms	remaining: 4.4s
3:	learn: 0.2735731	total: 168ms	remaining: 3.28s
4:	learn: 0.2403160	total: 169ms	remaining: 2.61s
5:	learn: 0.2086007	total: 170ms	remaining: 2.16s
6:	learn: 0.1800217	total: 172ms	remaining: 1.84s
7:	learn: 0.1635358	total: 173ms	remaining: 1.6s
8:	learn: 0.1440216	total: 174ms	remaining: 1.41s
9:	learn: 0.1344222	total: 175ms	remaining: 1.26s
10:	learn: 0.1250736	total: 176ms	remaining: 1.14s
11:	learn: 0.1151375	total: 189ms	remaining: 1.1s
12:	learn: 0.1070586	total: 190ms	remaining: 1.01s
13:	learn: 0.1012418	total: 191ms	remaining: 929ms
14:	learn: 0.0965195	total: 193ms	remaining: 860ms
15:	learn: 0.0927579	total: 194ms	remaining: 799ms
16:	learn: 0.0883815	total: 195ms	remaining: 745ms
17:	learn: 0.0839436	total: 196ms	remaining: 697ms
18:	learn: 0.0816500	total: 197ms	remaining: 654ms
19:	learn: 0.0793390	total: 198ms	remaining:

73:	learn: 0.0323593	total: 123ms	remaining: 13.3ms
74:	learn: 0.0322530	total: 124ms	remaining: 11.6ms
75:	learn: 0.0319740	total: 126ms	remaining: 9.91ms
76:	learn: 0.0317016	total: 127ms	remaining: 8.23ms
77:	learn: 0.0312492	total: 128ms	remaining: 6.56ms
78:	learn: 0.0306046	total: 129ms	remaining: 4.91ms
79:	learn: 0.0299096	total: 131ms	remaining: 3.27ms
80:	learn: 0.0296454	total: 132ms	remaining: 1.63ms
81:	learn: 0.0293512	total: 134ms	remaining: 0us
2th fold train：0.999, valid：0.972
0:	learn: 0.5401986	total: 7.96ms	remaining: 644ms
1:	learn: 0.4199142	total: 9.23ms	remaining: 369ms
2:	learn: 0.3362923	total: 10.3ms	remaining: 270ms
3:	learn: 0.2694366	total: 11.4ms	remaining: 223ms
4:	learn: 0.2393559	total: 12.6ms	remaining: 193ms
5:	learn: 0.2079715	total: 13.7ms	remaining: 173ms
6:	learn: 0.1798279	total: 14.8ms	remaining: 159ms
7:	learn: 0.1612182	total: 15.9ms	remaining: 147ms
8:	learn: 0.1462141	total: 17.4ms	remaining: 141ms
9:	learn: 0.1311099	total: 18.7ms	remainin

In [329]:
model=GradientBoostingClassifier()
params={"learning_rate":0.11,"n_estimators":222}
train_pred_gbt,test_pred_gbt,valid_acc_gbt=train_model(model,params,X_train,y_train,X_test)

0th fold train：1.000, valid：0.961
1th fold train：1.000, valid：0.949
2th fold train：1.000, valid：0.972
3th fold train：1.000, valid：0.966
4th fold train：1.000, valid：0.978
avg train score= 1.000, avg valid score=0.965


In [330]:
model=RandomForestClassifier()
params={"n_estimators":150,"max_depth":4,"min_samples_split":5,"min_samples_leaf":3}
train_pred_rf,test_pred_rf,valid_acc_rf=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.971, valid：0.955
1th fold train：0.971, valid：0.966
2th fold train：0.971, valid：0.978
3th fold train：0.973, valid：0.961
4th fold train：0.965, valid：0.978
avg train score= 0.970, avg valid score=0.967


In [331]:
model=LogisticRegression()
params={}
train_pred_lr,test_pred_lr,valid_acc_lr=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.968, valid：0.961
1th fold train：0.975, valid：0.955
2th fold train：0.969, valid：0.972
3th fold train：0.972, valid：0.966
4th fold train：0.971, valid：0.983
avg train score= 0.971, avg valid score=0.967


In [332]:
model=LinearSVC()
params={"C":0.12,"tol":1e-4,"max_iter":1000}
train_pred_svc,test_pred_svc,valid_acc_svc=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.969, valid：0.961
1th fold train：0.975, valid：0.955
2th fold train：0.969, valid：0.972
3th fold train：0.972, valid：0.966
4th fold train：0.966, valid：0.978
avg train score= 0.970, avg valid score=0.966


In [333]:
model=KNeighborsClassifier()
params={"n_neighbors":15,"p":1}
train_pred_knn,test_pred_knn,valid_acc_knn=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.920, valid：0.877
1th fold train：0.913, valid：0.882
2th fold train：0.910, valid：0.938
3th fold train：0.909, valid：0.910
4th fold train：0.927, valid：0.904
avg train score= 0.916, avg valid score=0.902


In [334]:
model=Perceptron()
params={"penalty":'l2',"alpha":1e-5,"max_iter":19,"validation_fraction":0.1,"n_iter_no_change":10,"warm_start":True}
train_pred_pt,test_pred_pt,valid_acc_pt=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.920, valid：0.916
1th fold train：0.942, valid：0.916
2th fold train：0.948, valid：0.961
3th fold train：0.864, valid：0.854
4th fold train：0.934, valid：0.927
avg train score= 0.922, avg valid score=0.915


In [336]:
model=SGDClassifier()
params={"loss":'log',"alpha":1e-4,"max_iter":20,"validation_fraction":0.1,"n_iter_no_change":10,"warm_start":True}
train_pred_sgd,test_pred_sgd,valid_acc_sgd=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.914, valid：0.894
1th fold train：0.924, valid：0.944
2th fold train：0.947, valid：0.961
3th fold train：0.955, valid：0.944
4th fold train：0.947, valid：0.938
avg train score= 0.937, avg valid score=0.936


In [337]:
model=GaussianProcessClassifier()
params={}
train_pred_gp,test_pred_gp,valid_acc_gp=train_model(model,params,X_train,y_train,X_test)

0th fold train：1.000, valid：0.816
1th fold train：1.000, valid：0.860
2th fold train：1.000, valid：0.860
3th fold train：1.000, valid：0.837
4th fold train：1.000, valid：0.820
avg train score= 1.000, avg valid score=0.838


In [338]:
model=BernoulliNB()
params={"alpha":20}
train_pred_by,test_pred_by,valid_acc_by=train_model(model,params,X_train,y_train,X_test)

0th fold train：0.930, valid：0.933
1th fold train：0.930, valid：0.933
2th fold train：0.927, valid：0.944
3th fold train：0.935, valid：0.910
4th fold train：0.930, valid：0.933
avg train score= 0.930, avg valid score=0.930


In [346]:
result1=(test_pred_lgb+test_pred_xgb+test_pred_cat+test_pred_gbt+test_pred_rf+test_pred_lr+test_pred_svc\
         +test_pred_knn+test_pred_pt+test_pred_sgd+test_pred_gp+test_pred_by)/12

result1_round=np.round(result1)
data.loc[(data['is_train']==0)&(data['Group_id']=='noGroup'),'Survived']=result1_round
data['Survived']=data['Survived'].astype('int64')

data.loc[data['is_train']==0,["PassengerId","Survived"]].to_csv('result_20220621_avg_nonWCG.txt',index=False,header=True)

In [343]:
temp.iloc[[ 21, 214, 339]]

Unnamed: 0,PassengerId,Survived,Survived3
21,913,1.0,0
214,1106,0.0,1
339,1231,1.0,0


In [344]:
data.loc[data['PassengerId'].isin([ 913,1106, 1231])]

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,is_train,Group_id,WC_count,...,Age_range3,boy1,boy2,boy3,female1,female2,female3,male1,male2,male3
912,913,1.0,3,9.0,0,1,3.1708,0,noGroup,1.0,...,0,0,0,1,0,0,0,0,0,0
1105,1106,0.0,3,38.0,4,2,7.775,0,noGroup,1.0,...,0,0,0,0,0,0,1,0,0,0
1230,1231,1.0,3,45.825002,0,0,7.2292,0,noGroup,1.0,...,1,0,0,1,0,0,0,0,0,0


In [55]:
nonWCG_pred=model.predict(X_test)
data.loc[(data['is_train']==0)&(data['Sex']=='male')&(data['Group_id']=='noGroup'),'Survived']=nonWCG_pred

In [60]:
data['Survived']=data['Survived'].astype('int64')
data.loc[data['is_train']==0,["PassengerId","Survived"]].to_csv('result_20220622_WCGLgb.txt',index=False,header=True)
# pd.DataFrame(data=np.c_[list(range(892,1310)),data.loc[data['is_train']==0,"Survived"]],columns=["PassengerId","Survived"],dtype=np.int64).to_csv('result_20220621_WCGAndModel.txt',index=False,header=True)# 得分：0.73444

In [61]:
sex3=pd.read_csv("./result_20220622_WCGLgb.txt")
sex3WCG=pd.read_csv("./result_20220621_avg_nonWCG.txt")
temp=pd.concat([sex3WCG,sex3['Survived'].rename('Survived3')],axis=1)
np.where(temp['Survived']!=temp['Survived3'])# 4个pclass为1，2的boy预测为1

(array([ 21, 214, 339], dtype=int64),)

In [71]:
data.loc[data['PassengerId'].isin([ 913,1106, 1231])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_train,Title,Surname,Ticket_id,Group_id,WC_count,WCSurvived
912,913,0.0,3,"Olsen, Master. Artur Karl",boy,9.0,0,1,C 17368,3.1708,,S,0,Master,Olsen,3-C 1736-3.1708-S,noGroup,1.0,
1105,1106,1.0,3,"Andersson, Miss. Ida Augusta Margareta",female,38.0,4,2,347091,7.775,,S,0,Miss,Andersson,3-34709-7.775-S,noGroup,1.0,
1230,1231,0.0,3,"Betros, Master. Seman",boy,,0,0,2622,7.2292,,C,0,Master,Betros,3-262-7.2292-C,noGroup,1.0,


In [62]:
temp.iloc[[ 21, 214, 339]]

Unnamed: 0,PassengerId,Survived,Survived3
21,913,1,0
214,1106,0,1
339,1231,1,0


In [63]:
data.loc[data['PassengerId'].isin([ 913,1106, 1231])]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,is_train,Group_id,...,Age_range3,boy1,boy2,boy3,female1,female2,female3,male1,male2,male3
912,913,0,3,boy,9.0,0,1,3.1708,0,noGroup,...,0,0,0,1,0,0,0,0,0,0
1105,1106,1,3,female,38.0,4,2,7.775,0,noGroup,...,0,0,0,0,0,0,1,0,0,0
1230,1231,0,3,boy,,0,0,7.2292,0,noGroup,...,1,0,0,1,0,0,0,0,0,0


In [326]:
data.loc[data['PassengerId'].isin([ 913,1106, 1231])]

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,is_train,Group_id,WC_count,...,Age_range3,boy1,boy2,boy3,female1,female2,female3,male1,male2,male3
912,913,1,3,9.0,0,1,3.1708,0,noGroup,1.0,...,0,0,0,1,0,0,0,0,0,0
1016,1017,0,3,17.0,0,1,16.1,0,noGroup,1.0,...,0,0,0,0,0,0,1,0,0,0
1230,1231,1,3,45.825002,0,0,7.2292,0,noGroup,1.0,...,1,0,0,1,0,0,0,0,0,0
1267,1268,0,3,22.0,2,0,8.6625,0,noGroup,1.0,...,0,0,0,0,0,0,1,0,0,0


In [298]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 46 columns):
 #   Column       Non-Null Count  Dtype           
---  ------       --------------  -----           
 0   PassengerId  1309 non-null   int64           
 1   Survived     1309 non-null   int64           
 2   Pclass       1309 non-null   float64         
 3   Age          1046 non-null   float64         
 4   SibSp        1309 non-null   int64           
 5   Parch        1309 non-null   int64           
 6   Fare         1309 non-null   float64         
 7   is_train     1309 non-null   int64           
 8   Group_id     1309 non-null   object          
 9   WC_count     527 non-null    float64         
 10  WCSurvived   210 non-null    float64         
 11  Cabins       1309 non-null   int64           
 12  familyNum    1309 non-null   int64           
 13  has_family   1309 non-null   int64           
 14  CabinA       1309 non-null   Sparse[uint8, 0]
 15  CabinB       1309 non