In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
import lightgbm
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

# データ読み込み

In [2]:
# データの読み込み
df_train=pd.read_csv("train.csv")
df_test=pd.read_csv("test.csv")
df_test["Perished"] = np.nan
# trainデータとtestデータ結合
df=pd.concat([df_train, df_test])
df.reset_index(drop=True)

Unnamed: 0,PassengerId,Perished,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,1.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,0.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,0.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,0.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,1.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [3]:
df_train.isnull().sum()

PassengerId      0
Perished         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
Perished       418
dtype: int64

# データ分析

In [5]:
# Embarkedごとの死亡率を見る
embarked_counts = df['Embarked'].value_counts()
death_rate = df.groupby('Embarked')['Perished'].mean()
# データフレームにまとめる
summary_df = pd.DataFrame({'Embarked Count': embarked_counts, 'Death Rate': death_rate})
summary_df

Unnamed: 0,Embarked Count,Death Rate
C,270,0.446429
Q,123,0.61039
S,914,0.663043


In [6]:
# チケットの先頭文字を取得
df['Ticketini'] = df['Ticket'].apply(lambda x: str(x)[0])
# Ticketiniごとの個数と死亡率を計算
ticket_counts = df['Ticketini'].value_counts()
death_rate = df.groupby('Ticketini')['Perished'].mean()
# データフレームにまとめる
summary_df = pd.DataFrame({'Ticket Count': ticket_counts, 'Death Rate': death_rate})
summary_df

Unnamed: 0,Ticket Count,Death Rate
1,210,0.369863
2,278,0.535519
3,429,0.760797
4,11,0.8
5,3,1.0
6,9,0.833333
7,13,0.888889
8,2,1.0
9,2,0.0
A,42,0.931034


In [7]:
# Cabinの頭文字ごとの死亡率を見る
df["Cabin_init"] = df["Cabin"].map(lambda x:str(x)[0])
cabin_init_counts = df['Cabin_init'].value_counts()
death_rate = df.groupby('Cabin_init')['Perished'].mean()
# データフレームにまとめる
summary_df = pd.DataFrame({'Cabin_init_Count': cabin_init_counts, 'Death Rate': death_rate})
summary_df

Unnamed: 0,Cabin_init_Count,Death Rate
A,22,0.533333
B,65,0.255319
C,94,0.40678
D,46,0.242424
E,41,0.25
F,21,0.384615
G,5,0.5
T,1,1.0
n,1014,0.700146


# 欠損値補完

In [8]:
df.isnull().sum()

PassengerId       0
Perished        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Ticketini         0
Cabin_init        0
dtype: int64

In [9]:
# Age欠損: Sex, Pclassでグループ分けして、それの中央値で補完
medians = df.groupby(['Sex', 'Pclass'])['Age'].median()
df['Age'] = df.groupby(['Sex', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))
# Fare欠損 (引用: https://3pysci.com/kaggle-titanic-8/)
df["Fare"] = df["Fare"].fillna(float(7.8875))
# Cabin欠損: Missingを表す"M"で補完。後の特徴量エンジニアリングに用いる。
df["Cabin"] = df["Cabin"].fillna("M")
# Embarked欠損: 最頻値であるSで埋める
df["Embarked"] = df["Embarked"].fillna("S")

# 特徴量エンジニアリング

In [10]:
# Embarkedを死ぬ確率が低いものから順にラベルエンコーディング
df["Embarked_num"] = df["Embarked"].map({"C": 0, "S": 1, "Q": 2})
# Sexをラベルエンコーディング
df["Sex_num"] = df["Sex"].map({"female": 0, "male": 1})
# FamilySizeを追加
df["FamilySize"] = df["Parch"] + df["SibSp"] + 1

# Ticketの頭文字(Ticketini)と文字列の長さ(Ticketlen)を追加 (参考: https://lp-tech.net/articles/0QUUd?page=2)
df['Ticketini'] = df['Ticket'].apply(lambda x: str(x)[0])
# 死んでいる確率が低いものから順に0, 1, 2と番号付け
df['Ticketini'] = np.where((df['Ticketini']).isin(['1', '2', "9", "C", "F", 'P', 'S']), df['Ticketini'], np.where((df['Ticketini']).isin(["3", '4', '5', '6', '7',"8", "A", 'L','W']), 2,2))
df['Ticketini'] = df['Ticketini'].replace("1",0).replace("2",1).replace("9",0).replace("C",1).replace("F",0).replace("P",0).replace("S",1)
df["Ticketlen"] = df["Ticket"].map(lambda x: len(str(x)))

# Nameから敬称を取り出し、Salutationとして新たに追加
title_list = []
def extract_title(name):
    for name_split in name.split(): # 名前を分割
        if name_split.endswith("."): # もし分割した名前の終わりがピリオドで
            if name_split != "L.": #もし分割した名前が"L."でなかったら
                title_list.append(name_split)
                return name_split  # 敬称を返す
    return "" # 敬称が見つからなかった場合、空文字を返す
df['Salutation'] = df['Name'].apply(extract_title)

# 各敬称(Salutation)を数字でグループ分けしてhonor列に新たに追加（引用: https://banga-heavy.com/kaggle%E3%82%BF%E3%82%A4%E3%82%BF%E3%83%8B%E3%83%83%E3%82%AF%E3%83%87%E3%83%BC%E3%82%BF%E3%81%A781-100-lightgbmxoptunax%E4%BA%A4%E5%B7%AE%E6%A4%9C%E8%A8%BC/）
df["honor"] = df["Salutation"].copy()
df["honor"]=df["honor"].replace(("Don.","Rev.","Jonkheer.","Capt."),1)
df["honor"]=df["honor"].replace(("Mr."),2)
df["honor"]=df["honor"].replace(("Master.","Dr.","Major.","Col."),3)
df["honor"]=df["honor"].replace(("Miss.",),4)
df["honor"]=df["honor"].replace(("Mrs.","Mme.","Ms.","Lady.","Sir.","Mlle.","Countess.","Dona."),5)

# Cabinの頭文字を取得 (Mはもともと欠損していたことを表す。)
df["Cabin_init"] = df["Cabin"].map(lambda x:str(x)[0])
# Cabinの頭文字がTの人はPclass = 1だったので、同様にPclass = 1であるAに置き換え。
df.loc[df["Cabin_init"] == "T", "Cabin_init"] = "A"
# 死ぬ確率が同等だったものでグループ化
df.loc[df["Cabin_init"].isin(["A", "C"]), "Cabin_init"] = "AC"
df.loc[df["Cabin_init"].isin(["F", "G"]), "Cabin_init"] = "FG"
df.loc[df["Cabin_init"].isin(["B", "D", "E"]), "Cabin_init"] = "BDE"
# 死ぬ確率が低かったものから順に番号付け
df['Cabin_init'] = df['Cabin_init'].replace("BDE",0).replace("AC",1).replace("FG",2).replace("M",3)

# Nameの頭文字と長さを取得し、Nameini, Namelenとして新たに追加。Nameiniはラベルエンコーディング（引用: https://banga-heavy.com/kaggle%E3%82%BF%E3%82%A4%E3%82%BF%E3%83%8B%E3%83%83%E3%82%AF%E3%83%87%E3%83%BC%E3%82%BF%E3%81%A781-100-lightgbmxoptunax%E4%BA%A4%E5%B7%AE%E6%A4%9C%E8%A8%BC/）
df["Nameini"]=df["Name"].map(lambda x: str(x)[0])
df["Nameini"]=LabelEncoder().fit_transform(df["Nameini"])
df["Namelen"]=df["Name"].map(lambda x: len(str(x)))

# （Family_Survival列作成のコードに関して引用: https://www.kaggle.com/code/konstantinmasich/titanic-0-82-0-83）
# 名字を表す列Last_Nameを取得
df['Last_Name'] = df['Name'].apply(lambda x: str.split(x, ",")[0])
# 家族の死亡率を表す列Family_Survivalをデフォルトで作成。（0:生存, 1:死亡）
DEFAULT_SURVIVAL_VALUE = 0.5
df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE
# Last_NameとFareでdfの特定列をグループ化。Last_NameとFareが一致していたら同じ家族である可能性が高い。grpはキーでgrp_dfはそのグループに属するdataframe
for grp, grp_df in df[['Perished','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId', 'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
    # もしグループ内に1人以上いる場合
    if (len(grp_df) != 1):
        # 特定の家族グループ内の各乗客に対してループを開始
        for ind, row in grp_df.iterrows():
            # 自分以外の家族メンバーの乗客の死亡率
            smax = grp_df.drop(ind)['Perished'].max()
            smin = grp_df.drop(ind)['Perished'].min()
            passID = row['PassengerId']
            # もし家族内で少なくとも1人死んでいたらFmaily_Survivalを1にする
            if (smax == 1.0):
                df.loc[df['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                df.loc[df['PassengerId'] == passID, 'Family_Survival'] = 0
# print("Number of passengers with family survival information:",
#       df.loc[df['Family_Survival']!=0.5].shape[0])

# dfをTicketでグループ分け
for _, grp_df in df.groupby('Ticket'):
    # グループに1人以上の乗客がいる時（チケットが同じなら同じCabin/ 家族?）
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            # Family_Survival の値が 0（誰かが生きている）または 0.5（デフォルト値）であるとき
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                # 自分以外の家族メンバーの乗客の死亡率
                smax = grp_df.drop(ind)['Perished'].max()
                smin = grp_df.drop(ind)['Perished'].min()
                passID = row['PassengerId']
                # もし家族内で少なくとも1人死んでいたらFmaily_Survivalを1にする
                if (smax == 1.0):
                    df.loc[df['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    df.loc[df['PassengerId'] == passID, 'Family_Survival'] = 0
# print("Number of passenger with family/group survival information: " +str(df[df['Family_Survival']!=0.5].shape[0]))

# FareとAgeをそれぞれ13, 10のブロックに分けてラベルエンコーディング (ヒントを得た参考: https://www.kaggle.com/code/gunesevitan/titanic-advanced-feature-engineering-tutorial)
df['FareBin'] = pd.qcut(df['Fare'], 13)
df['AgeBin'] = pd.qcut(df['Age'], 10)
# ラベルエンコーディング
label = LabelEncoder()
df['FareBin_Code'] = label.fit_transform(df['FareBin'])
df['AgeBin_Code'] = label.fit_transform(df['AgeBin'])

In [11]:
df.head()

Unnamed: 0,PassengerId,Perished,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Salutation,honor,Nameini,Namelen,Last_Name,Family_Survival,FareBin,AgeBin,FareBin_Code,AgeBin_Code
0,1,1.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,Mr.,2,1,23,Braund,0.5,"(-0.001, 7.25]","(21.0, 22.0]",0,2
1,2,0.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,Mrs.,5,2,51,Cumings,0.5,"(56.496, 83.475]","(34.0, 40.0]",11,7
2,3,0.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,Miss.,4,7,22,Heikkinen,0.5,"(7.896, 8.05]","(25.0, 26.0]",3,4
3,4,0.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,Mrs.,5,5,44,Futrelle,1.0,"(34.075, 56.496]","(34.0, 40.0]",10,7
4,5,1.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,Mr.,2,0,24,Allen,0.5,"(7.896, 8.05]","(34.0, 40.0]",3,7


In [12]:
df.columns

Index(['PassengerId', 'Perished', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Ticketini',
       'Cabin_init', 'Embarked_num', 'Sex_num', 'FamilySize', 'Ticketlen',
       'Salutation', 'honor', 'Nameini', 'Namelen', 'Last_Name',
       'Family_Survival', 'FareBin', 'AgeBin', 'FareBin_Code', 'AgeBin_Code'],
      dtype='object')

In [13]:
# trainとtestを分けて、モデルに使う特徴量を選ぶ。
# 後に示す特徴量の重要度なども考慮して選択する
train = df[:891]
test = df[891:]
cols=['Pclass', 'SibSp','Parch',
       'Embarked_num', 'Sex_num', 'FamilySize',
       'Ticketini', 'Ticketlen', 'honor',
       'Cabin_init', 'Namelen', 'Nameini',
       'Family_Survival', 'FareBin_Code', 'AgeBin_Code']
x = train[cols]
t = train.iloc[:,1:2]
test = test[cols]
print(x.columns)

Index(['Pclass', 'SibSp', 'Parch', 'Embarked_num', 'Sex_num', 'FamilySize',
       'Ticketini', 'Ticketlen', 'honor', 'Cabin_init', 'Namelen', 'Nameini',
       'Family_Survival', 'FareBin_Code', 'AgeBin_Code'],
      dtype='object')


In [14]:
# のちのモデル学習で用いるために.values変換
X_values = x.values
y_values=t.values

# 交差検証(CV) ・ 特徴量の重要度

In [15]:
# クロスバリデーション
# X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.01,random_state=0)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# test_scoreを格納するdataflame
df_test_score = pd.DataFrame()
df_train_score = pd.DataFrame()
# 学習させた訓練モデルを格納する辞書。ここではデータを5分割するから一つのモデル名に対して5つの学習モデルが生成され、格納される。
dict_kfold_trained_models = {}
list_models = [
            LogisticRegression(),
            LinearRegression(),
            LinearSVC(),
            DecisionTreeClassifier(),
            KNeighborsClassifier(),
            RandomForestClassifier(),
            GradientBoostingClassifier(),
            MLPClassifier(),
            XGBClassifier(),
            lightgbm.LGBMClassifier()
            ]
# list_models = [lightgbm.LGBMClassifier()]
for model in list_models:
    # modelの名前をstrとして変換し、printする
    model_name = type(model).__name__
    # print(f"=== {model_name} ===")
    for i, (train_index, test_index) in enumerate(skf.split(X_values, y_values)):
        X_train, y_train = X_values[train_index], y_values[train_index]
        X_valid, y_valid = X_values[test_index], y_values[test_index]
        model.fit(X_train, y_train)
        # 学習済みモデルを辞書に格納
        # もしmodel_nameをキーとしてそのモデルが存在していなかったらmodel_nameをキー、fitさせたmodelを値として追加する。
        if model_name not in dict_kfold_trained_models:
            dict_kfold_trained_models[model_name] = [model]
        # もしすでにmodel_nameをキーとしてそのモデルが存在していたら、model_nameをキー、fitさせたmodelを値として追加する。
        # キー = model_nameに対して値 = リストとすることで、一つのmodel_nameに対して複数のmodel(n_splits個数分)を格納できる
        else:
            dict_kfold_trained_models[model_name].append(model)
        test_score = model.score(X_valid, y_valid)
        train_score = model.score(X_train, y_train)
        df_test_score.at[i, model_name] = test_score
        df_train_score.at[i, model_name] = train_score
        # print(f"test score:{test_score:.3f}\t train score:{train_score:.3f}")

# 特徴量の重要度をdataframeにまとめる
# Feature importanceを格納するDataFrame
df_feature_importance = pd.DataFrame(index=x.columns)

for index, (model_name, models) in enumerate(dict_kfold_trained_models.items()):
            model = models[i]
            if model_name in ['DecisionTreeClassifier', 'RandomForestClassifier', 'GradientBoostingClassifier', 'XGBClassifier', 'LGBMClassifier']:
                try:
                    # 特徴量の重要度を取得
                    feature_importance = model.feature_importances_
                except AttributeError:
                # LightGBMの場合は、feature_importances_の代わりにfeature_importance()を使う
                    feature_importance = model.feature_importance(importance_type='gain')
                # DataFrameに格納
                df_feature_importance[model_name] = feature_importance

# 特徴量の重要度
df_feature_importance

Unnamed: 0,DecisionTreeClassifier,RandomForestClassifier,GradientBoostingClassifier,XGBClassifier,LGBMClassifier
Pclass,0.079395,0.056033,0.103414,0.119203,105
SibSp,0.01271,0.019247,0.003544,0.01052,27
Parch,0.014347,0.018372,0.001902,0.035411,18
Embarked_num,0.012951,0.02227,0.006367,0.018495,47
Sex_num,0.003554,0.132605,0.034742,0.021474,48
FamilySize,0.019627,0.035786,0.022659,0.024393,74
Ticketini,0.038567,0.044383,0.050946,0.054619,121
Ticketlen,0.037002,0.053621,0.049309,0.025145,203
honor,0.369841,0.175386,0.46826,0.479373,85
Cabin_init,0.02013,0.03407,0.014849,0.027914,57


In [16]:
# CV値(テストスコア)
df_test_score.describe()

Unnamed: 0,LogisticRegression,LinearRegression,LinearSVC,DecisionTreeClassifier,KNeighborsClassifier,RandomForestClassifier,GradientBoostingClassifier,MLPClassifier,XGBClassifier,LGBMClassifier
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.835007,0.417728,0.663304,0.787885,0.76203,0.845107,0.831668,0.828266,0.835032,0.833909
std,0.031488,0.055905,0.150731,0.024858,0.037631,0.026001,0.020435,0.029732,0.037718,0.029241
min,0.786517,0.334174,0.410112,0.747191,0.719101,0.814607,0.814607,0.797753,0.780899,0.803371
25%,0.825843,0.392124,0.659218,0.782123,0.724719,0.820225,0.815642,0.797753,0.821229,0.814607
50%,0.843575,0.433182,0.696629,0.797753,0.775281,0.854749,0.820225,0.837079,0.831461,0.821229
75%,0.848315,0.453728,0.758427,0.803371,0.793296,0.865169,0.853933,0.843575,0.870787,0.865169
max,0.870787,0.475432,0.792135,0.808989,0.797753,0.870787,0.853933,0.865169,0.870787,0.865169


# モデル学習: LightGBM

In [17]:
# (参考: https://lp-tech.net/articles/JsWwf）
# 各パラメータは自身で何回も試行を繰り返し、CV値やKaggleスコアを見ながら調整した
X_train,X_valid,y_train,y_valid = train_test_split(X_values,y_values,test_size=0.0075,random_state=0)

lgb_train = lgb.Dataset(np.array(X_train),np.array(y_train))
lgb_eval = lgb.Dataset(np.array(X_valid),np.array(y_valid),reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2'},
    'num_leaves': 200,
    'learning_rate': 0.020,
    'num_iterations':90,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.9,
    'bagging_freq': 7,
    'verbose': 0
}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=5000,
                valid_sets=lgb_eval,
                early_stopping_rounds=1000)
pred= gbm.predict(np.array(test), num_iteration=gbm.best_iteration)

# 予測結果はfloat(小数)で出てくるので、範囲指定してバイナリ変数(2進数)に変換
pred[pred >= 0.50] = 1
pred[pred < 0.50] = 0
# なぜか1, 0がいつもfloatになっていたのでint(整数)に変換
pred = pred.astype(int)
# 「kaggle titanic」ではtarget変数が真逆なので変換
pred_kaggle = abs(pred-1)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's l2: 0.275965
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's l2: 0.270537
[3]	valid_0's l2: 0.266448
[4]	valid_0's l2: 0.261662
[5]	valid_0's l2: 0.257437
[6]	valid_0's l2: 0.252516
[7]	valid_0's l2: 0.247623
[8]	valid_0's l2: 0.242594
[9]	valid_0's l2: 0.2399
[10]	valid_0's l2: 0.237109
[11]	valid_0's l2: 0.232791
[12]	valid_0's l2: 0.229423
[13]	valid_0's l2: 0.226569
[14]	valid_0's l2: 0.223013
[15]	valid_0's l2: 0.217839
[16]	valid_0's l2: 0.21534
[17]	valid_0's l2: 0.210764
[18]	valid_0's l2: 0.208548
[19]	valid_0's l2: 0.203771
[20]	valid_0's l2: 0.199465
[21]	valid_0's l2: 0.195952
[22]	valid_0's l2: 0.191656
[23]	valid_0's l2: 0.189315
[24]	valid_0's l2: 0.187384
[25]	valid_0's l2: 0.187037
[26]	valid_0's l2: 0.183698
[27]	valid_0's l2: 0.182216
[28]	valid_0's l2: 0.180368
[29]	valid_0's l2: 0.178703
[30]	valid_0's 

In [18]:
#　ファイル名を指定
filename = "0811_Mayumi_Nakano"

filename_for_GCI = filename+("_GCI")
filename_for_kaggle = filename+("_kaggle")


# GCI用
submission = pd.read_csv("gender_submission.csv")
submission['Perished'] = pred
submission.to_csv(f"{filename_for_GCI}.csv", index=False)
# Kaggle用
submission_kaggle = pd.read_csv("gender_submission.csv")
submission_kaggle.rename(columns={'Perished': 'Survived'}, inplace=True)
submission_kaggle['Survived'] = pred_kaggle
submission_kaggle.to_csv(f"{filename_for_kaggle}.csv", index=False)