In [201]:
import os
import re
import numpy as np
import pandas as pd

In [202]:
train_test_path = os.path.dirname(os.path.abspath(""))
os.listdir(train_test_path)

['ML', 'test.csv', 'train.csv']

In [203]:
train_csv_parh = os.path.join(train_test_path, "train.csv")
test_csv_parh = os.path.join(train_test_path, "test.csv")

In [204]:
df = pd.read_csv(train_csv_parh)
df = df.set_index("PassengerId") # set this col as index
df_test = pd.read_csv(test_csv_parh)
df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [205]:
# most missing values on Cabin column which is categorical type of data (better to drop it)
# missing values on age column will be restored by median/mean preprocessing
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [206]:
df["Cabin"].value_counts()

B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: Cabin, Length: 147, dtype: int64

### feature engineering
* add new col FamilySize that is sum of SibSp and Parch
* drop numerical values from Cabin col
* extract titles from Name col
* imputate missing data on Age col

In [207]:
# FamilySize
df["FamilySize"] = df["SibSp"] + df["Parch"]
#drop SibSp and Parch cols
df = df.drop(columns=["SibSp", "Parch"])
df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.2500,,S,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C85,C,1
3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.9250,,S,0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1000,C123,S,1
5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,211536,13.0000,,S,0
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,112053,30.0000,B42,S,0
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,W./C. 6607,23.4500,,S,3
890,1,1,"Behr, Mr. Karl Howell",male,26.0,111369,30.0000,C148,C,0


In [208]:
# observation of Cabin column
cabins = df["Cabin"].dropna()
cabins.head(40)

PassengerId
2              C85
4             C123
7              E46
11              G6
12            C103
22             D56
24              A6
28     C23 C25 C27
32             B78
53             D33
55             B30
56             C52
62             B28
63             C83
67             F33
76           F G73
89     C23 C25 C27
93             E31
97              A5
98         D10 D12
103            D26
111           C110
119        B58 B60
124           E101
125            D26
129          F E69
137            D47
138           C123
140            B86
149             F2
152             C2
167            E33
171            B19
175             A7
178            C49
184             F4
186            A32
194             F2
195             B4
196            B80
Name: Cabin, dtype: object

In [209]:
# dropping numerical data from Cabin col
pattern = re.compile(r'[a-zA-Z]+')
df_iter_copy = df.copy()
for n,i in enumerate(df_iter_copy["Cabin"], start=1):
    matches = []
    if type(i) == str:
        i = "".join(i.split(" ")).strip()
        matches = pattern.findall(i)
        matches = list(set(matches))[0] # remove duplicates
        if len(matches) > 1:
            matches = "Not sure"
        df.loc[n, "Cabin"] = matches
#         print(df["Cabin"].iloc[n])
    continue
df["Cabin"]

PassengerId
1      NaN
2        C
3      NaN
4        C
5      NaN
      ... 
887    NaN
888      B
889    NaN
890      C
891    NaN
Name: Cabin, Length: 891, dtype: object

In [210]:
#* fill NaN by new category "Not Sure"
df["Cabin"] = df["Cabin"].fillna("Not Sure")
df["Cabin"]

PassengerId
1      Not Sure
2             C
3      Not Sure
4             C
5      Not Sure
         ...   
887    Not Sure
888           B
889    Not Sure
890           C
891    Not Sure
Name: Cabin, Length: 891, dtype: object

In [211]:
df.isnull().sum()

Survived        0
Pclass          0
Name            0
Sex             0
Age           177
Ticket          0
Fare            0
Cabin           0
Embarked        2
FamilySize      0
dtype: int64

In [212]:
# extract titles from Name col
all_titles = ["Dr." , "Mr.", "Mrs.", "Ms.", "Miss.", 
              "Master.", "Rev.", "Mlle.", "Major."]
positions, titles = [], []
for n, i in enumerate(df["Name"].str.split(), start=1):
        for title in all_titles:
            if title in i:
                positions.append(n)
                titles.append(title)

In [213]:
df["Name"].value_counts()

Braund, Mr. Owen Harris                     1
Boulos, Mr. Hanna                           1
Frolicher-Stehli, Mr. Maxmillian            1
Gilinski, Mr. Eliezer                       1
Murdlin, Mr. Joseph                         1
                                           ..
Kelly, Miss. Anna Katherine "Annie Kate"    1
McCoy, Mr. Bernard                          1
Johnson, Mr. William Cahoone Jr             1
Keane, Miss. Nora A                         1
Dooley, Mr. Patrick                         1
Name: Name, Length: 891, dtype: int64

In [214]:
# create new col and populate by positons and titles
df.loc[positions, "Title"] = titles
df["Title"]

PassengerId
1        Mr.
2       Mrs.
3      Miss.
4       Mrs.
5        Mr.
       ...  
887     Rev.
888    Miss.
889    Miss.
890      Mr.
891      Mr.
Name: Title, Length: 891, dtype: object

In [215]:
# drop rows in which title is not included in all_titles list
df = df[df["Title"].isin(all_titles) == True]

In [216]:
df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,FamilySize,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.2500,Not Sure,S,1,Mr.
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C,C,1,Mrs.
3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.9250,Not Sure,S,0,Miss.
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1000,C,S,1,Mrs.
5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.0500,Not Sure,S,0,Mr.
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,211536,13.0000,Not Sure,S,0,Rev.
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,112053,30.0000,B,S,0,Miss.
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,W./C. 6607,23.4500,Not Sure,S,3,Miss.
890,1,1,"Behr, Mr. Karl Howell",male,26.0,111369,30.0000,C,C,0,Mr.


In [217]:
# Imputation of Age missing values
from sklearn.impute import SimpleImputer
age_array = df["Age"].to_numpy()
median_imputer = SimpleImputer(strategy="median")
median_imputer.fit(age_array.reshape(-1,1))
median_imputer.statistics_

array([28.])

In [218]:
# Imputation of Age missing values
age_median_imputed = median_imputer.transform(df["Age"].to_numpy().reshape(-1,1))
age_median_imputed.reshape(-1)
print(df["Age"].shape, age_median_imputed.shape)
df.loc[:, "Age"] = age_median_imputed
df.isnull().sum()
df
# df.isnull().sum()

(882,) (882, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "Age"] = age_median_imputed


Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,FamilySize,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.2500,Not Sure,S,1,Mr.
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C,C,1,Mrs.
3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.9250,Not Sure,S,0,Miss.
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1000,C,S,1,Mrs.
5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.0500,Not Sure,S,0,Mr.
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,211536,13.0000,Not Sure,S,0,Rev.
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,112053,30.0000,B,S,0,Miss.
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,W./C. 6607,23.4500,Not Sure,S,3,Miss.
890,1,1,"Behr, Mr. Karl Howell",male,26.0,111369,30.0000,C,C,0,Mr.


In [219]:
# isnull test
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age           0
Ticket        0
Fare          0
Cabin         0
Embarked      2
FamilySize    0
Title         0
dtype: int64

In [220]:
# drop rows with unknown Embarked value
df = df.dropna()

In [221]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 880 entries, 1 to 891
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    880 non-null    int64  
 1   Pclass      880 non-null    int64  
 2   Name        880 non-null    object 
 3   Sex         880 non-null    object 
 4   Age         880 non-null    float64
 5   Ticket      880 non-null    object 
 6   Fare        880 non-null    float64
 7   Cabin       880 non-null    object 
 8   Embarked    880 non-null    object 
 9   FamilySize  880 non-null    int64  
 10  Title       880 non-null    object 
dtypes: float64(2), int64(3), object(6)
memory usage: 82.5+ KB


In [222]:
# drop Name column
df = df.drop(columns="Name")
df = df.drop(columns="Ticket")

### Data transformation
* better use ColumnTransformer for df object
* normalize numerical columns (StandartScalar)
* encode string categorical data (OneHotEncoder)
* encode string categorical data (LabelEncoding) when ranking makes sense

#### categorical data ["Sex", "Ticket", "Cabin", "Embarked", "FamilySize", "Title"]
#### numerical data ["Age", "Fare" ]

In [223]:
# "Pclass" doesn't require any preprocessing
categorical_cols = ["Sex", "Embarked", "FamilySize", "Title"]
numerical_cols = ["Age", "Fare" ]
ranking_cols = "Cabin"

In [224]:
# define classes to preprocess numerical and categorical data
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
scalar_encoder = StandardScaler()
oh_encoder = OneHotEncoder()
label_encoder = LabelEncoder()

In [225]:
def create_transformer(dtype, cols):
    if dtype == "cat":
        encoder = oh_encoder
    elif dtype == "num":
        encoder = scalar_encoder
    elif dtype == "rank":
        encoder = label_encoder
    return (dtype, encoder, cols)

In [226]:
categorical = create_transformer("cat", categorical_cols)
numerical = create_transformer("num", numerical_cols)
ranking = create_transformer("rank", ranking_cols)

In [227]:
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([categorical, numerical, ranking])
ct

In [228]:
arr_prepared = ct.fit_transform(df)
ct.get_feature_names_out()

TypeError: LabelEncoder.fit_transform() takes 2 positional arguments but 3 were given

In [176]:
df_prepared.toarray(), ct.get_feature_names_out()

(array([[ 0.        ,  1.        ,  0.        , ...,  0.        ,
         -0.55542426, -0.49573753],
        [ 1.        ,  0.        ,  0.        , ...,  0.        ,
          0.68922988,  0.78914626],
        [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         -0.24426072, -0.48219307],
        ...,
        [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         -0.08867896, -0.17067046],
        [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         -0.24426072, -0.03923902],
        [ 0.        ,  1.        ,  0.        , ...,  0.        ,
          0.22248458, -0.4857046 ]]),
 array(['cat__Sex_female', 'cat__Sex_male', 'cat__Cabin_A', 'cat__Cabin_B',
        'cat__Cabin_C', 'cat__Cabin_D', 'cat__Cabin_E', 'cat__Cabin_F',
        'cat__Cabin_G', 'cat__Cabin_Not Sure', 'cat__Cabin_Not sure',
        'cat__Cabin_T', 'cat__Embarked_C', 'cat__Embarked_Q',
        'cat__Embarked_S', 'cat__FamilySize_0', 'cat__FamilySize_1',
        'cat__FamilySize

In [177]:
new_df = pd.DataFrame(df_prepared.toarray(), columns=ct.get_feature_names_out())
new_df

Unnamed: 0,cat__Sex_female,cat__Sex_male,cat__Cabin_A,cat__Cabin_B,cat__Cabin_C,cat__Cabin_D,cat__Cabin_E,cat__Cabin_F,cat__Cabin_G,cat__Cabin_Not Sure,...,cat__Title_Major.,cat__Title_Master.,cat__Title_Miss.,cat__Title_Mlle.,cat__Title_Mr.,cat__Title_Mrs.,cat__Title_Ms.,cat__Title_Rev.,num__Age,num__Fare
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.555424,-0.495738
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.689230,0.789146
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.244261,-0.482193
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.455857,0.424283
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.455857,-0.479685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.166470,-0.380359
876,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.788797,-0.039239
877,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.088679,-0.170670
878,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.244261,-0.039239


In [164]:
ct_test_num = ColumnTransformer([("num", StandardScaler(), ["Age", "Fare"])])

In [165]:
ct_test_cat = ColumnTransformer([("cat", OneHotEncoder(), ["Sex"])])

In [174]:
prep_test_num = ct_test_num.fit_transform(df)
prep_test_num, ct_test_num.get_feature_names_out()

(array([[-0.55542426, -0.49573753],
        [ 0.68922988,  0.78914626],
        [-0.24426072, -0.48219307],
        ...,
        [-0.08867896, -0.17067046],
        [-0.24426072, -0.03923902],
        [ 0.22248458, -0.4857046 ]]),
 array(['num__Age', 'num__Fare'], dtype=object))

In [175]:
prep_test_cat = ct_test_cat.fit_transform(df)
prep_test_cat, ct_test_cat.get_feature_names_out()

(array([[0., 1.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [0., 1.],
        [0., 1.]]),
 array(['cat__Sex_female', 'cat__Sex_male'], dtype=object))

In [170]:
sp_df = pd.DataFrame(df_prep_test_both)
sp_df

Unnamed: 0,0,1,2,3
0,-0.555424,-0.495738,0.0,1.0
1,0.689230,0.789146,1.0,0.0
2,-0.244261,-0.482193,1.0,0.0
3,0.455857,0.424283,1.0,0.0
4,0.455857,-0.479685,0.0,1.0
...,...,...,...,...
875,-0.166470,-0.380359,0.0,1.0
876,-0.788797,-0.039239,1.0,0.0
877,-0.088679,-0.170670,1.0,0.0
878,-0.244261,-0.039239,0.0,1.0
