In [None]:
import pandas as pd

titanic = pd.read_csv('titanic.csv')


def _cycle_to_length(pattern, length):
    """Repeat `pattern` end-to-end and cut the result to exactly `length` items.

    Works for any `length >= 0`, unlike `pattern * (length // len(pattern)) + [x]`,
    which only yields the right size when `length % len(pattern) == 1`.
    """
    repeats = -(-length // len(pattern))  # ceiling division
    return (pattern * repeats)[:length]


# manually create a synthetic demographics dataset to merge later on the PassengerId column;
# every column must have exactly one value per passenger or pd.DataFrame raises
n_passengers = len(titanic)
demographics_data = {
    'PassengerId': titanic['PassengerId'],
    'AgeGroup': _cycle_to_length(['adult', 'senior', 'teen', 'child', 'baby'], n_passengers),
    'Income': _cycle_to_length([30000, 45000, 15000, 10000, 50000], n_passengers),
    'EducationLevel': _cycle_to_length(['High School', 'Bachelor', 'Master', 'PhD', 'uneducated'], n_passengers),
}

demographics = pd.DataFrame(demographics_data)

#1. pd.merge(): join datasets column-wise (horizontally) on a shared key column
# how='left' keeps every row of `titanic` and attaches the matching demographics columns
titanic = pd.merge(titanic, demographics, on='PassengerId', how='left')

titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,Income,EducationLevel
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,adult,30000,High School
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,senior,45000,Bachelor
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,teen,15000,Master
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,child,10000,PhD
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,baby,50000,uneducated


In [None]:
#2. pd.concat(): stack datasets that share the same columns on top of each other (axis=0 appends rows)
# first 80% of the rows go to `train`, the remaining 20% to `test`
split_at = int(len(titanic) * 0.8)
train = titanic[:split_at]
test = titanic[split_at:]
combined = pd.concat([train, test], axis=0)
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AgeGroup,Income,EducationLevel
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,adult,30000,High School
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,senior,45000,Bachelor
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,teen,15000,Master
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,child,10000,PhD
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,baby,50000,uneducated
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,708,1,1,"Calderhead, Mr. Edward Pennington",male,42.0,0,0,PC 17476,26.2875,E24,S,teen,15000,Master
708,709,1,1,"Cleaver, Miss. Alice",female,22.0,0,0,113781,151.5500,,S,child,10000,PhD
709,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C,baby,50000,uneducated
710,711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24.0,0,0,PC 17482,49.5042,C90,C,adult,30000,High School


In [None]:
#3. pd.pivot_table(): aggregate one variable for each level of another
# here: mean of `Survived` for every distinct `Pclass`
pivot_table = pd.pivot_table(
    titanic,
    index='Pclass',
    values='Survived',
    aggfunc='mean',
)
pivot_table

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


In [None]:
#4. pd.melt(): reshape data from wide format to long format (handy for plotting)
# the identifier columns are repeated; `Age` and `Fare` are unpivoted into variable/value pairs
id_columns = ['PassengerId', 'Survived']
measure_columns = ['Age', 'Fare']
melted = pd.melt(titanic, id_vars=id_columns, value_vars=measure_columns)
melted

Unnamed: 0,PassengerId,Survived,variable,value
0,1,0,Age,22.00
1,2,1,Age,38.00
2,3,1,Age,26.00
3,4,1,Age,35.00
4,5,0,Age,35.00
...,...,...,...,...
1777,887,0,Fare,13.00
1778,888,1,Fare,30.00
1779,889,0,Fare,23.45
1780,890,1,Fare,30.00


In [None]:
#5. pd.crosstab(): frequency table of one column against another
# rows are passenger classes, columns are the two `Survived` values
crosstab = pd.crosstab(
    index=titanic['Pclass'],
    columns=titanic['Survived'],
)
crosstab

Survived,0,1
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80,136
2,97,87
3,372,119


In [None]:
#6. pd.cut(): value-based discretization (bin edges are given explicitly)
# consecutive edges form one interval per label; rows with a missing Age stay NaN
age_edges = [0, 12, 20, 40, 60, 100]
age_labels = ['Child', 'Teen', 'Adult', 'Middle-aged', 'Senior']
titanic['AgeBin'] = pd.cut(titanic['Age'], bins=age_edges, labels=age_labels)
titanic['AgeBin']

0      Adult
1      Adult
2      Adult
3      Adult
4      Adult
       ...  
886    Adult
887     Teen
888      NaN
889    Adult
890    Adult
Name: AgeBin, Length: 891, dtype: category
Categories (5, object): ['Child' < 'Teen' < 'Adult' < 'Middle-aged' < 'Senior']

In [None]:
#7. pd.qcut(): quantile-based discretization (bin edges are derived from the data)
# one label per quartile, so the number of quantiles equals the number of labels (4)
quartile_labels = ['Q1', 'Q2', 'Q3', 'Q4']
titanic['FareBin'] = pd.qcut(titanic['Fare'], q=len(quartile_labels), labels=quartile_labels)
titanic['FareBin']

0      Q1
1      Q4
2      Q2
3      Q4
4      Q2
       ..
886    Q2
887    Q3
888    Q3
889    Q3
890    Q1
Name: FareBin, Length: 891, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [None]:
#8. pd.df.apply(): apply any manually written function along any axis
def compute_family_size(row):
    """Return the family size for a single passenger row.

    Adds the row's `SibSp` and `Parch` values and one more
    (presumably for the passenger themselves).
    """
    relatives_aboard = row['SibSp'] + row['Parch']
    return relatives_aboard + 1

# axis='columns' (same as axis=1) calls the function once per row, passing that row as a Series
titanic['FamilySize'] = titanic.apply(compute_family_size, axis='columns')
titanic['FamilySize']

0      2
1      2
2      1
3      2
4      1
      ..
886    1
887    1
888    4
889    1
890    1
Name: FamilySize, Length: 891, dtype: int64

In [None]:
#9. pd.df.transform(): usually used for normalization (output keeps the input's shape)
def _standardize(values):
    """Z-score: subtract the mean and divide by the standard deviation."""
    return (values - values.mean()) / values.std()

# missing ages propagate as NaN in the normalized column
titanic['AgeNormalized'] = titanic['Age'].transform(_standardize)
titanic['AgeNormalized']

0     -0.530005
1      0.571430
2     -0.254646
3      0.364911
4      0.364911
         ...   
886   -0.185807
887   -0.736524
888         NaN
889   -0.254646
890    0.158392
Name: AgeNormalized, Length: 891, dtype: float64

In [None]:
#10. pd.df.groupby().agg(): group the data by any number of variables and aggregate each group
# here: mean of `Survived` for every (Sex, Pclass) combination
by_sex_and_class = titanic.groupby(['Sex', 'Pclass'])
grouped = by_sex_and_class.agg({'Survived': 'mean'})
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived
Sex,Pclass,Unnamed: 2_level_1
female,1,0.968085
female,2,0.921053
female,3,0.5
male,1,0.368852
male,2,0.157407
male,3,0.135447


In [None]:
#11. pd.df.rolling(): calculate any rolling statistic (smoothing)
# 3-row moving average; the first two rows are NaN because their window is incomplete
fare_windows = titanic['Fare'].rolling(window=3)
FareRollingMean = fare_windows.mean()
FareRollingMean

0            NaN
1            NaN
2      28.819433
3      44.102767
4      23.025000
         ...    
886    16.391667
887    24.041667
888    22.150000
889    27.816667
890    20.400000
Name: Fare, Length: 891, dtype: float64

In [None]:
#12. pd.df.expanding(): calculate any expanding statistic (e.g. cumulative financial return)
# the window grows from the first row to the current row, so this is a running total of Fare
fare_so_far = titanic['Fare'].expanding()
FareExpandingSum = fare_so_far.sum()
FareExpandingSum

0          7.2500
1         78.5333
2         86.4583
3        139.5583
4        147.6083
          ...    
886    28602.7493
887    28632.7493
888    28656.1993
889    28686.1993
890    28693.9493
Name: Fare, Length: 891, dtype: float64

In [None]:
#13. pd.df.ewm(): exponentially-weighted statistics (more weight on recent rows, less lag than a rolling mean)
fare_weighted = titanic['Fare'].ewm(span=10)
FareEWM = fare_weighted.mean()
FareEWM

0       7.250000
1      42.468315
2      28.582132
3      36.659677
4      28.446617
         ...    
886    17.891306
887    20.092887
888    20.703271
889    22.393585
890    19.731115
Name: Fare, Length: 891, dtype: float64

In [None]:
#14. pd.series.value_counts(): count how often each distinct value occurs (most frequent first)
class_column = titanic['Pclass']
pclass_counts = class_column.value_counts()
pclass_counts

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [None]:
#15. pd.df.pct_change(): fractional change between the current and a prior element (previous row by default)
# the first row has no predecessor, so its value is NaN
fare_column = titanic['Fare']
FarePctChange = fare_column.pct_change()
FarePctChange

0           NaN
1      8.832179
2     -0.888824
3      5.700315
4     -0.848399
         ...   
886   -0.553648
887    1.307692
888   -0.218333
889    0.279318
890   -0.741667
Name: Fare, Length: 891, dtype: float64

# 1. Random Forest Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Model inputs: engineered columns from the cells above plus the synthetic demographics columns.
features = ['Pclass', 'AgeNormalized', 'Fare', 'FamilySize', 'AgeBin', 'FareBin', 'AgeGroup', 'Income', 'EducationLevel']
x = titanic[features]
y = titanic['Survived']

# NOTE(review): AgeNormalized and FareBin were computed on the full dataset before the
# train/test split below, so test-set statistics leak into these two features.
rf_numeric_columns = ['AgeNormalized', 'Fare', 'FamilySize', 'Income']
rf_categorical_columns = ['Pclass', 'AgeBin', 'FareBin', 'AgeGroup', 'EducationLevel']

#preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[ #median imputer for num NANs. Most frequent for cat NANs.
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ]), rf_numeric_columns),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # handle_unknown='ignore': a category that is absent from the training split
            # is encoded as all zeros at predict time instead of raising an error
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ]), rf_categorical_columns),
    ])

#the ML pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=0)) #100 decision trees and fixed seed
])

# 70/30 hold-out split with a fixed seed so the reported accuracy is reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

print("Accuracy Score:", accuracy_score(y_test, y_pred))

Accuracy Score: 0.7089552238805971


# 2. Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Re-read the raw CSV into a separate frame so this section does not reuse the
# columns that were added to `titanic` in the cells above.
titanic_data = pd.read_csv('titanic.csv')
# Column dtypes and non-null counts, to see which columns contain missing values.
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
#filling NANs
# Assign the filled Series back to the column. Calling fillna(inplace=True) on a column
# selection (`df[col].fillna(..., inplace=True)`) is chained assignment: pandas 2.2 warns
# about it, and with Copy-on-Write it only modifies a temporary and leaves the frame unchanged.
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0])
titanic_data['Fare'] = titanic_data['Fare'].fillna(titanic_data['Fare'].median())
titanic_data['FamilySize'] = titanic_data['SibSp'] + titanic_data['Parch'] + 1

#age and fare bins
# .cat.codes turns each interval into its integer position (0, 1, 2, ...)
titanic_data['AgeBin'] = pd.cut(titanic_data['Age'], bins=[0, 12, 20, 40, 60, 100]).cat.codes
titanic_data['FareBin'] = pd.qcut(titanic_data['Fare'], 4).cat.codes

#dropping Cabin because it is mostly NaN, and Name and Ticket because they are not used as features
titanic_data = titanic_data.drop(columns=['Cabin', 'Name', 'Ticket'])

In [None]:
# Re-inspect the frame after cleaning: non-null counts and the remaining/added columns.
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     891 non-null    object 
 9   FamilySize   891 non-null    int64  
 10  AgeBin       891 non-null    int8   
 11  FareBin      891 non-null    int8   
dtypes: float64(2), int64(6), int8(2), object(2)
memory usage: 71.5+ KB


In [None]:
# Column groups routed to the two preprocessing branches below.
# AgeBin and FareBin are integer bin codes here, so they are treated as numerical.
numerical_features = ['Age', 'Fare', 'FamilySize', 'AgeBin', 'FareBin']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# the preprocessing pipelines
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': a category missing from a training fold is encoded
    # as all zeros when scoring the held-out fold instead of raising an error
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# the full pipeline
# random_state is fixed (as in the Random Forest section) so repeated runs give the same score
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=0))
])

# x still contains SibSp and Parch; they are in neither feature list, so the
# ColumnTransformer (default remainder='drop') does not pass them to the model
x = titanic_data.drop(columns=['PassengerId', 'Survived'])
y = titanic_data['Survived']

# cross-validation eval
# preprocessing lives inside the pipeline, so it is re-fit on the training part of each fold
scores = cross_val_score(pipeline, x, y, cv=5, scoring='accuracy')
print(f'Cross-Validation Accuracy: {scores.mean()}')

Cross-Validation Accuracy: 0.8372732408511707
