### Imports and Get Data

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import KBinsDiscretizer

In [2]:
# Location of the Titanic training CSV. Path.home() resolves to the current
# user's home directory, so the notebook is not tied to one machine; edit
# DATA_PATH if your copy of train.csv lives somewhere else.
DATA_PATH = Path.home() / 'Downloads' / 'train.csv' #change path to yours
df = pd.read_csv(DATA_PATH)

# Separate the target column from the feature columns.
y = df['Survived']
X = df.drop(columns='Survived')

# No test_size is given, so the library's default proportions are used; the
# fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

---
---



In [3]:
X_train.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
298,299,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S
884,885,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S
247,248,2,"Hamalainen, Mrs. William (Anna)",female,24.0,0,2,250649,14.5,,S


### Code Snippet 1

Mutable vs. Immutable Data Types

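A pandas DataFrame is an example of a mutable data type. This means that the object can be changed in place, so a function that assigns new columns to its argument also changes the caller's DataFrame unless it works on a copy first (as `famsize` does below).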

In [4]:
def famsize(df):
    """Count passengers per family-size category.

    Family size is ``SibSp + Parch``. Each size is mapped to a category code:

    * 0          -> 1
    * 1 or 2     -> 2
    * 3          -> 3
    * more than 3 -> 4
    * anything else (e.g. a missing value) -> 0

    All columns are added to a copy, so the DataFrame passed in by the
    caller is left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'SibSp' and 'Parch'.

    Returns
    -------
    pd.Series
        ``value_counts()`` of the 'Family_size_cat' column.
    """
    work = df.copy()
    work['Family_size'] = work['SibSp'] + work['Parch']
    size = work['Family_size']

    # One flat table of (condition, category) pairs instead of nested
    # np.where calls; the first matching condition wins, default is 0.
    conditions = [
        size == 0,
        (size == 1) | (size == 2),
        size == 3,
        size > 3,
    ]
    categories = [1, 2, 3, 4]
    work['Family_size_cat'] = np.select(conditions, categories, default=0)

    return work['Family_size_cat'].value_counts()

In [5]:
# famsize() adds its columns to an internal copy, so X_train itself is not changed by this call
new = famsize(X_train)

Another example:

In [6]:
global_list = ['a', 'b', 'c']
second_list = global_list  # plain assignment makes no copy: both names refer to the same list object

In [7]:
def modify(li):
    """Append the string 'd' to ``li`` in place.

    The list object passed in is mutated, so every name bound to that
    object sees the new element. Returns None, which is what
    ``list.append`` itself returns.
    """
    li.append('d')
    return None

In [8]:
modify(global_list)

In [9]:
second_list #weird!! (second_list is the same object as global_list, which modify() appended to in place)

['a', 'b', 'c', 'd']

---

### Code Snippet 2

In [10]:
def k_binner(df, col_name, kbins=None):
    """One-hot encode equal-width bins of a single numeric column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``col_name``; it is only read, never modified.
    col_name : str
        Name of the column to discretize.
    kbins : KBinsDiscretizer, optional
        Discretizer to use. When omitted, a new 5-bin uniform discretizer is
        created and fit on ``df`` (the original behaviour, which means the
        bin edges depend on the data passed in). To bin test data with the
        same edges as the training data, pass a discretizer that has already
        been fit on the training data; a discretizer that is not yet fit is
        fit on ``df`` first. It must use a one-hot encoding ('onehot' or
        'onehot-dense') so that there is one output column per bin.

    Returns
    -------
    pd.DataFrame
        One column per bin, labelled ``"<col_name> <lower> to <upper>"`` with
        edges rounded to one decimal. The frame has a fresh default index,
        not the index of ``df``.
    """
    columns = df[[col_name]]

    if kbins is None:
        # 'onehot-dense' returns a plain ndarray, which avoids converting a
        # sparse result through todense() and its np.matrix return type.
        kbins = KBinsDiscretizer(n_bins=5, encode='onehot-dense', strategy='uniform')

    # bin_edges_ only exists after fitting: fit once, never refit.
    if not hasattr(kbins, 'bin_edges_'):
        kbins.fit(columns)

    t = kbins.transform(columns)
    if hasattr(t, 'toarray'):
        # a caller-supplied discretizer with encode='onehot' yields a sparse matrix
        t = t.toarray()

    # create nice labels from consecutive pairs of bin edges
    edges = kbins.bin_edges_[0].round(1)
    labels = [
        f"{col_name} {edge1} to {edge2}"
        for edge1, edge2 in zip(edges[:-1], edges[1:])
    ]

    #create a DataFrame
    df_bins = pd.DataFrame(t, columns=labels)

    return df_bins

In [11]:
k_binner(X_train.copy(), 'SibSp').head(1)

Unnamed: 0,SibSp 0.0 to 1.6,SibSp 1.6 to 3.2,SibSp 3.2 to 4.8,SibSp 4.8 to 6.4,SibSp 6.4 to 8.0
0,1.0,0.0,0.0,0.0,0.0


In [12]:
k_binner(X_test.copy(), 'SibSp').head(1)

Unnamed: 0,SibSp 0.0 to 0.8,SibSp 0.8 to 1.6,SibSp 1.6 to 2.4,SibSp 2.4 to 3.2,SibSp 3.2 to 4.0
0,0.0,1.0,0.0,0.0,0.0


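Key Idea:
- The bin edges in the two outputs above differ (0.0 to 8.0 for the training data vs. 0.0 to 4.0 for the test data) because `k_binner` fits a new discretizer on whatever data it is given, so the resulting train and test columns do not mean the same thing.
- We only want to fit on the training data, and only transform on the testing data.
- If the transformer is ALREADY FIT, then do `.transform()`. If the transformer is NOT YET FIT, then do `.fit()` then `.transform()` (or `.fit_transform()`).
- This is a good use case for sticking with FunctionTransformers, Pipelines, and ColumnTransformers all the way through.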

### Code Snippet 3

In [13]:
def deck_df(df:pd.DataFrame) -> pd.DataFrame: #type annotations
    """Return a one-column DataFrame holding the first character of each 'Cabin' value.

    Missing cabins stay missing. The input frame is only read; the result
    is a new object that keeps the input's index and the column name 'Cabin'.
    """
    deck_letters = df['Cabin'].str.get(0)
    return deck_letters.to_frame()

def deck_series(ser:pd.Series) -> pd.Series: #type annotations
    """Return the first character of every string in ``ser``.

    Missing values stay missing. The input Series is only read; the result
    is a new Series with the same index and name.
    """
    first_chars = ser.str.get(0)
    return first_chars

In [14]:
ft = FunctionTransformer(deck_series)

In [15]:
ft.fit_transform(X_train['Cabin'])

298      C
884    NaN
247    NaN
478    NaN
305      C
      ... 
106    NaN
270    NaN
860    NaN
435      B
102      D
Name: Cabin, Length: 668, dtype: object

### Code Snippet 4

In [16]:
test_string = 'Saalfeld, Mr. Adolphe'

In [17]:
test_string.split(',')[1].strip().split('.')[0].strip()

'Mr'

In [18]:
def Name_Title(X_train):
    """Original Function

    Overwrites the 'Name' column with the passenger's title: for each name,
    the second comma-separated segment is taken, cut at its first period,
    and stripped of surrounding whitespace.

    Caution: the column is assigned on the DataFrame that was passed in, so
    the caller's object is modified in place; that same object is returned.
    """
    X_train['Name'] = X_train.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip())
    return X_train

Cleaned up Function:

In [19]:
def Name_Title(df):
    """Return a copy of ``df`` whose 'Name' column holds only the title.

    For each name, the second comma-separated segment is taken, cut at its
    first period, and stripped of surrounding whitespace. The DataFrame
    passed in is not modified.
    """
    #1. give argument more general name
    titled = df.copy() #2. Make copy to avoid in-place modification "bug"
    full_names = titled['Name'] #3. keep syntax consistent
    titled['Name'] = full_names.apply(lambda x: x.split(',')[1].split('.')[0].strip())
    return titled

In [20]:
def name_title(s):  #4. use snake_case
    """Extract the passenger's title (e.g. 'Mr') from a full name string.

    The name is expected to look like ``'Surname, Title. First names'``.
    The second comma-separated segment is taken, cut at its first period,
    and stripped of surrounding whitespace.

    Parameters
    ----------
    s : str
        Full name of one passenger.

    Returns
    -------
    str
        The title without surrounding whitespace.

    Raises
    ------
    IndexError
        If ``s`` contains no comma.
    """
    #5. define the function separately so that it's easier to read (rather than cramming in a long lambda function in a single line of code)
    after_surname = s.split(',')[1].strip()
    # strip again so the result matches Name_Title / extract_single_title,
    # e.g. 'Doe, Mr . John' gives 'Mr' rather than 'Mr '
    title = after_surname.split('.')[0].strip()
    return title

In [21]:
X_train['Name'].apply(name_title) 

298        Mr
884        Mr
247       Mrs
478        Mr
305    Master
        ...  
106      Miss
270        Mr
860        Mr
435      Miss
102        Mr
Name: Name, Length: 668, dtype: object

---

**BONUS:** If we still want to do everything in a single line of code and package it into a transformer, we could do something like this:

In [22]:
def transform_titles(series:pd.Series):
    """Map a Series of full passenger names to a Series of titles.

    Example of a nested (inner) function, i.e. function inside a function:
    the per-name logic lives in a helper that is only visible here, and the
    outer function applies it to every element. Index and name of the input
    Series are kept.
    """

    def extract_single_title(s:str):
        """Extract and return passenger title from the full name (string)"""
        after_surname = s.split(',')[1]
        return after_surname.strip().split('.')[0].strip()

    titles = series.apply(extract_single_title)
    return titles

In [23]:
title_transformer = FunctionTransformer(transform_titles)
title_transformer.fit_transform(X_train['Name'])

298        Mr
884        Mr
247       Mrs
478        Mr
305    Master
        ...  
106      Miss
270        Mr
860        Mr
435      Miss
102        Mr
Name: Name, Length: 668, dtype: object