In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.experimental import (
    enable_iterative_imputer,
)
from sklearn import (
    ensemble,
    impute,
    model_selection,    
    preprocessing,
    tree,
)

url = (
    "http://biostat.mc.vanderbilt.edu/"
    "wiki/pub/Main/DataSets/titanic3.xls"
)
df = pd.read_excel(url)


In [3]:
X2 = pd.DataFrame(
    {
        "a": range(5),
        "b": [-100, -50, 0, 200, 1000],
    }
)
X2

Unnamed: 0,a,b
0,0,-100
1,1,-50
2,2,0
3,3,200
4,4,1000


In [4]:
from sklearn import preprocessing
std = preprocessing.StandardScaler()
std.fit_transform(X2)

array([[-1.41421356, -0.75995002],
       [-0.70710678, -0.63737744],
       [ 0.        , -0.51480485],
       [ 0.70710678, -0.02451452],
       [ 1.41421356,  1.93664683]])

In [5]:
std.scale_
std.mean_
std.var_

array([2.000e+00, 1.664e+05])

In [6]:
X_std = (X2 - X2.mean()) / X2.std()
X_std
X_std.mean()
X_std.std()

a    1.0
b    1.0
dtype: float64

In [7]:
X3 = X2.copy()
from fastai.structured import scale_vars
scale_vars(X3, mapper=None)
X3.std()
X3.mean()

ModuleNotFoundError: No module named 'fastai.structured'

In [8]:
from sklearn import preprocessing
mms = preprocessing.MinMaxScaler()
mms.fit(X2)
mms.transform(X2)

array([[0.        , 0.        ],
       [0.25      , 0.04545455],
       [0.5       , 0.09090909],
       [0.75      , 0.27272727],
       [1.        , 1.        ]])

In [9]:
(X2 - X2.min()) / (X2.max() - X2.min())

Unnamed: 0,a,b
0,0.0,0.0
1,0.25,0.045455
2,0.5,0.090909
3,0.75,0.272727
4,1.0,1.0


In [10]:
X_cat = pd.DataFrame(
    {
        "name": ["George", "Paul"],
        "inst": ["Bass", "Guitar"],
    }
)
X_cat

Unnamed: 0,name,inst
0,George,Bass
1,Paul,Guitar


In [11]:
pd.get_dummies(X_cat, drop_first=True)

Unnamed: 0,name_Paul,inst_Guitar
0,0,0
1,1,1


In [14]:
import janitor as jn
X_cat2 = pd.DataFrame(
    {
        "A": [1, None, 3],
        "names": [
            "Fred,George",
            "George",
            "John,Paul",
        ],
    }
)
jn.expand_column(X_cat2, "names", sep=",")

Unnamed: 0,A,names,Fred,George,John,Paul
0,1.0,"Fred,George",1,1,0,0
1,,George,0,1,0,0
2,3.0,"John,Paul",0,0,1,1


In [17]:
from sklearn import preprocessing
lab = preprocessing.LabelEncoder()
lab.fit_transform(X_cat.name)

array([0, 1])

In [18]:
lab.inverse_transform([1, 1, 0])

array(['Paul', 'Paul', 'George'], dtype=object)

In [19]:
X_cat.name.astype(
    "category"
).cat.as_ordered().cat.codes + 1

0    1
1    2
dtype: int8

In [20]:
mapping = X_cat.name.value_counts()
X_cat.name.map(mapping)

0    1
1    1
Name: name, dtype: int64

In [23]:
from collections import Counter
c = Counter()
def triples(val):
    for i in range(len(val)):
        c[val[i : i + 3]] += 1
df.name.apply(triples)
c.most_common(10)

[(', M', 1282),
 (' Mr', 954),
 ('r. ', 830),
 ('Mr.', 757),
 ('s. ', 460),
 ('n, ', 320),
 (' Mi', 283),
 ('iss', 261),
 ('ss.', 261),
 ('Mis', 260)]

In [24]:
df.name.str.extract(
    "([A-Za-z]+)\.", expand=False
).head()

0      Miss
1    Master
2      Miss
3        Mr
4       Mrs
Name: name, dtype: object

In [25]:
df.name.str.extract(
    "([A-Za-z]+)\.", expand=False
).value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Dr            8
Rev           8
Col           4
Major         2
Mlle          2
Ms            2
Capt          1
Mme           1
Don           1
Countess      1
Jonkheer      1
Dona          1
Sir           1
Lady          1
Name: name, dtype: int64

In [26]:
import category_encoders as ce
he = ce.HashingEncoder(verbose=1)
he.fit_transform(X_cat)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,0,0,0,1,0,1,0,0
1,0,2,0,0,0,0,0,0


In [27]:
size_df = pd.DataFrame(
    {
        "name": ["Fred", "John", "Matt"],
        "size": ["small", "med", "xxl"],
    }
)
ore = ce.OrdinalEncoder(
    mapping=[
        {
            "col": "size",
            "mapping": {
                "small": 1,
                "med": 2,
                "lg": 3,
            },
        }
    ]
)
ore.fit_transform(size_df)

Unnamed: 0,name,size
0,Fred,1.0
1,John,2.0
2,Matt,-1.0


In [28]:
def get_title(df):
    return df.name.str.extract(
        "([A-Za-z]+)\.", expand=False
    )
te = ce.TargetEncoder(cols="Title")
te.fit_transform(
    df.assign(Title=get_title), df.survived
)["Title"].head()

0    0.676923
1    0.508197
2    0.676923
3    0.162483
4    0.786802
Name: Title, dtype: float64

In [29]:
from fastai.tabular.transform import (
    add_datepart,
)
dates = pd.DataFrame(
    {
        "A": pd.to_datetime(
            ["9/17/2001", "Jan 1, 2002"]
        )
    }
)
add_datepart(dates, "A")
dates.T

Unnamed: 0,0,1
AYear,2001,2002
AMonth,9,1
AWeek,38,1
ADay,17,1
ADayofweek,0,1
ADayofyear,260,1
AIs_month_end,False,False
AIs_month_start,False,True
AIs_quarter_end,False,False
AIs_quarter_start,False,True


In [30]:
from pandas.api.types import is_numeric_dtype
def fix_missing(df, col, name, na_dict):
    if is_numeric_dtype(col):
        if pd.isnull(col).sum() or (
            name in na_dict
        ):
            df[name + "_na"] = pd.isnull(col)
            filler = (
                na_dict[name]
                if name in na_dict
                else col.median()
            )
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict
data = pd.DataFrame({"A": [0, None, 5, 100]})
fix_missing(data, data.A, "A", {})
data

Unnamed: 0,A,A_na
0,0.0,False
1,5.0,True
2,5.0,False
3,100.0,False


In [31]:
data = pd.DataFrame({"A": [0, None, 5, 100]})
data["A_na"] = data.A.isnull()
data["A"] = data.A.fillna(data.A.median())

In [32]:
agg = (
    df.groupby("cabin")
    .agg("min,max,mean,sum".split(","))
    .reset_index()
)
agg.columns = [
    "_".join(c).strip("_")
    for c in agg.columns.values
]
agg_df = df.merge(agg, on="cabin")