# init

In [1]:
import numpy as np
import pandas as pd

# load

In [2]:
path_data = "../data/"
path_raw = path_data + "raw/"
path_mid = path_data + "mid/"
path_clns = path_data + "clns/"

In [3]:
cats = pd.read_csv(path_mid+"cats.csv", index_col=0)
nums = pd.read_csv(path_mid+"nums.csv", index_col=0)
bools = pd.read_csv(path_mid+"bools.csv", index_col=0)

# clns

## fillna

##### embarked

In [4]:
cats["embarked"] = cats["embarked"].fillna("S")
cats["embarked"].isna().any()

False

##### age

In [5]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [6]:
fill_mean = nums[["age", "survived"]]
fill_mean = fill_mean[~fill_mean.survived.isna()]
fill_mean["age"] = fill_mean.age.fillna(fill_mean.age.mean())

In [7]:
fill_median = nums[["age", "survived"]]
fill_median = fill_median[~fill_median.survived.isna()]
fill_median["age"] = fill_median.age.fillna(fill_median.age.median())

In [8]:
# 線形回帰で値埋め
def zscore(x):
    m = x.mean()
    s = x.std(ddof=1)
    return (x-m)/s

X = pd.get_dummies(cats, drop_first=True)
z = zscore(nums.drop(["age", "survived"], 1))
X = X.join(z)
y = nums.age

is_na = y.isna()
y = np.log1p(y)

rgs = LinearRegression()
rgs.fit(X[~is_na], y[~is_na])

pred = rgs.predict(X[is_na])
pred = np.exp(pred)-1

base = X[~is_na]
base["age"] = np.exp(y[~is_na])-1

fill = X[is_na]
fill["age"] = pred

fill_linear = pd.concat([base, fill]).join(nums[["survived"]])
fill_linear = fill_linear[["age", "survived"]]
fill_linear = fill_linear[~fill_linear.survived.isna()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
def check_auc(data):
    X, y = data[["age"]], data["survived"]
    clf = LogisticRegression()
    clf.fit(X, y)
    proba = clf.predict_proba(X)[:,1]
    auc = roc_auc_score(y_true=y, y_score=proba)
    print(auc)

In [10]:
check_auc(fill_mean)

0.5329419918511361


In [11]:
check_auc(fill_median)

0.5290565799974796


In [12]:
check_auc(fill_linear)

0.5316398538245055


In [13]:
nums["age"] = nums.fillna(nums.age.mean())

## union-value

In [14]:
cats_union = cats.copy()

##### embarked

In [15]:
cats_union["embarked"] = cats_union.embarked.replace(["Q", "S"], "QorS")

## family-size

In [16]:
family = nums.parch + nums.sibsp
parch = nums.parch.apply(lambda x: "4+" if x >= 4 else x)
sibsp = nums.sibsp.apply(lambda x: "4+" if x >= 4 else x)

# feature engineering

## onehot-encoding

In [17]:
pd.get_dummies(cats_union, drop_first=True).to_csv(path_clns+"onehot_cats.csv")

In [18]:
pd.get_dummies(pd.concat([parch, sibsp], axis=1), drop_first=True).to_csv(path_clns+"onehot_parch_sibsp.csv")

In [19]:
pd.get_dummies(family, drop_first=True, prefix="family-size").to_csv(path_clns+"onehot_familysize.csv")

In [20]:
(bools*1).to_csv(path_clns+"onehot_bools.csv")

In [21]:
nums[["survived"]].to_csv(path_clns+"y.csv")

In [22]:
is_child = (nums.age <= 7)*1
is_child.name = "is_child"
is_child.to_frame().to_csv(path_clns+"onehot_ischild.csv")

## target-encoding

In [23]:
def tgt_encoding(data, y):
    data = data.copy()
    idname = data.index.name
    data = data.reset_index()
    train = data.dropna()

    for x in set(data)-set([idname, y]):
        dfg = train.groupby(x)[y].mean()
        dfg = dfg.to_frame()
        data = data.merge(dfg, on=x, suffixes=["", "_%s_tgt"%x], how="left")

    data = data.set_index(idname)
    data = data.filter(regex="_tgt")
    return data

In [24]:
y = "survived"

In [25]:
data = cats.join(nums[[y]])
data = tgt_encoding(data, y)
data.to_csv(path_clns+"tgt_cats.csv")

In [26]:
data = bools.join(nums[[y]])
data = tgt_encoding(data, y)
data.to_csv(path_clns+"tgt_bools.csv")

In [27]:
data = pd.concat([parch, sibsp], axis=1).join(nums[[y]])
data = tgt_encoding(data, y)
data.to_csv(path_clns+"tgt_parch_sibsp.csv")

In [28]:
data = family.to_frame()
data.columns = ["familysize"]
data = data.join(nums[[y]])
data = tgt_encoding(data, y)
data.to_csv(path_clns+"tgt_familysize.csv")

## log, zscore

In [29]:
def zscore(x):
    m = x.mean()
    s = x.std(ddof=1)
    return (x-m)/s

In [30]:
nums_tgt = nums[["age", "fare"]]
z = zscore(nums_tgt)
z.to_csv(path_clns+"num_z.csv")

In [31]:
z = zscore(np.log1p(nums_tgt))
z.to_csv(path_clns+"num_logz.csv")