In [18]:
import pandas as pd
import numpy as np
from math import pi

from bokeh.palettes import RdYlGn6, RdYlGn9, Spectral10, Category20c

from bokeh.layouts import layout, row
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
from bokeh.models import BasicTicker, ColorBar, ColumnDataSource, LinearColorMapper, PrintfTickFormatter
from bokeh.transform import transform, cumsum

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression



def match_group(value, quantiles):
    for n,qnt_value in enumerate(quantiles):
        if value < qnt_value:
            return n
    return len(quantiles)


def perc_survived_vis(df, feature_label):
    df['synt'] = 1
    surv = pd.pivot_table(df, index=feature_label, columns='Survived', values='synt', aggfunc={'synt': np.sum}).fillna(0)
    surv['perc'] = surv[1]/(surv[0]+surv[1])*100
    surv['total'] = surv[0]+surv[1]
    surv['angle'] = surv['total']/surv['total'].sum() * 2*pi
    surv.index.name = feature_label
        
    # Draw chart
    output_notebook()
    # Bar plot
    cats = [str(x) for x in surv.index]
    source = ColumnDataSource(data=dict(cats=cats, perc=surv.perc, legend=surv.index))
    p = figure(
        title="{} survived %".format(feature_label), 
        x_range=cats,
        toolbar_location=None,
        tools="hover", tooltips="@legend: @perc",
        plot_height=250, plot_width=250
    )
    p.vbar(x='cats', top='perc', width=0.9, source=source)
    p.xgrid.grid_line_color = None
    # Pie chart
    Category20c[1] = Category20c[3][:1]
    Category20c[2] = Category20c[3][:2]
    source1 = ColumnDataSource(data=dict(color=Category20c[len(surv)], 
                                         angle=surv.angle, 
                                         legend=surv.index, 
                                         total=surv.total))
    p1 = figure(
        title="{} distribution".format(feature_label),
        toolbar_location=None,
        x_range=(-0.5, 1.0),
        tools="hover", tooltips="@legend: @total",
        plot_height=250, plot_width=250)
    p1.wedge(x=0, y=1, radius=0.4,
            start_angle=cumsum('angle', include_zero=True), 
            end_angle=cumsum('angle'), line_color="white", 
            fill_color='color', legend='legend', source=source1)
    p1.axis.axis_label=None
    p1.axis.visible=False
    p1.grid.grid_line_color = None
    # show
    show(row([p, p1]))


def cv_score(model, x, y, cv):
    cvs = cross_val_score(model, x, y, cv=cv)
    print('score:',model.score(x, y))
    print('cross:', cvs)
    print('cross summary:', np.median(cvs), np.mean(cvs))
    

def vis_colormap(x, y, z, z_colormap=None, title='', x_label='', y_label=''):
    """
    """
    if not z_colormap:
        Category20c[1] = Category20c[3][:1]
        Category20c[2] = Category20c[3][:2]
        z_colormap = Category20c[len(np.unique(z))]
    
    colors = [z_colormap[val] for val in z]

    p = figure(title = title)
    p.xaxis.axis_label = x_label
    p.yaxis.axis_label = y_label
    p.circle(x, y, color=colors, fill_alpha=0.2, size=10)
    
    output_notebook()
    show(p)

In [35]:
df = pd.read_csv('datasets/titanic.csv')
print(df.columns)
print(df.shape)
print(df.isnull().sum())

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
(891, 12)
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [71]:
df['sex'] = df.Sex.apply(lambda x: 1 if x == 'male' else 0)

rnd_age = np.random.randint(df.Age.mean()-df.Age.std(), df.Age.mean()+df.Age.std(), size=df.Age.isnull().sum())
df['age'] = df.Age
df['age'][df.Age.isnull()] = rnd_age

df['age_na_median'] = df.Age
df['age_na_median'][df.Age.isnull()] = df.Age.median()


# rnd_fare = np.random.randint(df.Fare.mean()-df.Fare.std(), df.Fare.mean()+df.Fare.std(), size=df.Fare.isnull().sum())
# df['fare'] = df.Fare
# df['fare'][df.Fare.isnull()] = rnd_fare

# age_qnt = df['age_na_median'].quantile([0.25,0.50, 0.75,0.95, 0.99])
# print(age_qnt)
age_groups = {0:14, 1:21, 2:33, 3:55, 4:70}
df['age_group'] = df['age_na_median'].apply(lambda x: match_group(x, age_qnt))

fare_qnt = df['Fare'].quantile([0.25,0.50, 0.75,0.95, 0.99])
df['fare_group'] = df['Fare'].apply(lambda x: match_group(x, fare_qnt))

# df['agen'] =  normalize(np.array(df['Age']).reshape(-1,1), axis=0)
# df['faren'] =  normalize(np.array(df['Fare']).reshape(-1,1), axis=0)

enc_embarked  = LabelEncoder()
df['embarked']  =  enc_embarked.fit_transform(df['Embarked'].fillna('S'))

enc_cabin  = LabelEncoder()
df['cabin_le']  =  enc_cabin.fit_transform(df['Cabin'].fillna('undefined'))
df['have_cabin'] = df['Cabin'].fillna(0).apply(lambda c: 0 if c == 0 else 1)

df['synt'] = 1
df['id'] = df.PassengerId
print(df.head())

##engineer the family size feature
df['family_size'] = df['SibSp'] + df['Parch'] + 1
### new try 

# Create new feature IsAlone from FamilySize
df['is_alone'] = 0
df.loc[df['family_size'] == 1, 'is_alone'] = 1


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare      ...       age_group fare_group  \
0      0         A/5 21171   7.2500      ...               1          0   
1      0          PC 17599  71.2833      ...               3          3   
2      0  STON/O2. 3101282   7.9250      ...               1          1   
3      0            

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [73]:
# df[['Embarked','id']].groupby(by='Embarked').nunique()
print(df['Age'].isnull().sum())
print(df['age'].isnull().sum())

177
0


In [72]:
perc_survived_vis(df, 'Sex')
perc_survived_vis(df, 'age_group')
perc_survived_vis(df, 'fare_group')
perc_survived_vis(df, 'Pclass')
perc_survived_vis(df, 'Parch')
perc_survived_vis(df, 'SibSp')
perc_survived_vis(df, 'family_size')
perc_survived_vis(df, 'is_alone')
perc_survived_vis(df, 'Embarked')
perc_survived_vis(df, 'have_cabin')

In [62]:
# create design matrix X and target vector y
X = np.array(df[['sex','age_group','fare_group','Pclass','is_alone','embarked','have_cabin']])
y = np.array(df['Survived'])
print(X.shape)
print(y.shape)
# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
clf = SVC()
clf = clf.fit(X_train, y_train)
cv_score(clf, X_test, y_test, 10)

(891, 7)
(891,)
score: 0.8203389830508474
cross: [0.8        0.9        0.76666667 0.76666667 0.7        0.82758621
 0.79310345 0.86206897 0.89655172 0.86206897]
cross summary: 0.8137931034482759 0.8174712643678161


In [66]:
X = np.array(df[['sex','age_group','fare_group','Pclass','is_alone','embarked','have_cabin']])
y = np.array(df['Survived'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
lr = LogisticRegression(solver='lbfgs', max_iter=100, multi_class='multinomial', class_weight="balanced")
lr.fit(X_train, y_train)
cv_score(lr, X_test, y_test, 21)

score: 0.7932203389830509
cross: [0.8        0.8        0.66666667 1.         0.8        0.8
 0.73333333 0.78571429 0.64285714 0.64285714 0.78571429 0.85714286
 0.71428571 0.71428571 0.85714286 0.84615385 0.92307692 0.84615385
 0.76923077 0.92307692 0.84615385]
cross summary: 0.8 0.7978021978021979


In [68]:
X = np.array(df[['sex','age_group','fare_group','Pclass','is_alone','embarked','have_cabin']])
y = np.array(df['Survived'])
dt = DecisionTreeClassifier(class_weight="balanced")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
dt.fit(X_train, y_train)
cv_score(dt, X_test, y_test, 10)

score: 0.7728813559322034
cross: [0.73333333 0.66666667 0.7        0.63333333 0.76666667 0.83333333
 0.66666667 0.79310345 0.82142857 0.82142857]
cross summary: 0.75 0.7435960591133004


In [69]:
colormap = {0:'red', 1:'green'}
vis_colormap(df['Age'], df['Fare'], df['Survived'], z_colormap=colormap, x_label='Age', y_label='Fare')
vis_colormap(df['Age'], df['have_cabin'], df['Survived'], z_colormap=colormap, x_label='Age', y_label='Cabin')
vis_colormap(df['Parch'], df['SibSp'], df['Survived'], z_colormap=colormap, x_label='Parch', y_label='SibSp')