In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read test.csv and train.csv from the working directory and report the
# (rows, columns) shape of each resulting frame.
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
print(f"Dimensions of train: {train.shape}")
print(f"Dimensions of test: {test.shape}")

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Print the training frame's column labels as a NumPy array.
print(train.columns.to_numpy())

In [None]:
# Count the missing values in every column of the training frame.
train.isna().sum()

In [None]:
# Build the exploration frame with assign() so that the extra 'Died' column
# lands on a new DataFrame; binding `data` directly to `train` would make both
# names refer to one object and silently add the column to `train` as well.
# 'Died' is the complement of 'Survived' (assumes 'Survived' is a 0/1 flag).
data = train.assign(Died=1 - train['Survived'])

In [None]:
# Stacked bar chart of survivors vs. deaths per sex.
# The two count columns are selected *before* summing so that only they are
# aggregated; summing the whole frame first would also try to "sum" the
# free-text columns, which is wasted work and, depending on the pandas
# version, yields concatenated strings or an error.
(
    data.groupby('Sex')[['Survived', 'Died']]
    .sum()
    .plot(kind='bar', stacked=True)
)

In [None]:
# Split violin plot of the 'Age' distribution for each 'Sex', with one half of
# every violin per 'Survived' value.
sns.violinplot(data=data, x='Sex', y='Age', hue='Survived', split=True)

In [None]:
# Stacked histogram of 'Fare', with one series for rows where Survived == 1
# and one for rows where Survived == 0.
figure = plt.figure(figsize=(32, 16))
survived_mask = data['Survived'] == 1
dead_mask = data['Survived'] == 0
plt.hist(
    [data.loc[survived_mask, 'Fare'], data.loc[dead_mask, 'Fare']],
    bins=50,
    stacked=True,
    label=['Survived', 'Dead'],
)
plt.xlabel('Fare')
plt.ylabel('Number of Passengers')
plt.legend();

In [None]:
# One histogram per column that DataFrame.hist can plot, then enlarge the
# layout rectangle so the subplots get more room than the default figure area.
hist_options = dict(
    bins=15,
    color='steelblue',
    edgecolor='black',
    linewidth=1.0,
    xlabelsize=8,
    ylabelsize=8,
    grid=False,
)
data.hist(**hist_options)
plt.tight_layout(rect=(0, 0, 1.2, 1.2))

In [None]:
# Summary statistics for the 'Age' column of the training frame.
train.loc[:, "Age"].describe()

In [None]:
# Aggregate 'Survived' per 'Pclass' with pivot_table's default aggregation
# and show the result as a bar chart.
class_pivot = pd.pivot_table(data, index="Pclass", values="Survived")
class_pivot.plot(kind="bar")
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
def loaddf(filename):
    """Load a passenger CSV and turn it into a model-ready feature frame.

    Steps, in order:
      1. read ``filename`` with ``pd.read_csv`` and drop the unused
         'Cabin', 'Ticket' and 'Name' columns;
      2. replace 'Sex' and then 'Embarked' with one indicator column per
         distinct value (appended at the end of the frame);
      3. add 'Fsize' = 'Parch' + 'SibSp' + 1 (family size).

    Parameters
    ----------
    filename : path or buffer accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The transformed frame; missing values are left untouched.
    """
    frame = pd.read_csv(filename).drop(columns=['Cabin', 'Ticket', 'Name'])

    # One-hot encode each categorical column and swap it for its indicators.
    for categorical in ('Sex', 'Embarked'):
        indicators = pd.get_dummies(frame[categorical])
        frame = pd.concat([frame.drop(columns=[categorical]), indicators], axis=1)

    # Family size: parents/children + siblings/spouses + the passenger.
    frame['Fsize'] = frame['Parch'] + frame['SibSp'] + 1
    return frame

In [None]:
def cleanse(df, dropna=True):
    """Return a copy of ``df`` with missing values removed or imputed.

    The input frame is never modified; a new DataFrame is always returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. When ``dropna`` is False it must contain 'Age' and
        'Fare' columns.
    dropna : bool, default True
        * True  -- drop every row that has at least one missing value.
        * False -- fill missing 'Age' and 'Fare' with the respective column
          mean, then fill any remaining missing value with 0.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If ``dropna`` is False and 'Age' or 'Fare' is absent.
    """
    if dropna:
        return df.dropna()

    # assign() builds a new frame, so the caller's DataFrame is left intact
    # (assigning into ``df[...]`` directly would mutate the argument).
    imputed = df.assign(
        Age=df['Age'].fillna(df['Age'].mean()),
        Fare=df['Fare'].fillna(df['Fare'].mean()),
    )
    # Everything still missing after the mean imputation becomes 0.
    return imputed.fillna(0)

In [None]:
def load_X(df):
    """Return the model's feature matrix taken from ``df``.

    The result holds the columns 'male', 'Pclass', 'Age' and 'Fsize', in that
    order. A ``KeyError`` is raised if any of them is missing from ``df``.
    """
    feature_columns = ['male', 'Pclass', 'Age', 'Fsize']
    return df.loc[:, feature_columns]

In [None]:
def load_y(df):
    """Return the target vector: the 'Survived' column of ``df`` as a Series.

    A ``KeyError`` is raised if ``df`` has no 'Survived' column.
    """
    return df['Survived']

In [None]:
# Reload the training data through the feature pipeline, imputing missing
# values (dropna=False) instead of discarding rows.
train = cleanse(loaddf('train.csv'), dropna=False)
train_X, train_y = load_X(train), load_y(train)

# Fit a shallow tree. random_state is pinned because scikit-learn permutes the
# candidate features at every split, so equally good splits could otherwise be
# resolved differently from one run to the next.
model = DecisionTreeClassifier(max_depth=3, random_state=42)
model.fit(train_X, train_y)

# Feature importances as a one-row table, one column per input feature.
pd.DataFrame([model.feature_importances_], columns=train_X.columns)

In [None]:
import subprocess

from IPython.display import Image
from sklearn.tree import export_graphviz

# Write the fitted tree as Graphviz source. class_names are given in ascending
# class order, i.e. the first label belongs to Survived == 0.
export_graphviz(
    model,
    out_file='tree.dot',
    feature_names=train_X.columns.tolist(),
    class_names=['ตาย', 'รอด'],
    filled=True,
    rounded=True,
)

# Render tree.dot to tree.png so the image displayed below is always produced
# from the model fitted in this run. Requires the Graphviz `dot` executable on
# PATH; check=True turns a failed render into an exception instead of silently
# showing a missing or stale picture.
subprocess.run(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png'], check=True)

Image(filename='tree.png')