In [None]:
# Import Matplotlib and allow plots to show in the Jupyter Notebook
import matplotlib.pyplot as plt
%matplotlib inline
# Import NumPy, Pandas and Seaborn
import numpy as np
import pandas as pd
import seaborn as sns
# Automatically reload imported modules before running code, so edits to them are picked up
%reload_ext autoreload
%autoreload 2

In [None]:
def preparePercentageString(value, maxValue):
    """Format ``value`` as a percentage of ``maxValue`` followed by the raw value.

    Parameters
    ----------
    value : number
        The count to report.
    maxValue : number
        The total that ``value`` is measured against.

    Returns
    -------
    str
        ``'<percentage>% (<value>)'`` with the percentage shown to two
        decimals, e.g. ``'25.00% (1)'``.
    """
    # A zero total would raise ZeroDivisionError for plain ints and yield
    # 'nan%' for NumPy integers; report it as 0 % instead.
    share = (value / maxValue) * 100 if maxValue else 0.0
    return f'{share:0.2f}% ({value})'

def draw_companions_impact_plot(file_name):
    """Draw a grouped bar chart of survival for lone vs. accompanied passengers.

    Reads the CSV at ``file_name`` and counts rows by ``Survived`` (1 or 0)
    and by whether ``Companions`` is 0 ("Alone") or greater than 0
    ("Accompanied"). Every bar is annotated with its count and its share of
    all rows in the file.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read; it must contain the ``Survived`` and
        ``Companions`` columns.

    Notes
    -----
    Draws through the pyplot interface and sets the global
    ``figure.figsize`` rcParam to (18, 8).
    """
    df = pd.read_csv(file_name)
    row_amount = len(df.index)

    has_survived = df['Survived'] == 1
    has_died = df['Survived'] == 0
    groups = [
        ('Alone', df['Companions'] == 0),
        ('Accompanied', df['Companions'] > 0),
    ]
    labels = [label for label, _ in groups]
    survived = [np.sum(has_survived & mask) for _, mask in groups]
    died = [np.sum(has_died & mask) for _, mask in groups]

    bar_width = 0.45
    spc = np.arange(len(groups))

    # figure.figsize is only read when a figure is created, so it must be set
    # before the first pyplot call. Setting it afterwards left the first chart
    # of a fresh kernel at the default size.
    plt.rcParams["figure.figsize"] = (18, 8)
    plt.ylabel("Amount of people", fontsize=18)
    plt.title('Impact of Companions on Survival', fontsize=20)
    plt.bar(spc, survived, width=bar_width, label='Survived', edgecolor='k')
    plt.bar(spc + bar_width, died, width=bar_width, label='Died', edgecolor='k')
    # The tick sits midway between the two bars of each group.
    plt.xticks(spc + bar_width / 2, labels, fontsize=18)
    plt.legend(loc=2, prop={'size': 20})

    for i, (survived_count, died_count) in enumerate(zip(survived, died)):
        plt.annotate(preparePercentageString(survived_count, row_amount), xy=(i, survived_count), ha='center', va='bottom', fontsize=15)
        plt.annotate(preparePercentageString(died_count, row_amount), xy=(i + bar_width, died_count), ha='center', va='bottom', fontsize=15)
        

In [None]:
# Companions vs. survival chart for train_data_results.csv
draw_companions_impact_plot("train_data_results.csv")

In [None]:
# Companions vs. survival chart for test_data_results.csv
draw_companions_impact_plot("test_data_results.csv")

In [None]:
def draw_entry_place_impact_plot(file_name):
    """Draw a grouped bar chart of survival per port of embarkation.

    Reads the CSV at ``file_name`` and counts rows by ``Survived`` (1 or 0)
    for each of the indicator columns ``Embarked_C``, ``Embarked_Q`` and
    ``Embarked_S`` (a row belongs to a port when its column equals 1).
    Every bar is annotated with its count and its share of all rows.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read; it must contain ``Survived`` and the
        three ``Embarked_*`` columns.

    Notes
    -----
    Draws through the pyplot interface and sets the global
    ``figure.figsize`` rcParam to (18, 8).
    """
    df = pd.read_csv(file_name)
    row_amount = len(df.index)

    has_survived = df['Survived'] == 1
    has_died = df['Survived'] == 0
    ports = [
        ('Cherbourg', 'Embarked_C'),
        ('Queenstown', 'Embarked_Q'),
        ('Southampton', 'Embarked_S'),
    ]
    labels = [label for label, _ in ports]
    survived = [np.sum(has_survived & (df[column] == 1)) for _, column in ports]
    died = [np.sum(has_died & (df[column] == 1)) for _, column in ports]

    bar_width = 0.45
    spc = np.arange(len(ports))

    # figure.figsize is only read when a figure is created, so it must be set
    # before the first pyplot call rather than after plt.ylabel/plt.title.
    plt.rcParams["figure.figsize"] = (18, 8)
    plt.ylabel("Amount of people", fontsize=18)
    plt.title('Impact of Entry Place on Survival', fontsize=20)
    plt.bar(spc, survived, width=bar_width, label='Survived', edgecolor='k')
    plt.bar(spc + bar_width, died, width=bar_width, label='Died', edgecolor='k')
    # The tick sits midway between the two bars of each group.
    plt.xticks(spc + bar_width / 2, labels, fontsize=18)
    plt.legend(loc=2, prop={'size': 20})

    for i, (survived_count, died_count) in enumerate(zip(survived, died)):
        plt.annotate(preparePercentageString(survived_count, row_amount), xy=(i, survived_count), ha='center', va='bottom', fontsize=15)
        plt.annotate(preparePercentageString(died_count, row_amount), xy=(i + bar_width, died_count), ha='center', va='bottom', fontsize=15)


In [None]:
# Entry place vs. survival chart for train_data_results.csv
draw_entry_place_impact_plot("train_data_results.csv")

In [None]:
# Entry place vs. survival chart for test_data_results.csv
draw_entry_place_impact_plot("test_data_results.csv")

In [None]:
def draw_class_impact_plot(file_name):
    """Draw a grouped bar chart of survival per passenger class.

    Reads the CSV at ``file_name`` and counts rows by ``Survived`` (1 or 0)
    for ``Pclass`` values 1, 2 and 3. Every bar is annotated with its count
    and its share of all rows in the file.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read; it must contain the ``Survived`` and
        ``Pclass`` columns.

    Notes
    -----
    Draws through the pyplot interface and sets the global
    ``figure.figsize`` rcParam to (18, 8).
    """
    df = pd.read_csv(file_name)
    row_amount = len(df.index)

    has_survived = df['Survived'] == 1
    has_died = df['Survived'] == 0
    classes = [
        ('First(Upper)', 1),
        ('Second(Middle)', 2),
        ('Third(Lower)', 3),
    ]
    labels = [label for label, _ in classes]
    survived = [np.sum(has_survived & (df['Pclass'] == pclass)) for _, pclass in classes]
    died = [np.sum(has_died & (df['Pclass'] == pclass)) for _, pclass in classes]

    bar_width = 0.45
    spc = np.arange(len(classes))

    # figure.figsize is only read when a figure is created, so it must be set
    # before the first pyplot call rather than after plt.ylabel/plt.title.
    plt.rcParams["figure.figsize"] = (18, 8)
    plt.ylabel("Amount of people", fontsize=18)
    plt.title('Impact of Class on Survival', fontsize=20)
    plt.bar(spc, survived, width=bar_width, label='Survived', edgecolor='k')
    plt.bar(spc + bar_width, died, width=bar_width, label='Died', edgecolor='k')
    # The tick sits midway between the two bars of each group.
    plt.xticks(spc + bar_width / 2, labels, fontsize=18)
    plt.legend(loc=2, prop={'size': 20})

    for i, (survived_count, died_count) in enumerate(zip(survived, died)):
        plt.annotate(preparePercentageString(survived_count, row_amount), xy=(i, survived_count), ha='center', va='bottom', fontsize=15)
        plt.annotate(preparePercentageString(died_count, row_amount), xy=(i + bar_width, died_count), ha='center', va='bottom', fontsize=15)


In [None]:
# Class vs. survival chart for train_data_results.csv
draw_class_impact_plot("train_data_results.csv")

In [None]:
# Class vs. survival chart for test_data_results.csv
draw_class_impact_plot("test_data_results.csv")

In [None]:
def draw_sex_impact_plot(file_name):
    """Draw a grouped bar chart of survival for men and women.

    Reads the CSV at ``file_name`` and counts rows by ``Survived`` (1 or 0)
    for the indicator columns ``Male`` and ``Female`` (a row belongs to a
    group when its column equals 1). Every bar is annotated with its count
    and its share of all rows in the file.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read; it must contain the ``Survived``,
        ``Male`` and ``Female`` columns.

    Notes
    -----
    Draws through the pyplot interface and sets the global
    ``figure.figsize`` rcParam to (18, 8).
    """
    df = pd.read_csv(file_name)
    row_amount = len(df.index)

    has_survived = df['Survived'] == 1
    has_died = df['Survived'] == 0
    sexes = [
        ('Men', 'Male'),
        ('Women', 'Female'),
    ]
    labels = [label for label, _ in sexes]
    survived = [np.sum(has_survived & (df[column] == 1)) for _, column in sexes]
    died = [np.sum(has_died & (df[column] == 1)) for _, column in sexes]

    bar_width = 0.45
    spc = np.arange(len(sexes))

    # figure.figsize is only read when a figure is created, so it must be set
    # before the first pyplot call rather than after plt.ylabel.
    plt.rcParams["figure.figsize"] = (18, 8)
    plt.ylabel("Amount of people", fontsize=18)
    plt.title('Impact of Sex on Survival', fontsize=20)
    plt.bar(spc, survived, width=bar_width, label='Survived', edgecolor='k')
    plt.bar(spc + bar_width, died, width=bar_width, label='Died', edgecolor='k')
    # The tick sits midway between the two bars of each group.
    plt.xticks(spc + bar_width / 2, labels, fontsize=18)
    plt.legend(loc=2, prop={'size': 20})

    for i, (survived_count, died_count) in enumerate(zip(survived, died)):
        plt.annotate(preparePercentageString(survived_count, row_amount), xy=(i, survived_count), ha='center', va='bottom', fontsize=15)
        plt.annotate(preparePercentageString(died_count, row_amount), xy=(i + bar_width, died_count), ha='center', va='bottom', fontsize=15)
        

In [None]:
# Sex vs. survival chart for train_data_results.csv
draw_sex_impact_plot("train_data_results.csv")

In [None]:
# Sex vs. survival chart for test_data_results.csv
draw_sex_impact_plot("test_data_results.csv")

In [None]:
def draw_age_impact_plot(file_name):
    """Draw a grouped bar chart of survival per age bucket.

    Reads the CSV at ``file_name`` and counts rows by ``Survived`` (1 or 0)
    within the ``Age`` buckets 0-17, 18-29, 30-39, 40-49, 50-59 and 60+
    (lower bound inclusive, upper bound exclusive). Every bar is annotated
    with its count and its share of all rows in the file. Rows whose ``Age``
    is missing or negative match no bucket, so the shares need not add up
    to 100 %.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read; it must contain the ``Survived`` and
        ``Age`` columns.

    Notes
    -----
    Draws through the pyplot interface and sets the global
    ``figure.figsize`` rcParam to (18, 8).
    """
    df = pd.read_csv(file_name)
    row_amount = len(df.index)

    has_survived = df['Survived'] == 1
    has_died = df['Survived'] == 0
    age = df['Age']
    buckets = [
        ('0-17', (age >= 0) & (age < 18)),
        ('18-29', (age >= 18) & (age < 30)),
        ('30-39', (age >= 30) & (age < 40)),
        ('40-49', (age >= 40) & (age < 50)),
        ('50-59', (age >= 50) & (age < 60)),
        ('60+', age >= 60),
    ]
    labels = [label for label, _ in buckets]
    survived = [np.sum(has_survived & mask) for _, mask in buckets]
    died = [np.sum(has_died & mask) for _, mask in buckets]

    bar_width = 0.48
    spc = np.arange(len(buckets))

    # figure.figsize is only read when a figure is created, so it must be set
    # before the first pyplot call rather than after plt.ylabel/plt.title.
    plt.rcParams["figure.figsize"] = (18, 8)
    plt.ylabel("Amount of people", fontsize=18)
    plt.title('Impact of Age on Survival', fontsize=20)
    plt.bar(spc, survived, width=bar_width, label='Survived', edgecolor='k')
    plt.bar(spc + bar_width, died, width=bar_width, label='Died', edgecolor='k')
    # The tick sits midway between the two bars of each group.
    plt.xticks(spc + bar_width / 2, labels, fontsize=18)
    plt.legend(loc=2, prop={'size': 20})

    for i, (survived_count, died_count) in enumerate(zip(survived, died)):
        plt.annotate(preparePercentageString(survived_count, row_amount), xy=(i, survived_count), ha='center', va='bottom', fontsize=13)
        plt.annotate(preparePercentageString(died_count, row_amount), xy=(i + bar_width, died_count), ha='center', va='bottom', fontsize=13)
        
    
    

In [None]:
# Age vs. survival chart for train_data_results.csv
draw_age_impact_plot("train_data_results.csv")

In [None]:
# Age vs. survival chart for test_data_results.csv
draw_age_impact_plot("test_data_results.csv")

In [None]:
def draw_fare_impact_plot(file_name):
    """Draw a grouped bar chart of survival per fare bucket.

    Reads the CSV at ``file_name`` and counts rows by ``Survived`` (1 or 0)
    for each of the indicator columns ``Fare_(0, 50]``, ``Fare_(50, 100]``,
    ``Fare_(100, 200]``, ``Fare_(200, 500]`` and ``Fare_(500, 1000]`` (a row
    belongs to a bucket when its column equals 1). Every bar is annotated
    with its count and its share of all rows in the file.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read; it must contain ``Survived`` and the
        five ``Fare_*`` columns listed above.

    Notes
    -----
    Draws through the pyplot interface and sets the global
    ``figure.figsize`` rcParam to (18, 8).
    """
    df = pd.read_csv(file_name)
    row_amount = len(df.index)

    has_survived = df['Survived'] == 1
    has_died = df['Survived'] == 0
    # The last label used to read '[500-1000)', with the brackets inverted
    # relative to its column name and to every other label.
    buckets = [
        ('(0-50]', 'Fare_(0, 50]'),
        ('(50-100]', 'Fare_(50, 100]'),
        ('(100-200]', 'Fare_(100, 200]'),
        ('(200-500]', 'Fare_(200, 500]'),
        ('(500-1000]', 'Fare_(500, 1000]'),
    ]
    labels = [label for label, _ in buckets]
    survived = [np.sum(has_survived & (df[column] == 1)) for _, column in buckets]
    died = [np.sum(has_died & (df[column] == 1)) for _, column in buckets]

    bar_width = 0.45
    spc = np.arange(len(buckets))

    # figure.figsize is only read when a figure is created, so it must be set
    # before the first pyplot call rather than after plt.ylabel/plt.title.
    plt.rcParams["figure.figsize"] = (18, 8)
    plt.ylabel("Amount of people", fontsize=18)
    plt.title('Impact of Fare on Survival', fontsize=20)
    plt.bar(spc, survived, width=bar_width, label='Survived', edgecolor='k')
    plt.bar(spc + bar_width, died, width=bar_width, label='Died', edgecolor='k')
    # The tick sits midway between the two bars of each group.
    plt.xticks(spc + bar_width / 2, labels, fontsize=18)
    plt.legend(loc=1, prop={'size': 20})

    for i, (survived_count, died_count) in enumerate(zip(survived, died)):
        plt.annotate(preparePercentageString(survived_count, row_amount), xy=(i, survived_count), ha='center', va='bottom', fontsize=13)
        plt.annotate(preparePercentageString(died_count, row_amount), xy=(i + bar_width, died_count), ha='center', va='bottom', fontsize=13)
    
    

In [None]:
# Fare vs. survival chart for train_data_results.csv
draw_fare_impact_plot("train_data_results.csv")

In [None]:
# Fare vs. survival chart for test_data_results.csv
draw_fare_impact_plot("test_data_results.csv")

In [None]:
def draw_heatmap(file_name):
    """Draw a heatmap of the pairwise correlations between selected columns.

    Reads the CSV at ``file_name``, keeps the survival, class, age,
    companions, sex, embarkation and fare-bucket columns (the fare columns
    are relabelled without the space after the comma) and passes their
    ``DataFrame.corr()`` matrix to ``sns.heatmap``.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read; it must contain every column selected
        below.

    Notes
    -----
    Sets the global ``figure.figsize`` rcParam to (18, 8).
    """
    source = pd.read_csv(file_name)

    # Columns that keep their original name in the heatmap.
    kept_as_is = [
        'Survived', 'Pclass', 'Age', 'Companions',
        'Female', 'Male',
        'Embarked_C', 'Embarked_Q', 'Embarked_S',
    ]
    # CSV column name -> label shown in the heatmap.
    fare_labels = {
        'Fare_(0, 50]': 'Fare_(0,50]',
        'Fare_(50, 100]': 'Fare_(50,100]',
        'Fare_(100, 200]': 'Fare_(100,200]',
        'Fare_(200, 500]': 'Fare_(200,500]',
        'Fare_(500, 1000]': 'Fare_(500,1000]',
    }

    selected = source[kept_as_is + list(fare_labels)].rename(columns=fare_labels)

    plt.rcParams["figure.figsize"] = (18,8)
    sns.heatmap(selected.corr(), cmap='YlGnBu')

In [None]:
# Correlation heatmap for train_data_results.csv
draw_heatmap("train_data_results.csv")

In [None]:
# Correlation heatmap for test_data_results.csv
draw_heatmap("test_data_results.csv")

In [None]:
def draw_class_and_entry_place_plot(file_name):
    """Draw a grouped bar chart of passenger class per port of embarkation.

    Reads the CSV at ``file_name`` and, for each of the indicator columns
    ``Embarked_C``, ``Embarked_Q`` and ``Embarked_S`` (equal to 1), counts
    the rows with ``Pclass`` 1, 2 and 3. Every bar is annotated with its
    count and its share of all rows in the file.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read; it must contain ``Pclass`` and the
        three ``Embarked_*`` columns.

    Notes
    -----
    Draws through the pyplot interface and sets the global
    ``figure.figsize`` rcParam to (18, 8).
    """
    df = pd.read_csv(file_name)
    row_amount = len(df.index)

    # 'Cherbourg' was previously misspelled 'Cherbourhg' on the axis.
    ports = [
        ('Cherbourg', 'Embarked_C'),
        ('Queenstown', 'Embarked_Q'),
        ('Southampton', 'Embarked_S'),
    ]
    classes = [
        ('First class(Upper)', 1),
        ('Second class(Middle)', 2),
        ('Third class(Lower)', 3),
    ]
    port_labels = [label for label, _ in ports]

    bar_width = 0.30
    spc = np.arange(len(ports))

    # figure.figsize is only read when a figure is created, so it must be set
    # before the first pyplot call rather than after plt.ylabel/plt.title.
    plt.rcParams["figure.figsize"] = (18, 8)
    plt.ylabel("Amount of people", fontsize=18)
    plt.title('Correlation between Entry point and Class', fontsize=20)

    for position, (legend_label, pclass) in enumerate(classes):
        offset = position * bar_width
        counts = [np.sum((df['Pclass'] == pclass) & (df[column] == 1)) for _, column in ports]
        plt.bar(spc + offset, counts, width=bar_width, label=legend_label, edgecolor='k')
        for i, count in enumerate(counts):
            plt.annotate(preparePercentageString(count, row_amount), xy=(i + offset, count), ha='center', va='bottom', fontsize=13)

    # The three bars of a group are centred at i, i + 0.30 and i + 0.60, so the
    # group centre is i + bar_width (it used to be placed at i + bar_width / 2).
    plt.xticks(spc + bar_width, port_labels, fontsize=18)
    plt.legend(loc=2, prop={'size': 20})

In [None]:
# Entry point vs. class chart for train_data_results.csv
draw_class_and_entry_place_plot("train_data_results.csv")

In [None]:
# Entry point vs. class chart for test_data_results.csv
draw_class_and_entry_place_plot("test_data_results.csv")

In [None]:
def draw_class_and_sex_plot(file_name):
    """Draw a grouped bar chart of sex per port of embarkation.

    Despite its name, this function charts entry point against sex (see the
    plot title); passenger class is not used. It reads the CSV at
    ``file_name`` and, for each of the indicator columns ``Embarked_C``,
    ``Embarked_Q`` and ``Embarked_S`` (equal to 1), counts the rows with
    ``Female`` == 1 and with ``Male`` == 1. Every bar is annotated with its
    count and its share of all rows in the file.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read; it must contain ``Female``, ``Male``
        and the three ``Embarked_*`` columns.

    Notes
    -----
    Draws through the pyplot interface and sets the global
    ``figure.figsize`` rcParam to (18, 8).
    """
    df = pd.read_csv(file_name)
    row_amount = len(df.index)

    # The Cherbourg counts previously read 'Embarked_Q' (copy-paste slip) and
    # therefore duplicated the Queenstown bars; the label was also misspelled.
    ports = [
        ('Cherbourg', 'Embarked_C'),
        ('Queenstown', 'Embarked_Q'),
        ('Southampton', 'Embarked_S'),
    ]
    port_labels = [label for label, _ in ports]

    is_female = df['Female'] == 1
    is_male = df['Male'] == 1
    women = [np.sum(is_female & (df[column] == 1)) for _, column in ports]
    men = [np.sum(is_male & (df[column] == 1)) for _, column in ports]

    bar_width = 0.45
    spc = np.arange(len(ports))

    # figure.figsize is only read when a figure is created, so it must be set
    # before the first pyplot call rather than after plt.ylabel/plt.title.
    plt.rcParams["figure.figsize"] = (18, 8)
    plt.ylabel("Amount of people", fontsize=18)
    plt.title('Correlation between Entry point and Sex', fontsize=20)
    plt.bar(spc, women, width=bar_width, label='Women', edgecolor='k')
    plt.bar(spc + bar_width, men, width=bar_width, label='Men', edgecolor='k')
    # The tick sits midway between the two bars of each group.
    plt.xticks(spc + bar_width / 2, port_labels, fontsize=18)
    plt.legend(loc=2, prop={'size': 20})

    for i, (women_count, men_count) in enumerate(zip(women, men)):
        plt.annotate(preparePercentageString(women_count, row_amount), xy=(i, women_count), ha='center', va='bottom', fontsize=13)
        plt.annotate(preparePercentageString(men_count, row_amount), xy=(i + bar_width, men_count), ha='center', va='bottom', fontsize=13)


In [None]:
# Entry point vs. sex chart for train_data_results.csv
draw_class_and_sex_plot("train_data_results.csv")

In [None]:
# Entry point vs. sex chart for test_data_results.csv
draw_class_and_sex_plot("test_data_results.csv")