In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.neighbors import KNeighborsRegressor

RANDOM_SEED = 43

In [2]:

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


## Reading The Data

In [3]:
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')
train_data.shape

(891, 12)

In [4]:
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
test_data.shape

(418, 11)

In [5]:
gender_sub = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
gender_sub.shape

(418, 2)

## Cleaning The Data

### Dropping and standardising columns

In [6]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# We don't need name, Passenger Id and Ticket
train_data = train_data.drop(columns=['Name', 'Ticket'])
test_data = test_data.drop(columns=['Name', 'Ticket'])

In [8]:
print("Null Check:")
print("Gender Sub: ")
print(gender_sub.isnull().sum())

print()
print("Train: ")
print(train_data.isnull().sum())

print()
print("Test: ")
print(test_data.isnull().sum())



Null Check:
Gender Sub: 
PassengerId    0
Survived       0
dtype: int64

Train: 
PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Test: 
PassengerId      0
Pclass           0
Sex              0
Age             86
SibSp            0
Parch            0
Fare             1
Cabin          327
Embarked         0
dtype: int64


#### Standardizing columns

In [9]:
# Standardizing Sex, Embarked and Cabin
cols = ['Sex', 'Embarked', 'Cabin']
train_data[cols] = train_data[cols].apply(LabelEncoder().fit_transform)
test_data[cols] = test_data[cols].apply(LabelEncoder().fit_transform)


### Looks like age, cabin and embark are the data points that need to be cleaned.
##### We will use KNN to predict age and cabin in the train dataset to make the data whole.

In [10]:
# Filling embarked and cabin
# We will just forward fill the values for embark as only 2 are missing
train_data['Embarked'] = train_data['Embarked'].ffill()

# We will backward fill the values for cabin
train_data['Cabin'] = train_data['Cabin'].bfill()
train_data['Cabin'] = train_data['Cabin'].ffill()
print(train_data['Embarked'].isnull().sum())
print(train_data['Cabin'].isnull().sum())


0
0


In [11]:
# Filling in Age

with_age = train_data.dropna(subset=['Age'])
miss_age = train_data[train_data['Age'].isnull()]
miss_age.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
5,6,0,3,1,,0,0,8.4583,147,1
17,18,1,2,1,,0,0,13.0,147,2
19,20,1,3,0,,0,0,7.225,147,0
26,27,0,3,1,,0,0,7.225,147,0
28,29,1,3,0,,0,0,7.8792,147,1


In [12]:
# Setting age
x = with_age.drop(columns=['Age'])
y = with_age['Age']
# x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=RANDOM_SEED)
knn = KNeighborsRegressor(weights='distance')
knn.fit(x, y)
predicted_age = knn.predict(miss_age.drop(columns=['Age']))

miss_age['Age'] = predicted_age
miss_age['Age'][:].round(0)

final_df = pd.concat([with_age, miss_age], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  miss_age['Age'] = predicted_age


In [13]:
women = train_data.loc[train_data.Sex == 0]["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

men = train_data.loc[train_data.Sex == 1]["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

% of women who survived: 0.7420382165605095
% of men who survived: 0.18890814558058924


In [14]:
from sklearn.ensemble import RandomForestClassifier

y = final_df['Survived']

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(final_df[features])

X_test = pd.get_dummies(test_data[features])
Y_test = gender_sub['Survived']

model = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)
score = accuracy_score(Y_test, predictions)
    
print(f'Accuracy Score: {score}' )

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})

print(output)

output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Accuracy Score: 0.9473684210526315
     PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
Your submission was successfully saved!


In [15]:
def create_tree(train_data, test_data, alpha_val, max_depth):
    y = train_data['Survived']

    features = ["Pclass", "Sex", "SibSp", "Parch"]
    X = pd.get_dummies(train_data[features])

    X_test = pd.get_dummies(test_data[features])
    
    dt = tree.DecisionTreeClassifier(random_state=42, ccp_alpha=alpha_val, max_depth=max_depth)
    dt.fit(X, y)
    predictions = dt.predict(X_test)
    
    output = pd.DataFrame({'PassengerId' : test_data.PassengerId, 'Survived': predictions})
    
    score = accuracy_score(Y_test, predictions)
    
    print(f'Accuracy Score: {score}' )
    
    output.to_csv('submission.csv', index=False)
    print("Your submission was successfully saved!")


In [16]:
# create_tree(final_df, test_data, alpha_val=.1, max_depth=10)