# Titanic Kaggle Competition

In [46]:
# Import the Pandas library
import pandas as pd

# Import the numpy library
import numpy as np

# Import 'tree' from scikit-learn library
from sklearn import tree


In [47]:
# Remote CSV locations of the Titanic training and test splits
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"

# Read each split into its own DataFrame (training first, then test)
train, test = (pd.read_csv(csv_url) for csv_url in (train_url, test_url))

In [48]:
# Show the leading rows of the training frame, followed by those of the test frame
print(train.head(), test.head(), sep="\n")

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [49]:
# Pull out the survival flag once, then slice it by sex so each group
# can be summarised both as counts and as proportions
survival = train["Survived"]
survival_male = survival[train["Sex"] == 'male']
survival_female = survival[train["Sex"] == 'female']

# All passengers: survived vs passed away, as counts and then as proportions
print(survival.value_counts())
print(survival.value_counts(normalize = True))

# Counts per sex: males first, then females
print(survival_male.value_counts())
print(survival_female.value_counts())

# Proportions per sex, in the same order
print(survival_male.value_counts(normalize=True))
print(survival_female.value_counts(normalize=True))

0    549
1    342
Name: Survived, dtype: int64
0    0.616162
1    0.383838
Name: Survived, dtype: float64
0    468
1    109
Name: Survived, dtype: int64
1    233
0     81
Name: Survived, dtype: int64
0    0.811092
1    0.188908
Name: Survived, dtype: float64
1    0.742038
0    0.257962
Name: Survived, dtype: float64


In [50]:
# Replace missing ages with the mean of the Age column
mean_age = train["Age"].mean()
train["Age"] = train["Age"].fillna(mean_age)

# Child is 0 where Age >= 18 and 1 everywhere else
is_adult = train["Age"] >= 18
train["Child"] = np.where(is_adult, 0, 1)

# Normalized survival rates, first for passengers flagged as children ...
survival_children = train["Survived"][train["Child"] == 1]
print(survival_children.value_counts(normalize = True))

# ... then for passengers aged 18 or older
survival_adults = train["Survived"][train["Child"] == 0]
print(survival_adults.value_counts(normalize = True))


1    0.539823
0    0.460177
Name: Survived, dtype: float64
0    0.638817
1    0.361183
Name: Survived, dtype: float64


In [51]:
# Encode Sex as integers (female -> 1, male -> 0).
# The dtype guard makes this cell safe to re-run: Series.map returns NaN for
# values missing from the dict, so mapping the already-encoded integers through
# the string-keyed dict a second time would wipe out the whole column.
if not pd.api.types.is_numeric_dtype(train["Sex"]):
    train["Sex"] = train["Sex"].map({'female': 1, 'male': 0})

# Impute missing Embarked values with "S", then encode the ports as integers
# (S -> 0, C -> 1, Q -> 2). Guarded for the same re-run reason as Sex.
if not pd.api.types.is_numeric_dtype(train["Embarked"]):
    train["Embarked"] = train["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})


In [52]:
# Create the target and features numpy arrays: target, features_one
target = train["Survived"].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values

# Fit the first decision tree: my_tree_one.
# random_state is pinned so that a fresh "Restart & Run All" rebuilds the same
# tree; without it scikit-learn may break ties between equally good splits
# differently on each run, changing the importances and the predictions.
my_tree_one = tree.DecisionTreeClassifier(random_state = 1)
my_tree_one = my_tree_one.fit(features_one, target)

# Importances are reported in the column order of features_one
# (Pclass, Sex, Age, Fare)
print(my_tree_one.feature_importances_)

# NOTE: this score is computed on the same rows the tree was fitted on, so it
# is training accuracy and says little about performance on unseen passengers.
print(my_tree_one.score(features_one, target))

[ 0.1242047   0.31117584  0.27397618  0.29064329]
0.979797979798


In [53]:
# Encode Sex as integers with the same codes used for the training data
# (female -> 1, male -> 0).
# The dtype guard makes this cell safe to re-run: Series.map returns NaN for
# values missing from the dict, so mapping the already-encoded integers through
# the string-keyed dict a second time would wipe out the whole column.
if not pd.api.types.is_numeric_dtype(test["Sex"]):
    test["Sex"] = test["Sex"].map({'female': 1, 'male': 0})

# Impute missing Embarked values with "S", then encode the ports as integers
# (S -> 0, C -> 1, Q -> 2). Guarded for the same re-run reason as Sex.
if not pd.api.types.is_numeric_dtype(test["Embarked"]):
    test["Embarked"] = test["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})

# Fill missing Age and Fare values with the mean of the respective test column.
# NOTE(review): these means come from the test set itself; consider imputing
# with statistics computed on the training data instead - confirm intent.
test["Age"] = test["Age"].fillna(test["Age"].mean())
test["Fare"] = test["Fare"].fillna(test["Fare"].mean())

# Create the column Child and assign 0 if >=18, or 1 otherwise
test["Child"] = np.where(test["Age"] >= 18, 0, 1)

# Extract the features from the test set in the same column order the tree
# was fitted on: Pclass, Sex, Age, and Fare.
test_features = test[["Pclass", "Sex", "Age", "Fare"]].values

# Make the prediction using the test set
my_prediction = my_tree_one.predict(test_features)

In [54]:
# Passenger ids as a plain integer array; they become the row labels below
PassengerId = test["PassengerId"].values.astype(int)

# One "Survived" column holding the predictions, indexed by passenger id
my_solution = pd.DataFrame({"Survived": my_prediction}, index = PassengerId)
print(my_solution)

# Write the solution to my_solution_one.csv, with the index column written
# under the header "PassengerId"
my_solution.to_csv("my_solution_one.csv", index_label = ["PassengerId"])


      Survived
892          0
893          0
894          1
895          1
896          1
897          0
898          0
899          0
900          1
901          0
902          0
903          0
904          1
905          1
906          1
907          1
908          0
909          1
910          1
911          0
912          0
913          1
914          1
915          1
916          1
917          0
918          1
919          1
920          1
921          0
...        ...
1280         0
1281         0
1282         0
1283         1
1284         0
1285         0
1286         0
1287         1
1288         0
1289         1
1290         0
1291         0
1292         1
1293         0
1294         1
1295         1
1296         0
1297         0
1298         0
1299         0
1300         0
1301         1
1302         0
1303         1
1304         0
1305         0
1306         1
1307         0
1308         0
1309         0

[418 rows x 1 columns]
