In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

In [2]:
# Read data from Titanic dataset.

data = pd.read_csv('titanic_train.csv')
data1 = pd.read_csv('titanic_test.csv')
data.dtypes.sample(12)

SibSp            int64
Embarked        object
Sex             object
Cabin           object
Pclass           int64
Ticket          object
Survived         int64
Parch            int64
Fare           float64
Name            object
PassengerId      int64
Age            float64
dtype: object

In [3]:
data1.dtypes.sample(11)

SibSp            int64
Name            object
Sex             object
Age            float64
Cabin           object
PassengerId      int64
Embarked        object
Ticket          object
Parch            int64
Pclass           int64
Fare           float64
dtype: object

# LabelEncoder

In [4]:
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()
data["Sex_Binary"] = lb_make.fit_transform(data["Sex"])
data[["Sex", "Sex_Binary"]].head(11)

Unnamed: 0,Sex,Sex_Binary
0,male,1
1,female,0
2,female,0
3,female,0
4,male,1
5,male,1
6,male,1
7,male,1
8,female,0
9,female,0


In [5]:
data1["Sex_Binary"] = lb_make.fit_transform(data1["Sex"])
data1[["Sex", "Sex_Binary"]].head(11)

Unnamed: 0,Sex,Sex_Binary
0,male,1
1,female,0
2,male,1
3,male,1
4,female,0
5,male,1
6,female,0
7,male,1
8,female,0
9,male,1


In [6]:
# The columns that are inputted into our model (and later used to make predictions) are called "features." 
# In our case, those would be the columns used to determine the home price. Sometimes, you will use all columns 
# except the target as features. Other times you'll be better off with fewer features. For now, we'll build a model with 
# only a few features. Later on you'll see how to iterate and compare models built with different features.
y = data.Survived
titanic_features = ["Pclass","Age","SibSp","Parch","Fare","Sex_Binary"]
X = data[titanic_features]
X1 = data1[titanic_features]

In [7]:
# The scikit-learn library has a function train_test_split to break up the data into two pieces. We'll use some of that data 
# as training data to fit the model, and we'll use the other data as validation data to calculate mean_absolute_error
from sklearn.model_selection import train_test_split
import xgboost as xgb

# split data into training and validation data, for both features and target. The split is based on a random number generator. 
# Supplying a numeric value to the random_state argument guarantees we get the same split every time we run this script.
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state = 0)
# Define model
titanic_model_1 = xgb.XGBClassifier()
# Fit model
titanic_model_1.fit(train_X, train_y)

# get predicted prices on validation data
score_y = titanic_model_1.predict(test_X)

In [8]:
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(test_y, score_y)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))

Average precision-recall score: 0.70


In [9]:
# get predicted prices on validation data
score_y1 = titanic_model_1.predict(X1)

In [10]:
# Kaggle needs the submission to have a certain format;
# see https://www.kaggle.com/c/titanic-gettingStarted/download/gendermodel.csv
# for an example of what it's supposed to look like.
submission = pd.DataFrame({ 'PassengerId': data1['PassengerId'],
                            'Survived': score_y1 })
submission.to_csv("submission.csv", index=False)

In [12]:
pwd

'C:\\Anaconda3'

In [11]:
print(submission)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         1
5            897         0
6            898         0
7            899         0
8            900         1
9            901         0
10           902         0
11           903         0
12           904         1
13           905         0
14           906         1
15           907         1
16           908         0
17           909         0
18           910         1
19           911         0
20           912         0
21           913         1
22           914         1
23           915         0
24           916         1
25           917         0
26           918         1
27           919         0
28           920         0
29           921         0
..           ...       ...
388         1280         0
389         1281         0
390         1282         0
391         1283         1
392         1284         1
3