In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PowerTransformer

In [2]:
training_file = './input/train.csv'
training_data = pd.read_csv(training_file)

In [3]:
training_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
y = training_data.Survived

In [5]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [6]:
# I'm going to put my data adjustments here.  I'll need to use this on both the training and test data.
def adjust_data(td):
    # I'm using a convention that derived features start with lowercase letter.
    td['male'] = (td['Sex'] == 'male') * 1.0
    td['female'] = (td['Sex'] == 'female') * 1.0
    td['cherbourg'] = (td['Embarked'] == 'C') * 1.0
    td['queenstown'] = (td['Embarked'] == 'Q') * 1.0
    td['southampton'] = (td['Embarked'] == 'S') * 1.0
    td.Age.fillna(td.Age.mean(), inplace=True)
    td.Fare.fillna(td.Fare.mean(), inplace=True)
    

In [7]:
adjust_data(training_data)
training_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male,female,cherbourg,queenstown,southampton
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1.0,0.0,0.0,0.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.0,1.0,1.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.0,1.0,0.0,0.0,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.0,1.0,0.0,0.0,1.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1.0,0.0,0.0,0.0,1.0


In [8]:
training_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
male             0
female           0
cherbourg        0
queenstown       0
southampton      0
dtype: int64

In [9]:
# declare features to be used in model
features = ['PassengerId', 'Pclass', 'SibSp', 'Parch', 'Age', 'Fare', 'male', 
            'female', 'cherbourg', 'queenstown', 'southampton']
# Age Fare excluded for now due to NaN
X = training_data[features]
print('training size', training_data.shape)
print('filtered size', X.shape)
X.head()

training size (891, 17)
filtered size (891, 11)


Unnamed: 0,PassengerId,Pclass,SibSp,Parch,Age,Fare,male,female,cherbourg,queenstown,southampton
0,1,3,1,0,22.0,7.25,1.0,0.0,0.0,0.0,1.0
1,2,1,1,0,38.0,71.2833,0.0,1.0,1.0,0.0,0.0
2,3,3,0,0,26.0,7.925,0.0,1.0,0.0,0.0,1.0
3,4,1,1,0,35.0,53.1,0.0,1.0,0.0,0.0,1.0
4,5,3,0,0,35.0,8.05,1.0,0.0,0.0,0.0,1.0


In [10]:
X.isnull().sum()

PassengerId    0
Pclass         0
SibSp          0
Parch          0
Age            0
Fare           0
male           0
female         0
cherbourg      0
queenstown     0
southampton    0
dtype: int64

In [11]:
# transform data to gaussian normalized distribution
transformer = PowerTransformer()
X_norm = transformer.fit_transform(X)
X_norm[0:5]

array([[-2.14814111,  0.86397329,  1.37363642, -0.56025296, -0.55626693,
        -0.87882001,  0.73769513, -0.73769513, -0.48204268, -0.30756234,
         0.61930636],
       [-2.13183003, -1.45912855,  1.37363642, -0.56025296,  0.65726854,
         1.33665056, -1.35557354,  1.35557354,  2.0745051 , -0.30756234,
        -1.61470971],
       [-2.117043  ,  0.86397329, -0.67984996, -0.56025296, -0.24157742,
        -0.79006485, -1.35557354,  1.35557354, -0.48204268, -0.30756234,
         0.61930636],
       [-2.10329488, -1.45912855,  1.37363642, -0.56025296,  0.43792346,
         1.06735243, -1.35557354,  1.35557354, -0.48204268, -0.30756234,
         0.61930636],
       [-2.09032172,  0.86397329, -0.67984996, -0.56025296,  0.43792346,
        -0.77443897,  0.73769513, -0.73769513, -0.48204268, -0.30756234,
         0.61930636]])

In [12]:
# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X_norm, y, random_state=1)

In [13]:
model = LogisticRegression(tol=1e-5, random_state=1, solver='liblinear', max_iter=200, verbose=1)

In [14]:
model.fit(train_X, train_y)

[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='liblinear',
          tol=1e-05, verbose=1, warm_start=False)

In [15]:
predictions = model.predict(val_X)

In [16]:
error = mean_squared_error(predictions, val_y)
print('Error is ', error)

Error is  0.21524663677130046


In [25]:
# For first submission without age or fare, error is 0.22869955156950672
# Second submission using average values for missing data in Fare and Age, error is 0.2062780269058296.  However, this performed worse on the test data set.
# Third submission transforms data before processing.  error is 0.21524663677130046.  This solution improved ranking by 772 spots

In [18]:
correct = (val_y == predictions).count()
print('Correct predictions ', correct, ' out of ', predictions.size)

Correct predictions  223  out of  223


In [19]:
# now do the test predictions
test_file = './input/test.csv'
test_data = pd.read_csv(test_file)
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [20]:
adjust_data(test_data)
test_X = test_data[features]
test_X_norm = transformer.fit_transform(test_X)
test_X_norm[0:5]

array([[-1.75656045,  0.9068881 , -0.68926091, -0.53844702,  0.3714598 ,
        -0.90470822,  0.75592895, -0.75592895, -0.56814154,  2.84375747,
        -1.35067551],
       [-1.74783791,  0.9068881 ,  1.39205306, -0.53844702,  1.29336659,
        -1.0439114 , -1.32287566,  1.32287566, -0.56814154, -0.35164743,
         0.74037028],
       [-1.73911782, -0.4895492 , -0.68926091, -0.53844702,  2.33617511,
        -0.6458419 ,  0.75592895, -0.75592895, -0.56814154,  2.84375747,
        -1.35067551],
       [-1.73040017,  0.9068881 , -0.68926091, -0.53844702, -0.21372555,
        -0.78074722,  0.75592895, -0.75592895, -0.56814154, -0.35164743,
         0.74037028],
       [-1.72168495,  0.9068881 ,  1.39205306,  1.80663014, -0.62188168,
        -0.36721547, -1.32287566,  1.32287566, -0.56814154, -0.35164743,
         0.74037028]])

In [21]:
test_y = model.predict(test_X_norm)

In [22]:
output = pd.DataFrame({'PassengerId': test_X.PassengerId,
                       'Survived': test_y})
output.to_csv('submission.csv', index=False)

In [24]:
output[0:10]

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
