# Kaggle Titanic Competition Model Iteration 1

Before everything else, imports!

In [1]:
import pandas
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

First, let's load in all of the data.

In [2]:
titanic = pandas.read_csv("train.csv")

titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Next, let's clean up the dataset by doing a few things:

1. Replacing all NaN values in the 'Age' column with the median age
2. Converting all desired non-numeric columns to numeric values - 'Sex' and 'Embarked'

In [3]:
# Replace 'Age' NaN with median age
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

# Replace 'male' 'Sex' with 0, 'female' 'Sex' with 1
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1

# Replace 'Embarked' NaN with most popular 'Embarked', 'S'
titanic['Embarked'] = titanic['Embarked'].fillna('S')

# Replace 'Embarked' values with numeric values 0-2
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2

Time to do some cross-validation and make some predictions!

In [4]:
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

alg = LinearRegression()


predictions = []
for train,test in kf:
    train_predictors = (titanic[predictors].iloc[train,:])
    train_target = titanic['Survived'].iloc[train]
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(titanic[predictors].iloc[test,:])
    predictions.append(test_predictions)

Now let's map the predictions to actual outcomes on the border of 0.5

In [5]:
predictions = np.concatenate(predictions, axis=0)

predictions[predictions > .5] = 1
predictions[predictions < .5] = 0

accuracy = sum(predictions[predictions == titanic['Survived']]) / len(predictions)

accuracy



0.78338945005611671

Logistic regression time!

In [6]:
alg = LogisticRegression(random_state=1)

scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)

print scores.mean()

0.787878787879


Now it's time to do some cleaning on the test set as well!

In [7]:
titanic_test = pandas.read_csv("test.csv")

titanic_test['Age'] = titanic_test['Age'].fillna(titanic['Age'].median())

titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1

titanic_test["Embarked"].fillna('S')

titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())

Generating a submission file!

In [8]:
alg = LogisticRegression(random_state=1)

alg.fit(titanic[predictors], titanic['Survived'])

predictions = alg.predict(titanic_test[predictors])

submission = pandas.DataFrame({
        'PassengerId': titanic_test['PassengerId'],
        'Survived': predictions
    })

submission.to_csv('kaggleOriginal.csv', index=False)

My initial submission received a score of 0.75120


Time to do better!

A couple of things I want to try are:

- Altering the 'Embarked' variable by splitting it into three columns of 1 or 0
- This may take more work, but I'm also interested in trying out a random forest - seems like there's some talk on Kaggle about this model being effective.

To start off, I'll make three separate binary categories for the three 'Embarked' options.

In [9]:
titanic.loc[titanic["Embarked"] == 0, "Embarked_S"] = 1
titanic["Embarked_S"] = titanic["Embarked_S"].fillna(0)

titanic.loc[titanic["Embarked"] == 1, "Embarked_C"] = 1
titanic["Embarked_C"] = titanic["Embarked_C"].fillna(0)

titanic.loc[titanic["Embarked"] == 1, "Embarked_Q"] = 1
titanic["Embarked_Q"] = titanic["Embarked_Q"].fillna(0)

Now let's get onto some modeling! I'll repeat the same processes above, but with different predictors now

In [10]:
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_S', 'Embarked_C', 'Embarked_Q']

alg = LogisticRegression(random_state=1)

scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)

print scores.mean()

0.785634118967


Hmm.. Not super promising, but I'll make a submission anyway! First, I need to make the same alterations to the test_set as I did to the train set.

In [11]:
titanic_test = pandas.read_csv("test.csv")

titanic_test['Age'] = titanic_test['Age'].fillna(titanic['Age'].median())

titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1

titanic_test["Embarked"].fillna('S')

titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())

titanic_test.loc[titanic_test["Embarked"] == 0, "Embarked_S"] = 1
titanic_test["Embarked_S"] = titanic_test["Embarked_S"].fillna(0)

titanic_test.loc[titanic_test["Embarked"] == 1, "Embarked_C"] = 1
titanic_test["Embarked_C"] = titanic_test["Embarked_C"].fillna(0)

titanic_test.loc[titanic_test["Embarked"] == 1, "Embarked_Q"] = 1
titanic_test["Embarked_Q"] = titanic_test["Embarked_Q"].fillna(0)

Now that that's done, submission time! Not sure how this will go.

In [12]:
alg = LogisticRegression(random_state=1)

alg.fit(titanic[predictors], titanic['Survived'])

predictions = alg.predict(titanic_test[predictors])

submission = pandas.DataFrame({
        'PassengerId': titanic_test['PassengerId'],
        'Survived': predictions
    })

submission.to_csv('kaggle.csv', index=False)

I'm going to try something new - adding another predictor that will combine into a new 'family' column. I'm going to drop my strategy I used last time with the splitting of the embarkation category, because that clearly didn't work very well. 

In [13]:
titanic = pandas.read_csv("train.csv")

titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
# Replace all the occurences of male with the number 0.
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0

# Replace all the occurences of female with the number 1.
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1

titanic["Embarked"] = titanic["Embarked"].fillna("S")

titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2

titanic["Family"] = titanic["SibSp"] + titanic["Parch"] +1

# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "Family"]

# Initialize our algorithm
alg = LogisticRegression(random_state=1)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print scores.mean()

0.791245791246


Hmm, looks like it's marginally better than before. Good I suppose. Time to submit the hell out of this.

In [14]:
titanic_test = pandas.read_csv("test.csv")
titanic_test["Age"] = titanic_test["Age"].fillna(titanic_test["Age"].median())
# Replace all the occurences of male with the number 0.
titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0

# Replace all the occurences of female with the number 1.
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1

titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())

titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")

titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

titanic_test["Family"] = titanic_test["SibSp"] + titanic_test["Parch"] +1

# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "Family"]

# Initialize the algorithm class
alg = LogisticRegression(random_state=1)

# Train the algorithm using all the training data
alg.fit(titanic[predictors], titanic["Survived"])

# Make predictions using the test set.
predictions = alg.predict(titanic_test[predictors])

# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pandas.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })
submission.to_csv("kaggle.csv", index=False)

Despite the improvement in the training, this actually produced a worse score - 0.74163 - not sure what made this happen.

The last thing I'd like to try is a random forest - I'm hoping this model will produce better results, though I'm not sure how much it'll actually improve. 

In [23]:
# Probably should have done something like this much earlier, but here's a generic dataframe cleaner

from sklearn.ensemble import RandomForestClassifier


def df_cleaner(titanic):
    
    titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
    titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())
    # Replace all the occurences of male with the number 0.
    titanic.loc[titanic["Sex"] == "male", "Sex"] = 0

    # Replace all the occurences of female with the number 1.
    titanic.loc[titanic["Sex"] == "female", "Sex"] = 1

    titanic["Embarked"] = titanic["Embarked"].fillna("S")

    titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
    titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
    titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2

    return titanic

 # Read in the training data.
train = pandas.read_csv('train.csv')

# Set sampling.
sampling = .5

# Clean it up.
# Remove unused columns, clean age, and convert gender to binary column.
train.drop(['Name', 'PassengerId', 'Ticket', 'Cabin'], axis=1, inplace=True)
train = df_cleaner(train)

predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

alg = RandomForestClassifier(random_state=1, n_estimators=300, min_samples_split=4, min_samples_leaf=2)

scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)

print "Score: ", scores.mean()

Score:  0.821548821549


Looking a little better! Now let's do some submitting! Woo!

Upon submission, I seem to have stumbled upon a really strange error that I can't figure out - all submissions are now receiving the same score of .71463 I don't know what to about this, and I can't figure out for the life of me what's causing this issue. Frustrating

In [24]:
# Read in the test data.
testdata = pandas.read_csv('test.csv')

test = testdata.drop(['Name', 'PassengerId', 'Ticket', 'Cabin'], axis=1)
test = df_cleaner(test)

output = alg.predict(test)

# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pandas.DataFrame({
        "PassengerId": testdata["PassengerId"],
        "Survived": output
    })
submission.to_csv("kaggleForestWHY.csv", index=False)

NotFittedError: Estimator not fitted, call `fit` before exploiting the model.