In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation



In [2]:
input_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# Merging the two data frames for better account on missing values and feature engineering
df = pd.concat([input_df, test_df])
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)
df = df.reindex_axis(input_df.columns, axis=1)
df.shape
df.columns
#deal with missing values
df['Cabin'].fillna('U0',inplace=True)
fare_median = df['Fare'].dropna().median()
df['Fare'].fillna(fare_median, inplace=True)
embarked_mode = df['Embarked'].dropna().mode().values[0]
df['Embarked'].fillna(embarked_mode, inplace=True)
#df.columns[pd.isnull(df).any()]


#transform to numeric data as sklearn accepts only numeric data
import re

df['CabinLetter'] = df['Cabin'].map(lambda x : re.compile("([a-zA-Z]+)").search(x).group())
df['CabinLetter'] = pd.factorize(df['CabinLetter'])[0]

#dividing into quartiles
df['FareBin'] = pd.factorize(pd.qcut(df['Fare'], 4))[0]

#derived Variables
df['Names'] = df['Name'].map(lambda x : len(re.split(' ',x)))

# What is each person's title? 
df['Title'] = df['Name'].map(lambda x: re.compile(", (.*?)\.").findall(x)[0])
 
# Group low-occuring, related titles together
df['Title'][df.Title == 'Jonkheer'] = 'Master'
df['Title'][df.Title.isin(['Ms','Mlle'])] = 'Miss'
df['Title'][df.Title == 'Mme'] = 'Mrs'
df['Title'][df.Title.isin(['Capt', 'Don', 'Major', 'Col', 'Sir'])] = 'Sir'
df['Title'][df.Title.isin(['Dona', 'Lady', 'the Countess'])] = 'Lady'



def getTicketPrefix(ticket):
    match = re.compile("([a-zA-Z\.\/]+)").search(ticket)
    if match:
        return match.group()
    else:
        return 'U'
 
def getTicketNumber(ticket):
    match = re.compile("([\d]+$)").search(ticket)
    if match:
        return match.group()
    else:
        return '0'
    
    
df['TicketPrefix'] = df['Ticket'].map( lambda x : getTicketPrefix(x.upper()))
df['TicketPrefix'] = df['TicketPrefix'].map( lambda x: re.sub('[\.?\/?]', '', x) )
df['TicketPrefix'] = df['TicketPrefix'].map( lambda x: re.sub('STON', 'SOTON', x) )

df['TicketPrefixId'] = pd.factorize(df['TicketPrefix'])[0]

# extract the ticket number
df['TicketNumber'] = df['Ticket'].map( lambda x: getTicketNumber(x) )

# create a feature for the number of digits in the ticket number
df['TicketNumberDigits'] = df['TicketNumber'].map( lambda x: len(x) ).astype(np.int)

# create a feature for the starting number of the ticket number
df['TicketNumberStart'] = df['TicketNumber'].map( lambda x: x[0:1] ).astype(np.int)

# The prefix and (probably) number themselves aren't useful
df.drop(['TicketPrefix', 'TicketNumber'], axis=1, inplace=True)


df['Embarked'] = pd.factorize(df['Embarked'])[0]
df['Title'] = pd.factorize(df['Title'])[0]

from sklearn import linear_model

age_df = df[['Age','Pclass', 'SibSp', 'Parch', 'Fare', 'CabinLetter', 'Embarked', 'Title','Names']]
known_age = age_df.loc[df.Age.notnull()]
unknown_age = age_df.loc[df.Age.isnull()]
x = known_age.values[:,1::]
y = known_age.values[:,0]

regr =linear_model.LinearRegression()
regr.fit(x,y)

predicted_ages = regr.predict(unknown_age.values[:,1::])
df.loc[df.Age.isnull(), 'Age'] = predicted_ages
df["FamilySize"] = df["SibSp"]+ df["Parch"]

df.columns[pd.isnull(df).any()]
df['FamilyName'] = df['Name'].map(lambda x: re.compile("(.*),.*").search(x).group(1))
df['FamilyID'] = df['FamilySize'].astype(str) + df['FamilyName']
df[['Name', 'FamilyName','FamilySize','FamilyID']]

#df[df.FamilySize <=2]['FamilyID'] = 'Small'

df.loc[df.FamilySize <=3, 'FamilyID'] = 'Small'
df.columns
df['Sex'] = pd.factorize(df['Sex'])[0]
df['FamilyID'] = pd.factorize(df['FamilyID'])[0]
df.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age', u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked', u'CabinLetter', u'FareBin', u'Names', u'Title', u'TicketPrefixId', u'TicketNumberDigits', u'TicketNumberStart', u'FamilySize', u'FamilyName', u'FamilyID'], dtype='object')

In [3]:
all = df[['Survived', u'Pclass', u'Sex', u'Age', 'SibSp','Parch','Fare','Embarked',u'CabinLetter', u'FareBin', u'Title', u'TicketPrefixId', u'FamilySize', 'FamilyID']]
all.columns[pd.isnull(all).any()]
train = all[all.Survived.notnull()]
train
x = train.values[:,1::]
y = train.values[:,0]

# max_score = 0
# max_estimators = 0
# for i in range(700,1001,100):
#     alg = RandomForestClassifier(n_estimators = i, max_features = None, warm_start = True)
#     scores = cross_validation.cross_val_score(alg,x,y,cv=10)
#     print i,"The cross validation accuracy is ", scores.mean()
#     if scores.mean() > max_score:
#         max_score = scores.mean()
#         max_estimators = i
# max_estimators

In [4]:
forest =RandomForestClassifier(n_estimators = 2000, max_features = 'sqrt', warm_start = True)
forest.fit(x,y)
test = all[all.Survived.isnull()]
test.drop('Survived', axis = 1, inplace=True)
p_ids = df[df.Survived.isnull()]['PassengerId']
p_ids
survived = forest.predict(test)
survived = [int(val) for val in survived]
import numpy as np
survived = np.asanyarray(survived)
ans = pd.DataFrame(data = p_ids)
ans["Survived"] = survived
ans.to_csv("rfmodel_TS.csv",index=False)



A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
imp = forest.feature_importances_
imp = 100.0 * (imp / imp.max())
important_idx = np.where(imp > 15)[0]
important_idx = [index+1 for index in important_idx]
important_idx
columns = all.columns[important_idx]
columns, important_idx

(Index([u'Pclass', u'Sex', u'Age', u'Fare', u'CabinLetter', u'FareBin', u'Title', u'TicketPrefixId', u'FamilySize'], dtype='object'),
 [1, 2, 3, 6, 8, 9, 10, 11, 12])