In [23]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import ensemble
%matplotlib inline  

In [7]:
# Load the Titanic training data (first CSV row is the header) and preview
# the first rows.
titanic_df = pd.read_csv('train.csv',header=0)
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Show each column's dtype and non-null count to spot columns with missing values.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [9]:
# Fill the missing Age values with the median age.
# Work on an independent copy: a plain assignment would only alias titanic_df,
# so the fillna below would overwrite the raw frame as well.
cleaned_df = titanic_df.copy()
cleaned_df['Age'] = cleaned_df['Age'].fillna(cleaned_df['Age'].median())
# Summary statistics for passengers older than 60.
cleaned_df.loc[cleaned_df['Age'] > 60, ['Age','Sex','Pclass','Survived']].describe()

Unnamed: 0,Age,Pclass,Survived
count,22.0,22.0,22.0
mean,66.022727,1.590909,0.227273
std,5.024884,0.854071,0.428932
min,61.0,1.0,0.0
25%,62.0,1.0,0.0
50%,64.5,1.0,0.0
75%,70.0,2.0,0.0
max,80.0,3.0,1.0


In [10]:
# Of the passengers older than 60, only about 22% survived.
# Next, count how many men and women travelled in each passenger class.
male_rows = cleaned_df['Sex'] == 'male'
female_rows = cleaned_df['Sex'] == 'female'
for pclass in (1, 2, 3):
    class_rows = cleaned_df['Pclass'] == pclass
    print (pclass, ' male ' , len(cleaned_df[male_rows & class_rows]))
    print (pclass, 'female' , len(cleaned_df[female_rows & class_rows]))

(1, ' male ', 122)
(1, 'female', 94)
(2, ' male ', 108)
(2, 'female', 76)
(3, ' male ', 347)
(3, 'female', 144)


In [11]:
# Embarked has missing values too; first count the passengers per port code.
for port in ('S', 'C', 'Q'):
    boarded_here = cleaned_df['Embarked'] == port
    print (port, len(cleaned_df[boarded_here]))

('S', 644)
('C', 168)
('Q', 77)


In [12]:
# 'S' is by far the most common port, so use it to fill the two missing
# Embarked values.
cleaned_df['Embarked'] = cleaned_df['Embarked'].fillna('S')

In [13]:
# Compare the proportion of men and women who survived.
# Each condition is built as its own boolean mask: `&` binds tighter than `==`,
# so an unparenthesised `mask & col == 1` is evaluated as `(mask & col) == 1`
# and only gives the intended answer when the column happens to hold 0/1.
is_male = cleaned_df['Sex'] == 'male'
is_female = cleaned_df['Sex'] == 'female'
has_survived = cleaned_df['Survived'] == 1
total_male = len(cleaned_df[is_male])
total_female = len(cleaned_df[is_female])
num_males_survived = len(cleaned_df[is_male & has_survived])
num_females_survived = len(cleaned_df[is_female & has_survived])
# float() keeps the division from truncating under Python 2.
print (num_males_survived/float(total_male) * 100 ,'% of males survived')
print (num_females_survived/float(total_female) * 100 ,'% of females survived')

(18.890814558058924, '% of males survived')
(74.20382165605095, '% of females survived')


In [14]:
# String columns are hard to use in ML models, so encode 'Sex' as a new
# integer column 'Gender': female -> 0, male -> 1.
cleaned_df['Gender'] = cleaned_df['Sex'].map({'female':0, 'male':1}).astype(int)

In [15]:
# Combine Parch (parents and children) and SibSp (siblings) into a single
# 'Family' column holding their sum.
cleaned_df['Family'] = cleaned_df['Parch'] + cleaned_df['SibSp']

In [16]:
# List the columns whose dtype is 'object' (strings); these cannot be fed to
# the models as they are.
column_dtypes = cleaned_df.dtypes
column_dtypes[column_dtypes == 'object']

Name        object
Sex         object
Ticket      object
Cabin       object
Embarked    object
dtype: object

In [17]:
# Build the training frame: drop the object-typed columns, and also SibSp and
# Parch because they are already summed into Family.
dropped_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'SibSp', 'Parch']
train_data = cleaned_df.drop(dropped_columns, axis=1)

In [18]:
# Load the test data; it has to be prepared in the same format as train_data.
titanic_test_df = pd.read_csv('test.csv',header=0)
titanic_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [19]:
# The test set has 418 rows but only 332 non-null Age values (and one missing Fare).
# It needs the same preparation as the training data -- fill the gaps, encode Sex,
# merge SibSp and Parch -- so wrap those steps in a reusable function.
def clean_up_df(df):
    """Return a cleaned, model-ready copy of a Titanic passenger DataFrame.

    Steps applied:
      * Age    -- missing values are filled with the column median.
      * Fare   -- missing values are filled with the column mean.
      * Gender -- new integer column derived from Sex (female -> 0, male -> 1).
      * Family -- new column holding Parch + SibSp.
      * SibSp, Parch, Sex, Name, Cabin, Embarked and Ticket are dropped.

    The caller's DataFrame is not modified; all changes are made on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Age, Fare, Sex, SibSp, Parch, Name, Cabin,
        Embarked and Ticket. Any other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns listed above dropped and the Gender and
        Family columns appended.

    Notes
    -----
    A Sex value other than 'female' or 'male' maps to NaN, which makes the
    integer cast fail instead of silently producing a bad encoding.
    """
    # Copy first so the column assignments below never touch the caller's frame.
    cleaned = df.copy()
    cleaned['Age'] = cleaned['Age'].fillna(cleaned['Age'].median())
    cleaned['Gender'] = cleaned['Sex'].map({'female':0, 'male':1}).astype(int)
    cleaned['Family'] = cleaned['Parch'] + cleaned['SibSp']
    cleaned['Fare'] = cleaned['Fare'].fillna(cleaned['Fare'].mean())
    return cleaned.drop(['SibSp','Parch','Sex','Name','Cabin','Embarked','Ticket'],axis=1)

In [20]:
# Clean the test data with the shared helper and confirm no nulls remain.
test_df = clean_up_df(titanic_test_df)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Age            418 non-null float64
Fare           418 non-null float64
Gender         418 non-null int64
Family         418 non-null int64
dtypes: float64(2), int64(4)
memory usage: 19.7 KB


In [24]:
# Logistic Regression
# Target is the Survived column; features are every remaining column except
# the passenger identifier.
y = train_data['Survived']
X = train_data.drop(['PassengerId', 'Survived'], axis=1)

logistic = linear_model.LogisticRegression()
logistic.fit(X, y)
# Mean accuracy measured on the same rows the model was fitted on.
logistic.score(X, y)

0.80134680134680136

In [26]:
# Test features: everything except the passenger identifier, leaving the same
# five columns the models were fitted on.
X_test = test_df.drop('PassengerId', axis=1)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 5 columns):
Pclass    418 non-null int64
Age       418 non-null float64
Fare      418 non-null float64
Gender    418 non-null int64
Family    418 non-null int64
dtypes: float64(2), int64(3)
memory usage: 16.4 KB


In [27]:
# Predict survival for the test passengers with the fitted logistic model.
y_pred = logistic.predict(X_test)

In [28]:
# Random Forests
# Fix the seed so the fitted forest -- and therefore the predictions stored in
# y_pred -- are the same on every run of the notebook.
random_forest = ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X,y)
y_pred = random_forest.predict(X_test)
# Mean accuracy on the training rows themselves, not a held-out estimate.
random_forest.score(X,y)

0.97979797979797978

In [29]:
# Pair each test PassengerId with its predicted Survived value and write the
# two-column result to titanic.csv without the index.
submission = test_df[['PassengerId']].assign(Survived=y_pred)
submission.to_csv('titanic.csv', index=False)