In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file found under the Kaggle input directory and print its full path.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Acquire data

In [None]:
# Read the Titanic train and test CSV files into DataFrames and preview the training rows.
titanic_dir = '../input/titanic'
train_df, test_df = (
    pd.read_csv(f'{titanic_dir}/{split}.csv') for split in ('train', 'test')
)
train_df.head()

In [None]:
# Show the column labels available in the training frame.
print(train_df.columns)

In [None]:



# The list holds references to the two DataFrame objects (not copies), so in-place
# column edits made while looping over `combine` also show up in train_df and test_df.
combine = [train_df,test_df]


Description of dataset:

survival:    Survival 
PassengerId: Unique Id of a passenger. 
pclass:    Ticket class     
sex:    Sex     
Age:    Age in years     
sibsp:    # of siblings / spouses aboard the Titanic     
parch:    # of parents / children aboard the Titanic     
ticket:    Ticket number     
fare:    Passenger fare     
cabin:    Cabin number     
embarked:    Port of Embarkation

# Analysing data by Pivoting

In [None]:
# Mean of Survived for each ticket class, highest first.
survival_by_pclass = train_df.groupby('Pclass')[['Survived']].mean()
survival_by_pclass.sort_values('Survived', ascending=False)

From the table above we can see that passengers in Pclass 1 had the highest survival rate.

In [None]:
# Mean of Survived for each sex, highest first.
(
    train_df.loc[:, ['Sex', 'Survived']]
    .groupby('Sex')
    .mean()
    .sort_values(by='Survived', ascending=False)
)

From the table above we can see that female passengers had the highest survival rate.

In [None]:
# Mean of Survived for each siblings/spouses count, highest first.
survival_by_sibsp = train_df.groupby('SibSp')[['Survived']].mean()
survival_by_sibsp.sort_values(by='Survived', ascending=False)

In [None]:
# Mean of Survived for each parents/children count, highest first
# (Parch is kept as a regular column instead of becoming the index).
parch_cols = ['Parch', 'Survived']
(
    train_df[parch_cols]
    .groupby('Parch', as_index=False)
    .mean()
    .sort_values('Survived', ascending=False)
)

# Analyzing by Visualizing data

## Correlating numerical features 

Let us start by understanding correlations between numerical features and our solution goal (Survived)

In [None]:
# One Age histogram (30 bins) per value of Survived, laid out as columns.
# FacetGrid.map returns the grid, so `g` still refers to the grid afterwards.
g = sns.FacetGrid(data=train_df, col='Survived').map(plt.hist, 'Age', bins=30)

Observations :


* Infants (Age <=4) had high survival rate.
* Oldest passengers (Age = 80) survived.
* Large number of 15-25 year olds did not survive.
* Most passengers are in 15-35 age range.


Decisions:

* We should consider Age (our assumption classifying #2) in our model training.
* Complete the Age feature for null values (completing #1).
* We should band age groups (creating #3)

## Correlating numerical and ordinal features

We can combine multiple features for identifying correlations using a single plot. This can be done with numerical and categorical features which have numeric values.

In [None]:
# Age histograms on a grid with Pclass as rows and Survived as columns.
facet_opts = dict(row='Pclass', col='Survived', height=2.2, aspect=1.6)
hist_opts = dict(alpha=0.5, bins=20)

grid = sns.FacetGrid(train_df, **facet_opts)
grid.map(plt.hist, 'Age', **hist_opts)
grid.add_legend()

Observations:
* Pclass=3 had most passengers, however most did not survive. Confirms our classifying assumption #2.
* Infant passengers in Pclass=2 and Pclass=3 mostly survived. Further qualifies our classifying assumption #2.
* Most passengers in Pclass=1 survived. Confirms our classifying assumption #3.
* Pclass varies in terms of Age distribution of passengers.

Decision:
* Consider Pclass for model training.


## Correlating categorical features

Now we can correlate categorical features with our solution goal.

In [None]:
# the x category is the Pclass and the hue category is the Sex. Hence you need to add
# order = [1,2,3], hue_order=["male", "female"]

# One point plot per Embarked value: Survived against Pclass, coloured by Sex.
# The x order and hue order are passed explicitly to pointplot.
point_opts = {
    'order': [1, 2, 3],
    'hue_order': ["female", "male"],
    'palette': 'deep',
}
grid = sns.FacetGrid(train_df, row='Embarked', height=2.2, aspect=1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', **point_opts)
grid.add_legend()
plt.show()


Observations:

* Female passengers had a higher survival rate than male passengers.
* Males had better survival rate in Pclass=3 when compared with Pclass=2 for C and Q ports.


Decisions: 

* Add Sex feature to model training.
* Complete and add Embarked feature to model training

### Correlating categorical and numerical feature

We may also want to correlate categorical features (with non-numeric values) and numeric features. We can consider correlating Embarked (Categorical non-numeric), Sex (Categorical non-numeric), Fare (Numeric continuous), with Survived (Categorical numeric).

In [None]:
# Bar plot of Fare against Sex, one panel per (Embarked, Survived) combination.
grid = sns.FacetGrid(train_df, col='Survived', row='Embarked', aspect=1.6, height=2.2)
bar_opts = dict(order=['male', "female"], ci=None, alpha=0.5)
grid.map(sns.barplot, 'Sex', 'Fare', **bar_opts)
grid.add_legend();

observation:
* Higher fare paying passengers had better survival. Confirms our assumption for creating (#4) fare ranges.
* Port of embarkation correlates with survival rates. Confirms correlating (#1) and completing (#2)

Decisions:
* Consider banding the Fare feature.

In [None]:
# Report the frame shapes before and after removing the Ticket and Cabin columns.
print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
# drop() returned new frames, so rebuild the list so it points at them.
combine = [train_df, test_df]

# Printed (rather than left as a bare tuple expression) so the output has the
# same format as the "Before" line.
print("After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

# Creating new feature extracting from existing

### Creating new feature extracting from existing

We want to analyze if Name feature can be engineered to extract titles and test correlation between titles and survival, before dropping Name and PassengerId features.

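In the following code we extract the Title feature using regular expressions. The RegEx pattern ` ([A-Za-z]+)\.` captures the first run of letters that follows a space and ends with a dot character within the Name feature. Because the pattern has a single capture group, the `expand=False` flag returns a Series rather than a DataFrame.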



In [None]:
# Extract the title from each Name: the run of letters that follows a space and
# is terminated by a literal dot (e.g. "Mr" in "Smith, Mr. John").
# The pattern is a raw string so that `\.` reaches the regex engine unchanged
# instead of being parsed as an (invalid) Python string escape sequence.
title_pattern = r' ([A-Za-z]+)\.'

for dataset in combine:
    # expand=False with a single capture group yields a Series.
    dataset['Title'] = dataset.Name.str.extract(title_pattern, expand=False)

# Cross-tabulate the extracted titles against Sex for the training data.
pd.crosstab(train_df['Title'], train_df['Sex'])

We can replace many titles with a more common name or classify them as `Rare`.

In [None]:
# Collapse the listed uncommon titles into 'Rare', and fold Mlle/Ms into Miss
# and Mme into Mrs, using a single replacement table.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = dict.fromkeys(rare_titles, 'Rare')
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for frame in combine:
    frame['Title'] = frame['Title'].replace(title_aliases)

# Mean of Survived for each consolidated title.
train_df.loc[:, ['Title', 'Survived']].groupby('Title').mean()

Observations:
* Most titles band Age groups accurately. For example: Master title has Age mean of 5 years.
* Survival among Title Age bands varies slightly.
* Certain titles mostly survived (Mme, Lady, Sir) or did not (Don, Rev, Jonkheer).

Decision.

* We decide to retain the new Title feature for model training.

We can convert the categorical titles to ordinal.

In [None]:
# Encode each title as an integer code; titles with no entry in the mapping become 0.
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}

for frame in combine:
    codes = frame['Title'].map(title_mapping)
    frame['Title'] = codes.fillna(0)

train_df.head()

Now we can safely drop the Name feature from the train and test datasets. We also do not need the PassengerId column in the train dataset.

In [None]:
# Remove Name from both frames; PassengerId is removed from the training
# frame only (test_df keeps it).
train_df = train_df.drop(columns=["PassengerId", "Name"])
test_df = test_df.drop(columns=['Name'])

# Rebuild the list so it refers to the newly returned frames.
combine = [train_df, test_df]



# Converting a categorical feature

Now we can convert features which contain strings to numerical values. This is required by most model algorithms. Doing so will also help us in achieving the feature completing goal.

Let us start by converting the Sex feature to numeric values in place, where female=1 and male=0.

In [None]:
# Encode Sex numerically in both frames: male -> 0, female -> 1.
sex_codes = {'female': 1, 'male': 0}

for frame in combine:
    encoded = frame['Sex'].map(sex_codes)
    frame['Sex'] = encoded.astype(int)

train_df.head()

In [None]:
### COMPLETING or replacing the NaN values with relevant values

for dataset in combine:
    # Each filled column is assigned back to the frame. Calling
    # `dataset[col].fillna(..., inplace=True)` operates on an intermediate
    # Series: pandas warns about that pattern, and with copy-on-write enabled
    # the frame itself is left unchanged.

    # complete missing Age with this frame's median Age
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    # complete missing Embarked with this frame's most frequent value
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

    # complete missing Fare with this frame's median Fare
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())


# Confirm what is still missing in each frame after the fills above.
print("Training data with null values per column: \n",train_df.isnull().sum())
print("\n")

print("testing data with null values per column: \n", test_df.isnull().sum())




Let us create Age bands and determine correlations with Survived.

In [None]:
# Truncate Age to whole years in BOTH frames (not train_df twice), so the
# band thresholds applied to `combine` afterwards treat train and test ages
# the same way (e.g. 32.5 becomes 32 in either frame before banding).
for dataset in combine:
    dataset['Age'] = dataset['Age'].astype(int)

# Cut the training ages into 5 equal-width intervals and show the mean of
# Survived within each interval, ordered by interval.
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
train_df[['AgeBand','Survived']].groupby('AgeBand',as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Let us replace Age with ordinals based on these bands.

In [None]:
# Replace Age with an ordinal band code in both frames:
# 0 for <= 16, 1 for (16, 32], 2 for (32, 48], 3 for (48, 64], 4 for > 64.
age_cutoffs = (16, 32, 48, 64)

for frame in combine:
    # Snapshot the ages so every mask is computed from the un-banded values.
    years = frame['Age'].copy()
    frame.loc[years <= age_cutoffs[0], 'Age'] = 0
    for code, (lower, upper) in enumerate(zip(age_cutoffs, age_cutoffs[1:]), start=1):
        frame.loc[(years > lower) & (years <= upper), 'Age'] = code
    frame.loc[years > age_cutoffs[-1], 'Age'] = len(age_cutoffs)


We can now remove the AgeBand feature.

In [None]:
# Remove the helper AgeBand column from the training frame and refresh the
# list so it refers to the new frame.
train_df = train_df.drop(columns=['AgeBand'])
combine = [train_df, test_df]
train_df.head()

# Create new feature combining existing features

We can create a new feature for FamilySize which combines Parch and SibSp. This will enable us to drop Parch and SibSp from our datasets

In [None]:
# FamilySize = SibSp + Parch + 1 (the extra 1 counts the passenger).
for frame in combine:
    frame['FamilySize'] = 1 + frame['SibSp'] + frame['Parch']

# Mean of Survived per family size, highest first.
family_survival = train_df.groupby('FamilySize', as_index=False)[['Survived']].mean()
family_survival.sort_values('Survived', ascending=False)

Using this we can now create new feature called IsAlone

In [None]:
# IsAlone is 1 when FamilySize equals 1 and 0 otherwise.
for frame in combine:
    frame['IsAlone'] = (frame['FamilySize'] == 1).astype('int64')

# Mean of Survived for each IsAlone value.
train_df.groupby('IsAlone', as_index=False)[['Survived']].mean()

Let us drop Parch, SibSp and FamilySize in favour of IsAlone.

In [None]:
# Remove the three family-related columns from both frames, then refresh the
# list so it refers to the new frames.
family_cols = ["Parch", "SibSp", "FamilySize"]
train_df, test_df = (frame.drop(columns=family_cols) for frame in (train_df, test_df))

combine = [train_df, test_df]
train_df.head()

1. We can also create an artificial feature combining Pclass and Age.

In [None]:
# Add the element-wise product of Age and Pclass as a new column in both frames.
for frame in combine:
    frame["Age*Class"] = frame['Age'].mul(frame['Pclass'])

train_df.loc[:, ["Age*Class", "Age", "Pclass"]].head()

# Converting Categorical feature to numeric

In [None]:
# Encode the Embarked values as integers: S -> 0, C -> 1, Q -> 2.
port_codes = {'S': 0, 'C': 1, 'Q': 2}

for frame in combine:
    frame['Embarked'] = frame['Embarked'].map(port_codes).astype(int)

train_df.head()

# Creating a new feature using Fare

In [None]:
# Split the training fares into 4 quantile-based bands and show the mean of
# Survived (the survival rate) within each band, ordered by band. mean() is
# used, as in the AgeBand table; count() would only report how many rows
# fall into each band.
train_df["FareBand"] = pd.qcut(train_df['Fare'],4)
train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

converting the Fare feature to ordinal values based on FareBand.

In [None]:
# Replace Fare with an ordinal band code in both frames:
# 0 for <= 7.91, 1 for (7.91, 14.454], 2 for (14.454, 31], 3 for > 31.
fare_cutoffs = (7.91, 14.454, 31)

for frame in combine:
    # Snapshot the fares so every mask is computed from the un-banded values.
    fare = frame['Fare'].copy()
    frame.loc[fare <= fare_cutoffs[0], 'Fare'] = 0
    for code, (lower, upper) in enumerate(zip(fare_cutoffs, fare_cutoffs[1:]), start=1):
        frame.loc[(fare > lower) & (fare <= upper), 'Fare'] = code
    frame.loc[fare > fare_cutoffs[-1], 'Fare'] = len(fare_cutoffs)
    frame['Fare'] = frame['Fare'].astype(int)

# Remove the helper FareBand column and refresh the list of frames.
train_df = train_df.drop(columns=['FareBand'])
combine = [train_df, test_df]

train_df.head(10)

# Model, predict, Solve

In [None]:
# Inputs for logistic regression: the target is the Survived column, the
# training predictors are every other train_df column, and the test
# predictors are test_df without PassengerId.
Y_train = train_df['Survived']
X_train = train_df.drop(columns='Survived')
X_test = test_df.drop(columns='PassengerId').copy()


In [None]:
# Fit on the training data, predict labels for the test data, and report the
# score on the training data itself as a percentage rounded to 2 decimals.
logreg = LogisticRegression().fit(X_train, Y_train)
y_predict = logreg.predict(X_test)
train_score = logreg.score(X_train, Y_train)
logistics_regression_acc_log = round(train_score * 100, 2)
logistics_regression_acc_log

In [None]:
# Pair each fitted coefficient with the name of the column it belongs to.
# The names are taken from X_train (the matrix passed to fit) rather than from
# train_df.columns with position 0 removed, so the pairing does not depend on
# where 'Survived' happens to sit in train_df.
# NOTE: the 'Correlation' column holds the values of logreg.coef_[0], i.e.
# model coefficients, not correlation coefficients.
coeff_df = pd.DataFrame({'Feature': list(X_train.columns)})
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

We can use Logistic Regression to validate our assumptions and decisions for feature creating and completing goals. This can be done by calculating the coefficient of the features in the decision function.

Positive coefficients increase the log-odds of the response (and thus increase the probability), and negative coefficients decrease the log-odds of the response (and thus decrease the probability).

* Sex has the highest positive coefficient, implying that as the Sex value increases (male: 0 to female: 1), the probability of Survived=1 increases the most.
* Inversely as Pclass increases, probability of Survived=1 decreases the most.
* This way Age*Class is a good artificial feature to model as it has second highest negative correlation with Survived.
* So is Title as second highest positive correlation.

In [None]:
# Build the two-column result table (PassengerId, predicted Survived) and
# write it to submission.csv without the index column.
submission = pd.DataFrame({"PassengerId": test_df["PassengerId"]})
submission["Survived"] = y_predict
submission.to_csv('submission.csv', index=False)

In [None]:
nan

In [None]:
nan