# Import Modules

In [162]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder  #for encoding the data to int values
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Reading data

In [96]:
train_data = pd.read_csv('../input/titanic/train.csv')
test_data  = pd.read_csv('../input/titanic/test.csv')

In [97]:
# checking train data first 5 rows
train_data.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# dropping unwanted features

In [98]:
train_data = train_data.drop(['Ticket', 'Cabin','PassengerId','Name'], axis=1)
test_data = test_data.drop(['Ticket', 'Cabin','PassengerId','Name'], axis=1)

# checking null values
**cabin,age and embarked are conataining some null values**

In [99]:
print("Training data :","\n\n",train_data.isnull().sum(),"\n")
print('_'*10,"Testing data",'-'*10)
test_data.isnull().sum()

Training data : 

 Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64 

__________ Testing data ----------


Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

## filling null values

### age

In [100]:
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())

### fare

In [101]:
train_data.Fare.fillna(train_data.Fare.mean(), inplace=True)
test_data.Fare.fillna(test_data.Fare.mean(), inplace=True)

### embarked

In [102]:
train_data.Embarked.fillna('S', inplace=True)
embarked_dummies = pd.get_dummies(train_data['Embarked'], prefix='Embarked')
train_data = pd.concat([train_data, embarked_dummies], axis=1)
train_data.drop('Embarked', axis=1, inplace=True)

test_data.Embarked.fillna('S', inplace=True)
embarked_dummies1 = pd.get_dummies(test_data['Embarked'], prefix='Embarked')
test_data = pd.concat([test_data, embarked_dummies1], axis=1)
test_data.drop('Embarked', axis=1, inplace=True)

# data null values are filled

In [103]:
print("Training data :","\n\n",train_data.isnull().sum(),"\n")
print('_'*10,"Testing data",'-'*10)
test_data.isnull().sum()

Training data : 

 Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64 

__________ Testing data ----------


Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

# Encode the data to feed into our algorithms
*machine only treat int and float values*
* Male-1 Female-0

In [104]:
encounder=LabelEncoder()
train_data['Sex']=encounder.fit_transform(train_data['Sex'].values)
test_data['Sex']=encounder.fit_transform(test_data['Sex'].values)

train_data.head()
# test_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


# Data Insights

* ### 38% survival rate
* ### gender ratio is also good 64%

In [105]:
train_data.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.361582,0.523008,0.381594,32.204208,0.188552,0.08642,0.725028
std,0.486592,0.836071,0.47799,13.019697,1.102743,0.806057,49.693429,0.391372,0.281141,0.446751
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,0.0,0.0,0.0
50%,0.0,3.0,1.0,28.0,0.0,0.0,14.4542,0.0,0.0,1.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0,0.0,0.0,1.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0


* ### maximum people survived are from 1 Pclass ~62%

In [42]:
train_data[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [124]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,0,0,1
1,1,1,0,38.0,1,0,71.2833,1,0,0
2,1,3,0,26.0,0,0,7.925,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,1
4,0,3,1,35.0,0,0,8.05,0,0,1


In [126]:
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,1,34.5,0,0,7.8292,0,1,0
1,3,0,47.0,1,0,7.0,0,0,1
2,2,1,62.0,0,0,9.6875,0,1,0
3,3,1,27.0,0,0,8.6625,0,0,1
4,3,0,22.0,1,1,12.2875,0,0,1


In [134]:
train= train_data
train = train.drop(['Survived'], axis=1)


In [152]:
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
clf = clf.fit(train, train_data['Survived'])

In [153]:
model = SelectFromModel(clf, prefit=True)
train_reduced = model.transform(train)
print (train_reduced.shape)

(891, 3)


In [154]:

test_reduced = model.transform(test_data)
print (test_reduced.shape)

(418, 3)


In [155]:
logreg = LogisticRegression()
logreg_cv = LogisticRegressionCV()
rf = RandomForestClassifier()
gboost = GradientBoostingClassifier()

models = [logreg, logreg_cv, rf, gboost]

In [156]:
for model in models:
    print ('Cross-validation of : {0}'.format(model.__class__))
    score = compute_score(clf=model, X=train_reduced, y=targets, scoring='accuracy')
    print ('CV score = {0}'.format(score))
    print ('****')

Cross-validation of : <class 'sklearn.linear_model._logistic.LogisticRegression'>


NameError: name 'compute_score' is not defined

In [157]:
y=train_data['Survived']
X = train

In [168]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
print(scaler.fit(X,y))

StandardScaler()


In [169]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,y, test_size=0.3,
                                                    random_state=10)

In [170]:
# Using RandomForestClassifier method of ensemble class to use Random Forest Classification algorithm

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=1,
                                    max_features=1,
                                    n_estimators = 9,
                                    random_state = 13,
    criterion='gini',
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
#     max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    verbose=0,
    warm_start=False,
    class_weight=None)
#-------------------------------------------------------------------------------------------------------
classifier.fit(X_train, Y_train)
# ---------------------------------------------------------------------------------------------------------------------
Y_pred = classifier.predict(X_test)
# ----------------------------------------------------------------------------------------------------------------------
from sklearn.metrics import accuracy_score, classification_report
print("                  Accuracy score=",accuracy_score(Y_test, Y_pred)*100,"%") #most accurate
# ------------------------------------------------------------------------------------------------------------------------
from sklearn.metrics import confusion_matrix #confusuon matrix for randomforest
pd.crosstab(Y_test, Y_pred)

                  Accuracy score= 67.53731343283582 %


col_0,0,1
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,174,0
1,87,7


In [171]:
from sklearn.linear_model import LogisticRegression
classifier1 = LogisticRegression(random_state = 0)
classifier1.fit(X_train, Y_train)
#Using KNeighborsClassifier Method of neighbors class to use Nearest Neighbor algorithm
from sklearn.neighbors import KNeighborsClassifier
classifier2 = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier2.fit(X_train, Y_train)
#Using SVC method of svm class to use Support Vector Machine Algorithm
from sklearn.svm import SVC
classifier3 = SVC(kernel = 'linear', random_state = 0)
classifier3.fit(X_train, Y_train)
# Using SVC method of svm class to use Kernel SVM Algorithm
from sklearn.svm import SVC
classifier4 = SVC(kernel = 'rbf', random_state = 1)
classifier4.fit(X_train, Y_train)
#Using GaussianNB method of naïve_bayes class to use Naïve Bayes Algorithm
from sklearn.naive_bayes import GaussianNB
classifier5 = GaussianNB()
classifier5.fit(X_train, Y_train)
#Using DecisionTreeClassifier of tree class to use Decision Tree Algorithm

from sklearn.tree import DecisionTreeClassifier
classifier6 = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier6.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


DecisionTreeClassifier(criterion='entropy', random_state=0)

In [172]:
Y_pred1 = classifier1.predict(X_test)
Y_pred2 = classifier2.predict(X_test)
Y_pred3 = classifier3.predict(X_test)
Y_pred4 = classifier4.predict(X_test)
Y_pred5= classifier5.predict(X_test)
Y_pred6 = classifier6.predict(X_test)

In [173]:
print(accuracy_score(Y_test, Y_pred1))
print(accuracy_score(Y_test, Y_pred2))
print(accuracy_score(Y_test, Y_pred3))
print(accuracy_score(Y_test, Y_pred4))
print(accuracy_score(Y_test, Y_pred5))
print(accuracy_score(Y_test, Y_pred6))

0.8097014925373134
0.7201492537313433
0.8059701492537313
0.7201492537313433
0.7798507462686567
0.7723880597014925


test

In [174]:
pred = classifier1.predict(test_data)

In [179]:
u=pd.DataFrame(pred)

In [182]:
u.c

<bound method DataFrame.count of      0
0    0
1    0
2    0
3    0
4    1
..  ..
413  0
414  1
415  0
416  0
417  0

[418 rows x 1 columns]>

In [183]:
u.to_csv('mycsvfile.csv',index=False)