In [1]:
from pandas import read_csv
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
# Relative locations of the training and test CSV files.
trainloc = './train.csv'
testloc = './test.csv'

# Read both files into DataFrames, training file first.
train_table, test_table = (read_csv(path) for path in (trainloc, testloc))

# Show the column index of each table as the cell output.
train_table.columns, test_table.columns

(Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'),
 Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'))

# Inspecting missing values and deleting unnecessary columns

In [3]:
# Print the number of missing values per column: training table first, then a
# blank separator line, then the test table.
# The loop variables are named explicitly rather than `_`, because binding `_`
# in a notebook shadows IPython's "last output" variable for the rest of the
# session. isnull().sum() counts every column in one pass.
for column, n_missing in train_table.isnull().sum().items():
    print('%s = %d' % (column, n_missing))
print()
for column, n_missing in test_table.isnull().sum().items():
    print('%s = %d' % (column, n_missing))

PassengerId = 0
Survived = 0
Pclass = 0
Name = 0
Sex = 0
Age = 177
SibSp = 0
Parch = 0
Ticket = 0
Fare = 0
Cabin = 687
Embarked = 2

PassengerId = 0
Pclass = 0
Name = 0
Sex = 0
Age = 86
SibSp = 0
Parch = 0
Ticket = 0
Fare = 1
Cabin = 327
Embarked = 0


In [4]:
# Columns removed from both tables before modelling.
columns_to_drop = ['Embarked', 'Age', 'Cabin']

# Rebind each name to a new frame instead of deleting columns in place.
# errors='ignore' keeps the cell safe to re-run once the columns are already
# gone; the in-place `del` raised KeyError on a second execution.
train_table = train_table.drop(columns=columns_to_drop, errors='ignore')
test_table = test_table.drop(columns=columns_to_drop, errors='ignore')

In [5]:
# Encode the Sex column as integers (female -> 0, male -> 1) in both tables,
# overwriting the matching rows through boolean masks. The training table is
# processed first, then the test table.
sex_codes = (('female', 0), ('male', 1))
for table in (train_table, test_table):
    for label, code in sex_codes:
        table.loc[table.Sex == label, 'Sex'] = code

# Render the encoded test table as the cell output.
test_table

Unnamed: 0,PassengerId,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare
0,892,3,"Kelly, Mr. James",1,0,0,330911,7.8292
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,1,0,363272,7.0000
2,894,2,"Myles, Mr. Thomas Francis",1,0,0,240276,9.6875
3,895,3,"Wirz, Mr. Albert",1,0,0,315154,8.6625
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,1,1,3101298,12.2875
...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",1,0,0,A.5. 3236,8.0500
414,1306,1,"Oliva y Ocana, Dona. Fermina",0,0,0,PC 17758,108.9000
415,1307,3,"Saether, Mr. Simon Sivertsen",1,0,0,SOTON/O.Q. 3101262,7.2500
416,1308,3,"Ware, Mr. Frederick",1,0,0,359309,8.0500


In [6]:
# Column totals of SibSp and Parch over the training rows.
total_sibsp, total_parch = (train_table[col].sum() for col in ('SibSp', 'Parch'))

# NOTE(review): a notebook cell renders only its final expression, so the two
# expressions below are evaluated but never shown — confirm whether they were
# meant to be displayed.
total_sibsp, total_parch
train_table.describe()

# Row count per Survived value; this is the output the cell displays.
train_table.groupby('Survived').size()

Survived
0    549
1    342
dtype: int64

# Training and evaluating ML models

In [7]:
# Select features by column name rather than by position. The positional
# indices differed between the two tables ([0, 2, 4] for train, [0, 1, 3] for
# test) and silently depended on which columns had been deleted earlier; one
# shared name list guarantees both tables supply the same features in the
# same order.
# NOTE(review): PassengerId looks like a row identifier rather than a
# predictive feature — confirm it should stay in the feature set.
feature_columns = ['PassengerId', 'Pclass', 'Sex']

x = train_table[feature_columns]
y = train_table['Survived']

test_df = test_table[feature_columns]

In [8]:
# Hold out 10% of the rows for a final check. shuffle=False keeps the original
# row order, so the held-out rows are taken from the end of the table.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.1,
    shuffle=False,
)

In [9]:
# Candidate classifiers, each paired with the short label used in the report.
# LogisticRegression no longer receives multi_class='ovr': that parameter is
# deprecated in recent scikit-learn releases, and with the liblinear solver the
# default already resolves to one-vs-rest, so the fitted model is unchanged.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# The splitter is loop-invariant: with a fixed integer random_state it yields
# the same ten stratified folds on every call, so it is built once and shared.
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

results = []      # per-model arrays of fold accuracies
names = []        # model labels, aligned with `results`
model_mean = {}   # mean CV accuracy -> estimator
for name, model in models:
    cv_results = cross_val_score(model, xtrain, ytrain, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
    # Keyed by mean accuracy because the model-selection cell looks up the
    # maximum key of this dict.
    # NOTE(review): models with identical means overwrite one another, so a tie
    # is resolved in favour of whichever model was evaluated last.
    model_mean[cv_results.mean()] = model

LR: 0.788997 (0.038179)
LDA: 0.788997 (0.038179)
KNN: 0.570478 (0.056615)
CART: 0.705324 (0.051510)
NB: 0.788997 (0.038179)
SVM: 0.590463 (0.033957)


In [10]:
# Take the estimator stored under the highest mean CV accuracy and refit it on
# the training split.
best_model = model_mean[max(model_mean)]
best_model.fit(xtrain, ytrain)

# Predict for the held-out split first, then for the test-set features.
predictions, predictions_test_df = (
    best_model.predict(features) for features in (xtest, test_df)
)

# Show the test-set predictions together with their shape.
predictions_test_df, predictions_test_df.shape

(array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
        1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
        1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
        1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
        1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 