# Titanic example

- Survived - 생존 여부 (0 = 사망, 1 = 생존)
- Pclass - 티켓 클래스 (1 = 1등석, 2 = 2등석, 3 = 3등석)
- Sex - 성별
- Age - 나이
- SibSp - 함께 탑승한 형제자매 / 배우자 의 수
- Parch - 함께 탑승한 부모님 / 아이들 의 수
- Ticket - 티켓 번호
- Fare - 탑승 요금
- Cabin - 객실 번호
- Embarked - 선착장 (C = Cherbourg, Q = Queenstown, S = Southampton)

In [None]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Apply seaborn's default styling to the plots drawn later in the notebook.
sns.set()

# Load datasets
# "train.csv" is a relative path, so it is resolved against the working directory.
train = pd.read_csv("train.csv")

In [None]:
# Dimensions of the loaded dataset as (rows, columns).
train.shape

In [None]:
# Count the missing (null) values in every column.
train.isnull().sum()

# Missing values reported for this dataset:
# ['Age'] is missing 177 values
# ['Cabin'] is missing 687 values
# ['Embarked'] is missing 2 values

In [None]:
# Impute missing ages with the mean age of passengers of the same sex.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection: that form is chained assignment,
# which emits a FutureWarning in pandas 2.x and leaves `train` unchanged
# once copy-on-write is enabled.
train["Age"] = train["Age"].fillna(
    train.groupby("Sex")["Age"].transform("mean")
)

In [None]:
# Re-count missing values per column after the 'Age' imputation.
train.isnull().sum()

In [None]:
def visualization(feature):
    """Draw a stacked bar chart of `feature` value counts split by survival.

    Reads the module-level `train` DataFrame. One bar is drawn for rows with
    Survived == 1 and one for rows with Survived == 0; each bar is stacked by
    the distinct values of `feature`.

    Args:
        feature: Name of the column in `train` to count.
    """
    survived_mask = train['Survived'] == 1
    dead_mask = train['Survived'] == 0

    survived_counts = train.loc[survived_mask, feature].value_counts()
    dead_counts = train.loc[dead_mask, feature].value_counts()

    # One row per outcome; columns are the distinct values of the feature.
    visual_df = pd.DataFrame([survived_counts, dead_counts])
    visual_df.index = ['Survived', 'Dead']

    visual_df.plot(
        kind='bar',
        stacked=True,
        figsize=(12, 5),
        title=feature,
    )

In [None]:
visualization('Pclass')
# Observations from the chart:
# Survived : Likely to have lived when Pclass == 1
# Dead : Likely to have died when Pclass == 3

In [None]:
visualization('Sex')
# Observations from the chart:
# Survived : Likely to have lived when female
# Dead : Likely to have died when male

In [None]:
visualization('Embarked')
# Observations from the chart, by port of embarkation:
# Embarked == S : More likely to have died
# Embarked == C : More likely to have survived
# Embarked == Q : More likely to have died

In [None]:
# Remove the 'Cabin' and 'Embarked' columns; they are not used as features.
train = train.drop(columns=['Cabin', 'Embarked'])

In [None]:
# Count missing values per column again, now that the columns were dropped.
train.isnull().sum()

In [None]:
# Encode 'Sex' numerically: male -> 0, female -> 1.
sex_mapping = dict(male=0, female=1)
train['Sex'] = train['Sex'].map(sex_mapping)

In [None]:
# Features: everything except identifiers, the target and the unused columns.
train_features = train.drop(
    columns=[
        'PassengerId',
        'Survived',
        'Name',
        'Ticket',
        'Parch',
        'Fare',
        'SibSp',
    ]
)
# Labels: the survival outcome column.
train_labels = train.loc[:, 'Survived']

In [None]:
# Display the feature columns that remain after the drop.
train_features

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible.
num_test = 0.30
train_features, test_features, train_labels, test_labels = train_test_split(
    train_features,
    train_labels,
    test_size=num_test,
    random_state=23,
)

## Logistic regression

In [None]:
# Standardization: the scaler is fitted on the training features only and the
# same fitted transformation is then applied to both splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression classifier with scikit-learn's default settings
# on the standardized training features.
model = LogisticRegression()
model.fit(train_features, train_labels)

In [None]:
# Accuracy of the fitted model on the training split.
print(model.score(train_features, train_labels))

In [None]:
# Learned coefficients of the logistic regression, one per input feature.
model.coef_

In [None]:
# Accuracy of the fitted model on the held-out test split.
print(model.score(test_features, test_labels))

In [None]:
# Show the standardized training features as a table.
# The table gets its own name so the original `train` DataFrame is not
# overwritten (visualization() and the earlier cells still depend on it).
# Column labels assume the three features left after the drop are
# Pclass, Sex and Age, in that order, matching the sample passengers below.
scaled_train = pd.DataFrame(train_features, columns=['Pclass', 'Sex', 'Age'])
scaled_train

- New data prediction 

In [None]:
import numpy as np
# Hypothetical passengers to classify. Each vector has three values, assumed
# to follow the training feature order [Pclass, Sex, Age], with Sex encoded
# as 0 = male, 1 = female.
Jack = np.array([3, 0.0, 20.0])
Rose = np.array([1, 1.0, 17.0])
ME = np.array([1, 0.0, 32.0])
# Stack the three vectors into a (3, 3) matrix, one passenger per row.
sample_passengers = np.stack((Jack, Rose, ME))

In [None]:
# Apply the scaler fitted on the training features to the new samples.
# NOTE(review): this overwrites sample_passengers in place, so running the
# cell twice scales the already-scaled values again.
sample_passengers = scaler.transform(sample_passengers)

In [None]:
# Predicted class label for each sample passenger (0 = died, 1 = survived).
print(model.predict(sample_passengers))

In [None]:
# Per-class probabilities for each sample passenger; columns follow the
# order of model.classes_.
print(model.predict_proba(sample_passengers))

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
# Train a k-nearest-neighbours classifier for k = 1..10 and compare the
# accuracy each one reaches on the held-out test split.
a_index = list(range(1, 11))
x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
accuracies = []
for i in a_index:
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(train_features, train_labels)
    prediction = model.predict(test_features)
    # accuracy_score expects (y_true, y_pred).
    accuracies.append(metrics.accuracy_score(test_labels, prediction))
# Series.append was removed in pandas 2.0, so the scores are collected in a
# list and the Series is built once, indexed by the k that produced each score.
a = pd.Series(accuracies, index=a_index, dtype=float)
plt.plot(a_index, a.values)
plt.xticks(x)
fig = plt.gcf()
fig.set_size_inches(12, 6)
plt.show()
print('Accuracy of different values of n are : ',a.values,'with the max value as ',a.values.max())

## Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default settings, scored on the held-out split.
model = GaussianNB().fit(train_features, train_labels)
prediction = model.predict(test_features)
print(
    'Accuracy of the NaiveBayes is ',
    metrics.accuracy_score(prediction, test_labels),
)

## Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees, scored on the held-out split.
# random_state is fixed to the same seed as the train/test split so the
# reported accuracy is reproducible from run to run.
model = RandomForestClassifier(n_estimators=100, random_state=23)
model.fit(train_features, train_labels)
prediction = model.predict(test_features)
# accuracy_score expects (y_true, y_pred).
print('Accuracy of the Random Forest is ',metrics.accuracy_score(test_labels, prediction))

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with default settings, scored on the held-out split.
# random_state is fixed to the same seed as the train/test split so the
# reported accuracy is reproducible from run to run.
model = DecisionTreeClassifier(random_state=23)
model.fit(train_features, train_labels)
prediction = model.predict(test_features)
# accuracy_score expects (y_true, y_pred).
print('Accuracy of the Decision Tree is ',metrics.accuracy_score(test_labels, prediction))

## Support vector machine

In [None]:
from sklearn import svm #Support Vector Machine
# Support vector classifier with an RBF kernel (C=1, gamma=0.1), scored on
# the held-out split.
model = svm.SVC(C=1, gamma=0.1, kernel='rbf').fit(train_features, train_labels)
prediction = model.predict(test_features)
print(
    'Accuracy of rbf SVM is ',
    metrics.accuracy_score(prediction, test_labels),
)