In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from itertools import product

## Get Dataset

In [None]:
def get_classification(i,j,stepsize, X, y, classifiers,texts):
  """Plot the decision regions of fitted classifiers on an ``i`` x ``j`` grid of axes.

  Only the first two columns of ``X`` are used: they define the plotting
  range, the mesh grid that is passed to ``clf.predict`` and the scatter
  overlay. Every classifier must therefore accept two-feature input.

  Parameters
  ----------
  i, j : int
      Number of subplot rows and columns.
  stepsize : float
      Spacing of the mesh grid along both axes.
  X : array-like, at least two columns
      Points to scatter on top of the decision regions.
  y : array-like
      Values used to colour the scattered points.
  classifiers : iterable
      Fitted objects exposing ``predict``.
  texts : iterable of str
      Subplot titles, paired positionally with ``classifiers``.

  Returns
  -------
  The ``classifiers`` argument, unchanged.

  Notes
  -----
  Classifiers are assigned to axes in row-major order. If fewer than
  ``i * j`` classifiers are given the remaining axes stay empty; surplus
  classifiers are ignored because the pairing stops at the shortest input.
  """
  # Positional indexing below needs an ndarray (a DataFrame would reject X[:, 0]).
  X = np.asarray(X)

  x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
  y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

  xx, yy = np.meshgrid(np.arange(x_min, x_max, stepsize),
                       np.arange(y_min, y_max, stepsize))
  # The grid is the same for every classifier, so build it once.
  grid_points = np.c_[xx.ravel(), yy.ravel()]

  # squeeze=False keeps `axarr` two-dimensional even when i == 1 or j == 1,
  # so the [row, col] indexing below is always valid.
  f, axarr = plt.subplots(i, j, sharex='col', sharey='row', figsize=(10, 8),
                          squeeze=False)

  # Visit every cell of the grid (the previous corner-only product skipped
  # interior cells for grids larger than 2 x 2).
  for idx, clf, tt in zip(product(range(i), range(j)),
                          classifiers,
                          texts):
      ax = axarr[idx[0], idx[1]]
      Z = clf.predict(grid_points).reshape(xx.shape)

      ax.contourf(xx, yy, Z, alpha=0.4)
      ax.scatter(X[:, 0], X[:, 1], c=y,
                 s=20, edgecolor='k')
      ax.set_title(tt)

  plt.show()
  return classifiers

In [None]:
def report_classification(X, y, classifiers, texts):
  """Return the accuracy of each classifier on ``(X, y)``.

  ``classifiers`` and ``texts`` are paired positionally; the result maps
  each text label to ``accuracy_score(y, clf.predict(X))`` for the
  classifier paired with it.
  """
  from sklearn.metrics import accuracy_score

  return {
      label: accuracy_score(y, model.predict(X))
      for model, label in zip(classifiers, texts)
  }

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem (Colab-only step).
GDRIVE_MOUNT_POINT = "/content/gdrive"
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Location of the Titanic training CSV on the mounted Google Drive.
TRAIN_CSV_PATH = '/content/gdrive/My Drive/Colab Notebooks/FeatureSelection/train_titanic.csv'
data = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
data.keys()

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [None]:
# 'Survived' is the prediction target; every other column starts out as a feature.
y = data['Survived']
X = data.drop('Survived', axis=1)


In [None]:
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

# Analyze

In [None]:
# Rebuild the feature frame from `data` instead of mutating the current `X`:
#  - the cell becomes idempotent (re-running it no longer raises KeyError
#    because 'Name'/'Ticket' are already gone), and
#  - the label column is dropped here explicitly, so it cannot end up in the
#    features when cells are executed out of order.
# On a clean top-to-bottom run the result is identical to the previous
# two-step drop.
X = data.drop(columns=['Survived', 'Name', 'Ticket'])

In [None]:
X.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,7.25,,S
1,2,1,1,female,38.0,1,0,71.2833,C85,C
2,3,1,3,female,26.0,0,0,7.925,,S
3,4,1,1,female,35.0,1,0,53.1,C123,S
4,5,0,3,male,35.0,0,0,8.05,,S


In [None]:
X.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the 'Sex' strings with integer codes.
# LabelEncoder expects a 1-D array of labels; passing the column directly
# (rather than a reshaped (n, 1) column vector) avoids the column_or_1d
# DataConversionWarning and yields the same codes.
encoder = LabelEncoder()
temp = encoder.fit_transform(X['Sex'])
X['Sex'] = temp
X.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,1,22.0,1,0,7.25,,S
1,2,1,1,0,38.0,1,0,71.2833,C85,C
2,3,1,3,0,26.0,0,0,7.925,,S
3,4,1,1,0,35.0,1,0,53.1,C123,S
4,5,0,3,1,35.0,0,0,8.05,,S


In [None]:
def impute(df, columns, dft, random_state=None):
    """Fill missing values with random draws from observed values.

    For every name in ``columns``, each null entry of ``df[column]`` is
    replaced by a value sampled (with replacement) from the non-null entries
    of ``dft[column]``. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose missing values should be filled.
    columns : iterable of str
        Columns to impute; each must exist in both ``df`` and ``dft``.
    dft : pandas.DataFrame
        Frame supplying the donor values (may be ``df`` itself).
    random_state : None, int or object with a ``choice`` method, optional
        ``None`` (default) draws from NumPy's global random state, as before.
        An int seeds a private ``RandomState`` for reproducible results. An
        object exposing ``choice`` is used as the generator directly.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the requested columns imputed.

    Raises
    ------
    ValueError
        If a column has missing values but ``dft[column]`` has no non-null
        value to draw from.
    """
    if random_state is None:
        rng = np.random
    elif hasattr(random_state, "choice"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    df_temp = df.copy()
    for column in columns:
        missing = df_temp[column].isnull()
        n_missing = int(missing.sum())
        if n_missing == 0:
            continue

        # Build the donor pool once per column instead of once per element.
        pool = dft[column].dropna().to_numpy()
        if len(pool) == 0:
            raise ValueError(
                "cannot impute column %r: no non-null donor values" % (column,)
            )

        df_temp.loc[missing, column] = rng.choice(pool, size=n_missing)
    return df_temp

In [None]:
X['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [None]:
# Fill the missing 'Embarked' entries with random draws from the observed
# values, then replace the strings with integer codes.
X = impute(X, ['Embarked'], X)
# LabelEncoder expects a 1-D array of labels; passing the column directly
# (rather than a reshaped (n, 1) column vector) avoids the column_or_1d
# DataConversionWarning and yields the same codes.
encoder = LabelEncoder()
temp = encoder.fit_transform(X['Embarked'])
X['Embarked'] = temp
X.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,1,22.0,1,0,7.25,,2
1,2,1,1,0,38.0,1,0,71.2833,C85,0
2,3,1,3,0,26.0,0,0,7.925,,2
3,4,1,1,0,35.0,1,0,53.1,C123,2
4,5,0,3,1,35.0,0,0,8.05,,2


In [None]:
# Give every distinct 'Cabin' value an integer code in order of first
# appearance. Missing cabins are one of the distinct values returned by
# unique(), so they receive a code of their own rather than staying NaN.
cabin_values = X['Cabin'].unique()
mapper = dict(zip(cabin_values, range(len(cabin_values))))
X['Cabin'] = X['Cabin'].map(mapper)
X.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,1,22.0,1,0,7.25,0,2
1,2,1,1,0,38.0,1,0,71.2833,1,0
2,3,1,3,0,26.0,0,0,7.925,0,2
3,4,1,1,0,35.0,1,0,53.1,2,2
4,5,0,3,1,35.0,0,0,8.05,0,2


In [None]:
# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split

# Seed the shuffle so the 60/40 split -- and every metric computed from it --
# is the same on each Restart & Run All.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=SPLIT_SEED
)

In [None]:
from sklearn.impute import SimpleImputer

# Replace remaining NaNs with each column's most frequent value. The imputer
# is fitted on the training split only and then applied to both splits.
obj = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
obj.fit(X_train)
X_train = obj.transform(X_train)
X_test = obj.transform(X_test)

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Scaling statistics are fitted on the training split only and then applied
# to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Fixed seed for the estimators that use randomness while fitting, so the
# fitted models are the same on every run.
MODEL_SEED = 42

# Single source of truth (label -> estimator): `texts` and `classifiers` are
# both derived from it, so the two lists cannot drift out of sync.
named_models = {
    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=4, random_state=MODEL_SEED),
    "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=7),
    "SVC": SVC(gamma=.1, kernel='rbf', probability=True, random_state=MODEL_SEED),
    "LogisticRegression": LogisticRegression(),
}

texts = list(named_models)
# fit() returns the estimator itself, so this is a list of fitted models in
# the same order as `texts`.
classifiers = [model.fit(X_train, y_train) for model in named_models.values()]

In [None]:
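# NOTE: intentionally left disabled. get_classification() calls clf.predict()
# on a 2-column mesh grid, but the classifiers above are fitted on every
# column of X_train, so this call only works if the models are refit on
# exactly two features.
# classifiers = get_classification(2,2,0.1, X_test, y_test, classifiers, texts)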

In [None]:
# Accuracy of every fitted classifier on the held-out split.
# NOTE(review): the recorded accuracies below (1.0 for two of the models) look
# implausibly high. The recorded X.head() outputs earlier in the notebook show
# a 'Survived' column inside X, which suggests these numbers come from a run
# in which the label leaked into the features. Re-run from a fresh kernel
# before trusting them.
report_classification( X_test, y_test,classifiers,texts)

{'DecisionTreeClassifier': 1.0,
 'KNeighborsClassifier': 0.988795518207283,
 'LogisticRegression': 1.0,
 'SVC': 0.9971988795518207}