In [None]:
import inspect
import re

import numpy as np
import pandas as pd
# Running list of {"classifier": ..., "Accuracy": ...} records, one per
# evaluated model; converted into a summary DataFrame at the end.
accutacy_values = []

In [None]:
class Node:
    """One vertex of a binary decision tree.

    An internal node stores the ``feature``/``threshold`` pair it tests and
    its ``left``/``right`` children. A leaf stores only ``value`` (the
    prediction), which must be passed by keyword.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature, self.threshold = feature, threshold
        self.left, self.right = left, right
        self.value = value

    def is_leaf(self):
        """Return True when the node carries a prediction (``value`` is set)."""
        return not (self.value is None)

In [None]:
# Load the raw train/test splits.
# The test frame is deliberately NOT blanket-filled with 0 here: doing so
# pre-empts the per-column imputation in the preprocessing cell (missing ages
# would become 0 and land in the "<= 16" bin, and missing cabins would turn
# into the int 0, which the `type(x) == float` check treats as "has a cabin").
# Both frames now go through exactly the same imputation steps.
titanic_train = pd.read_csv('train.csv.xls')
titanic_test = pd.read_csv('test.csv.xls')
# Shared references: every preprocessing loop edits both frames in place.
full_data = [titanic_train, titanic_test]

In [None]:
# data preprocessing
# Each step is applied to the train frame first and then to the test frame;
# both are modified in place through the references held in full_data.
for dataset in full_data:
    # Cabin -> 0/1 flag: 0 when the raw cell is a Python float, 1 otherwise.
    dataset['Cabin'] = dataset['Cabin'].apply(lambda x: 0 if type(x) == float else 1)

    # FamilySize = SibSp + Parch + 1; IsAlone marks rows where that equals 1.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

    # Missing ports default to 'S'; missing fares use the TRAIN median.
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    dataset['Fare'] = dataset['Fare'].fillna(titanic_train['Fare'].median())

    # Missing ages are drawn uniformly from [mean - std, mean + std) of the
    # frame's own Age column, then the whole column is cast to int.
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    missing_age = dataset['Age'].isnull()
    age_null_count = missing_age.sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset.loc[missing_age, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

def get_title(name):
    """Return the honorific found in ``name``, or ``""`` when there is none.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. ``"Braund, Mr. Owen"`` gives
    ``"Mr"``.
    """
    # Raw string: in a normal literal '\.' is an invalid escape sequence,
    # which newer Python versions warn about at compile time.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""

# Derive a Title column from each name, then turn every remaining feature
# into a small integer code. Applied in place to both frames.
rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}

for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)

    # Group all non-common titles into one single grouping "Rare" and fold
    # the spelling variants into their common form.
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

    # Mapping titles; anything outside the mapping becomes 0.
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

    # Mapping Fare into four bins. The bins are applied in ascending order so
    # an already-assigned code (0-3) never satisfies a later condition.
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

    # Mapping Age into five bins (same ascending-order argument as for Fare).
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    # The original line selected these rows but never assigned to them, so
    # ages above 64 kept their raw value instead of a bin code.
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4

# Drop the columns that are not used as model inputs. drop() returns new
# frames, so from here on the names no longer alias the entries of full_data.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
titanic_train = titanic_train.drop(drop_elements, axis = 1)
titanic_test  = titanic_test.drop(drop_elements, axis = 1)

In [None]:
class DecisionTree(object):
  """Binary classification tree grown greedily on ``feature <= threshold`` splits.

  Parameters
  ----------
  criterion : {'entropy', 'gini'}
      Impurity measure used to score candidate splits.
  min_samples_split : int
      A node with fewer rows than this becomes a leaf.
  max_depth : int
      Nodes at this depth become leaves.
  min_samples_leaf : int
      A split is only considered when both children receive at least this
      many rows (and never fewer than one). When no candidate qualifies the
      node becomes a leaf.

  Labels must be non-negative integers because they are counted with
  ``np.bincount``. The fitted tree is stored in ``self.root``.

  Raises
  ------
  ValueError
      If ``criterion`` is not one of the supported names.
  """

  def __init__(self,criterion='entropy',min_samples_split=10,max_depth=4,min_samples_leaf=2):
    impurity_functions = {'entropy': self.entropy, 'gini': self.gini}
    if criterion not in impurity_functions:
      # Returning None from __init__ would hand back a half-built object.
      raise ValueError(
          "criterion must be 'entropy' or 'gini', got {!r}".format(criterion))
    self.root = None
    self.criterion = impurity_functions[criterion]
    self.min_samples_split = min_samples_split
    self.max_depth = max_depth
    self.min_samples_leaf = min_samples_leaf
    self.classes = None

  def fit(self, X, y, sample_weight=None):
    """Grow the tree from DataFrame ``X`` and integer labels ``y``.

    ``sample_weight`` (optional, one non-negative value per row) makes the
    impurity, the split fractions and the leaf votes weight-based. Without
    it every row counts equally. Returns None; read ``self.root``.

    Raises ValueError for empty or mismatched inputs.
    """
    y = np.asarray(y)
    if len(y) == 0 or len(y) != len(X):
      raise ValueError("X and y must be non-empty and have the same number of rows")
    if sample_weight is not None:
      sample_weight = np.asarray(sample_weight, dtype=float)
      if len(sample_weight) != len(y):
        raise ValueError("sample_weight must have one entry per training row")

    # Positional indexing is used throughout, so drop whatever index X has.
    X = X.reset_index(drop=True)
    # Summary of the full training set (recorded once, not per node).
    self.n_samples = len(y)
    self.n_features = np.array(X.columns)
    self.classes = len(np.unique(y))
    self.root = self._build_tree(X, y, sample_weight=sample_weight)

  def _build_tree(self, X, y, depth=0, sample_weight=None):
    """Recursively build and return the subtree for the rows in ``X``/``y``."""
    y = np.asarray(y)

    # stopping criteria
    if (depth >= self.max_depth
        or len(np.unique(y)) <= 1
        or len(y) < self.min_samples_split):
      return self._leaf(y, sample_weight)

    # get best split; None means no threshold leaves enough rows on both sides
    best_feat, best_thresh = self._best_split(X, y, np.array(X.columns), sample_weight)
    if best_feat is None:
      return self._leaf(y, sample_weight)

    # grow children recursively on positionally re-indexed subsets
    children = []
    for idx in self._create_split(X[best_feat], best_thresh):
      child_weight = None if sample_weight is None else sample_weight[idx]
      child_X = X.iloc[idx].reset_index(drop=True)
      children.append(self._build_tree(child_X, y[idx], depth + 1, child_weight))
    return Node(best_feat, best_thresh, children[0], children[1])

  def _leaf(self, y, sample_weight=None):
    """Return a leaf predicting the (weighted) most common label in ``y``."""
    return Node(value=np.argmax(np.bincount(y, weights=sample_weight)))

  def _best_split(self, X, y, features, sample_weight=None):
    """Return ``(feature, threshold)`` with the highest gain, or ``(None, None)``.

    Every unique value of every feature is tried as a threshold. Candidates
    that would leave a child with fewer than ``min_samples_leaf`` rows are
    skipped. Ties keep the first candidate encountered.
    """
    y = np.asarray(y)
    min_leaf = max(1, self.min_samples_leaf)
    best_score, best_feat, best_thresh = -np.inf, None, None

    for feat in features:
      X_feat = X[feat]
      for thresh in np.unique(X_feat):
        left_idx, right_idx = self._create_split(X_feat, thresh)
        if len(left_idx) < min_leaf or len(right_idx) < min_leaf:
          continue
        score = self._gain(y, left_idx, right_idx, sample_weight)
        if score > best_score:
          best_score, best_feat, best_thresh = score, feat, thresh
    return best_feat, best_thresh

  def information_gain(self, X, y, thresh, sample_weight=None):
    """Impurity decrease from splitting feature column ``X`` at ``thresh``.

    Returns 0 when one side of the split would be empty.
    """
    y = np.asarray(y)
    if sample_weight is not None:
      sample_weight = np.asarray(sample_weight, dtype=float)
    left_idx, right_idx = self._create_split(X, thresh)
    if len(left_idx) == 0 or len(right_idx) == 0:
      return 0
    return self._gain(y, left_idx, right_idx, sample_weight)

  def _gain(self, y, left_idx, right_idx, sample_weight=None):
    """Parent impurity minus the size-weighted impurity of the two children."""
    if sample_weight is None:
      n = len(y)
      left_w = right_w = None
      left_frac, right_frac = len(left_idx) / n, len(right_idx) / n
    else:
      total = sample_weight.sum()
      if total <= 0:
        return 0
      left_w, right_w = sample_weight[left_idx], sample_weight[right_idx]
      left_frac, right_frac = left_w.sum() / total, right_w.sum() / total

    child_loss = (left_frac * self.criterion(y[left_idx], left_w)
                  + right_frac * self.criterion(y[right_idx], right_w))
    return self.criterion(y, sample_weight) - child_loss

  def _class_proportions(self, y, sample_weight=None):
    """Share of each label 0..max(y) in ``y``, by weight when weights are given."""
    y = np.asarray(y)
    if sample_weight is not None:
      weights = np.asarray(sample_weight, dtype=float)
      total = weights.sum()
      if total > 0:
        return np.bincount(y, weights=weights) / total
      # All-zero weights carry no information: fall back to raw counts.
    return np.bincount(y) / len(y)

  def entropy(self, y, sample_weight=None):
    """Shannon entropy (base 2) of the label distribution in ``y``."""
    proportions = self._class_proportions(y, sample_weight)
    positive = proportions[proportions > 0]
    return -np.sum(positive * np.log2(positive))

  def gini(self,y, sample_weight=None):
    """Gini impurity of the label distribution in ``y``."""
    proportions = self._class_proportions(y, sample_weight)
    return 1 - np.sum(proportions ** 2)

  def _create_split(self, X, thresh):
    """Return positional row indices ``(<= thresh, > thresh)`` for column ``X``.

    Positions (not index labels) are returned so they can be used directly
    on numpy arrays and with ``DataFrame.iloc`` whatever index ``X`` has.
    NaN values satisfy neither comparison and so land on neither side.
    """
    values = np.asarray(X)
    left_idx = np.flatnonzero(values <= thresh)
    right_idx = np.flatnonzero(values > thresh)
    return left_idx, right_idx

  def _traverse_tree(self, x, node):
    """Walk from ``node`` down to a leaf for row ``x`` and return its value."""
    while not node.is_leaf():
      node = node.left if x[node.feature] <= node.threshold else node.right
    return node.value

  def predict(self, X):
    """Return an array with one predicted label per row of DataFrame ``X``.

    Raises ValueError when called before ``fit``.
    """
    if self.root is None:
      raise ValueError("fit must be called before predict")
    predictions = [self._traverse_tree(row, self.root) for _, row in X.iterrows()]
    return np.array(predictions)


In [None]:
# Train a depth-4 Gini tree on every column except the label.
classifer = DecisionTree(criterion = 'gini',max_depth=4)
classifer.fit(titanic_train.drop(['Survived'],axis=1),titanic_train['Survived'].to_numpy())
# fit() stores the tree on the instance and returns None, so the root has to
# be read from the attribute rather than from fit's return value.
root = classifer.root

In [None]:
# One predicted label per row of the preprocessed test frame.
y_pred = classifer.predict(titanic_test)

In [None]:
# Reference labels for scoring; the cells below read its 'Survived' column.
# NOTE(review): presumably a submission-style file rather than true labels - confirm.
Y = pd.read_csv('submission.csv')

In [None]:
def accuracy(y_pred,y_out):
  """Return the percentage (0-100) of positions where ``y_pred`` equals ``y_out``.

  Both arguments are converted to arrays and compared position by position,
  so a pandas Series is handled correctly whatever its index labels are.

  Raises
  ------
  ValueError
      If the inputs are empty or differ in shape. (Previously a longer
      ``y_pred`` was silently truncated and empty input divided by zero.)
  """
  predicted = np.asarray(y_pred)
  expected = np.asarray(y_out)
  if predicted.shape != expected.shape:
    raise ValueError(
        "y_pred and y_out must have the same shape, got {} and {}".format(
            predicted.shape, expected.shape))
  if expected.size == 0:
    raise ValueError("accuracy is undefined for empty input")
  matches = int(np.count_nonzero(predicted == expected))
  return matches / len(expected) * 100

In [None]:
# Score the decision tree once, echo the percentage, and record it (as a
# string) in the running results list.
tree_accuracy = accuracy(y_pred, Y['Survived'].values)
print(tree_accuracy)
accutacy_values.append({"classifier": 'Decession Tree', "Accuracy": str(tree_accuracy)})

97.1291866028708


In [None]:
class RandomForest(object):
  """Majority-vote ensemble of ``DecisionTree`` models for 0/1 labels.

  Each tree is trained on a bootstrap sample of the rows (same size as the
  training set, drawn with replacement) restricted to a random subset of the
  feature columns.

  Parameters
  ----------
  num_trees : int
      Number of trees grown by ``fit``.
  min_features : int
      Smallest number of columns a tree may be given. The actual number is
      drawn uniformly between this and the total column count, inclusive.

  Attributes
  ----------
  trees : list
      Root node of every fitted tree.
  feature_subsets : list of list
      Column names each tree was trained on, in the same order as ``trees``.
  """

  def __init__(self,num_trees=10,min_features=5):
    self.num_trees = num_trees
    self.min_features = min_features
    self.trees = []
    self.feature_subsets = []

  def fit(self,X,y):
    """Grow ``num_trees`` trees from DataFrame ``X`` and labels ``y``.

    ``X`` is never modified. Calling ``fit`` again discards earlier trees.
    Raises ValueError for empty or mismatched inputs.
    """
    y = np.asarray(y)
    n_rows, n_cols = X.shape
    if n_rows == 0 or n_cols == 0 or n_rows != len(y):
      raise ValueError("X must be non-empty and have one row per label in y")

    # Work on a positionally indexed copy so the caller's frame is untouched
    # and the label never ends up among the candidate feature columns.
    features = X.reset_index(drop=True)
    fewest = max(1, min(self.min_features, n_cols))

    self.trees = []
    self.feature_subsets = []
    for _ in range(self.num_trees):
      n_chosen = np.random.randint(fewest, n_cols + 1)
      col_positions = np.random.choice(n_cols, size=n_chosen, replace=False)
      chosen = features.columns[col_positions].tolist()

      # Bootstrap: n_rows draws with replacement. (DataFrame.sample() with no
      # n/frac returns a single row, which is what the old code trained on.)
      rows = np.random.randint(0, n_rows, size=n_rows)
      decession = DecisionTree()
      decession.fit(features.iloc[rows][chosen].reset_index(drop=True), y[rows])

      self.trees.append(decession.root)
      self.feature_subsets.append(chosen)

  def _traverse_tree(self, x, node):
    """Walk from ``node`` down to a leaf for row ``x`` and return its value."""
    while not node.is_leaf():
      node = node.left if x[node.feature] <= node.threshold else node.right
    return node.value

  def predict(self, X):
    """Return a list with the majority-vote label (0 or 1) for each row of ``X``.

    Any non-zero tree output counts as a vote for 1, and ties go to 1.
    Raises ValueError when no trees have been fitted.
    """
    if not self.trees:
      raise ValueError("fit must be called before predict")

    # Iterate over the trees actually present, not over num_trees.
    votes = np.zeros((len(self.trees), len(X)))
    for i, tree_root in enumerate(self.trees):
      votes[i] = [self._traverse_tree(row, tree_root) for _, row in X.iterrows()]

    zero_votes = (votes == 0).sum(axis=0)
    one_votes = len(self.trees) - zero_votes
    return np.where(zero_votes > one_votes, 0, 1).tolist()

In [None]:
# Fit a forest with the default settings on every column except the label.
# (The variable is named `random`; the stdlib module of that name is not
# imported in this notebook.)
random = RandomForest()
random.fit(
    titanic_train.drop(['Survived'], axis=1),
    titanic_train['Survived'].to_numpy(),
)

['Title', 'Embarked', 'Parch', 'Pclass', 'Age', 'Fare', 'IsAlone']
['Pclass', 'Embarked', 'Parch', 'Fare', 'Title', 'survived']
['Age', 'Pclass', 'IsAlone', 'Sex', 'survived', 'Title', 'Fare', 'Embarked']
['Title', 'FamilySize', 'Parch', 'Fare', 'Age', 'Sex']
['FamilySize', 'Parch', 'Fare', 'IsAlone', 'survived', 'Embarked', 'Age', 'Pclass']
['IsAlone', 'Sex', 'Pclass', 'survived', 'FamilySize', 'Fare', 'Parch']
['Parch', 'Sex', 'Title', 'Embarked', 'IsAlone', 'Age']
['survived', 'Fare', 'Title', 'IsAlone', 'Age', 'Parch', 'Sex', 'Pclass', 'Embarked']
['Parch', 'Age', 'Embarked', 'Pclass', 'survived', 'Sex']
['IsAlone', 'Parch', 'survived', 'FamilySize', 'Fare', 'Sex', 'Age']


In [None]:
# Majority-vote forest predictions for the preprocessed test frame.
pred = random.predict(titanic_test)

In [None]:
# Score the forest once, echo the percentage, and record it (as a string)
# in the running results list.
forest_accuracy = accuracy(pred, Y['Survived'].values)
print(forest_accuracy)
accutacy_values.append({"classifier": 'Random Forest', "Accuracy": str(forest_accuracy)})

59.80861244019139


In [None]:
def compute_error(y, y_pred, w):
    """Weighted misclassification rate.

    Sums the weights ``w`` of the positions where ``y`` and ``y_pred``
    differ and divides by the total weight.
    """
    misclassified = np.not_equal(y, y_pred).astype(int)
    weighted_misses = sum(w * misclassified)
    return weighted_misses / sum(w)

def compute_alpha(error):
    """Return the vote weight ``log((1 - error) / error)`` of a weak learner.

    ``error`` is clipped to ``[1e-10, 1 - 1e-10]`` first. Without that, an
    error of exactly 0 or 1 gives +/-inf (or a ZeroDivisionError for a plain
    Python float), and multiplying an infinite alpha by 0 in the weight
    update turns the sample weights into NaN.
    """
    eps = 1e-10
    bounded = np.clip(error, eps, 1 - eps)
    return np.log((1 - bounded) / bounded)

def update_weights(w, alpha, y, y_pred):
    """Scale the weights of misclassified positions by ``exp(alpha)``.

    Positions where ``y`` equals ``y_pred`` keep their weight (factor
    ``exp(0)``). The result is not renormalised.
    """
    missed = np.not_equal(y, y_pred).astype(int)
    boost = np.exp(alpha * missed)
    return w * boost

In [None]:
class AdaBoost:
    """Discrete AdaBoost over depth-1 ``DecisionTree`` stumps for 0/1 labels.

    Each round fits a stump on the current sample weights, measures its
    weighted training error, derives its vote weight ``alpha`` from that
    error, and increases the weight of the rows it misclassified.

    Attributes
    ----------
    classifiers : list
        Fitted weak learners, one per round.
    alphas : list
        Vote weight of each weak learner.
    training_errors : list
        Weighted training error of each weak learner.
    M : int or None
        Number of rounds requested in the last ``fit`` call.
    prediction_errors : list
        Never written by this class; kept for compatibility.
    """

    # Bounds the weighted error away from 0 and 1 so alpha stays finite.
    _EPS = 1e-10

    def __init__(self):
        self.alphas = []
        self.classifiers = []
        self.M = None
        self.training_errors = []
        self.prediction_errors = []

    def fit(self, X, y, M = 100):
        """Run ``M`` boosting rounds on DataFrame ``X`` and 0/1 labels ``y``.

        State from an earlier ``fit`` call is discarded.
        Raises ValueError for empty/mismatched inputs or labels outside {0, 1}.
        """
        y = np.asarray(y)
        n = len(y)
        if n == 0 or n != len(X):
            raise ValueError("X and y must be non-empty and have the same number of rows")
        if not np.isin(y, (0, 1)).all():
            raise ValueError("AdaBoost expects labels encoded as 0/1")
        X = X.reset_index(drop=True)

        self.alphas = []
        self.classifiers = []  # reset together with alphas so they stay aligned
        self.training_errors = []
        self.M = M

        w = np.ones(n) / n
        for m in range(0, M):
            classifiers = DecisionTree(max_depth = 1)
            self._fit_weak_learner(classifiers, X, y, w)
            y_pred = np.asarray(classifiers.predict(X))

            self.classifiers.append(classifiers) # Save to list of weak classifiers

            # (b) Compute error, clipped so that alpha is always finite
            error = compute_error(y, y_pred, w)
            self.training_errors.append(error)
            alpha = compute_alpha(np.clip(error, self._EPS, 1 - self._EPS))
            self.alphas.append(alpha)

            # (c) Re-weight for the next round. Normalising keeps the weights
            # a probability vector; the error is scale-invariant.
            w = update_weights(w, alpha, y, y_pred)
            total = w.sum()
            if total > 0:
                w = w / total

    @staticmethod
    def _fit_weak_learner(learner, X, y, w):
        """Fit ``learner`` so that it honours the sample weights ``w``.

        Weights are passed straight through when ``learner.fit`` has a
        ``sample_weight`` parameter. Otherwise the training set is resampled
        with replacement using ``w`` as the draw probabilities.
        """
        if 'sample_weight' in inspect.signature(learner.fit).parameters:
            learner.fit(X, y, sample_weight = w)
            return
        n = len(y)
        drawn = np.random.choice(n, size = n, replace = True, p = w / w.sum())
        learner.fit(X.iloc[drawn].reset_index(drop = True), y[drawn])

    def predict(self, X):
        """Return a Series (index 0..len(X)-1) of alpha-weighted 0/1 votes.

        Each weak prediction is mapped from {0, 1} to {-1, +1} before being
        multiplied by its alpha, so a vote for class 0 counts against class 1
        instead of contributing nothing. A zero total predicts 0.
        Raises ValueError when called before ``fit``.
        """
        if not self.classifiers:
            raise ValueError("fit must be called before predict")

        score = np.zeros(len(X))
        for weak, alpha in zip(self.classifiers, self.alphas):
            signed_vote = 2 * np.asarray(weak.predict(X)) - 1
            score = score + alpha * signed_vote

        y_pred = pd.Series((score > 0).astype(int), index = range(len(X)))

        return y_pred

In [None]:
# Summary table: one row per scored model, built from the records collected above.
df=pd.DataFrame(accutacy_values)

In [None]:
# Display the summary table (Accuracy values are stored as strings).
df

Unnamed: 0,classifier,Accuracy
0,Decession Tree,97.1291866028708
1,Random Forest,59.80861244019139
