In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

class DecisionTreeNode:
    """Plain record describing one node of a ``DecisionTree``.

    Attributes
    ----------
    left, right :
        Child subtrees of an internal node; ``None`` on a leaf.
    reference_value :
        Threshold the selected feature is compared against; ``None`` on a leaf.
    output :
        Class predicted by a leaf; ``None`` on an internal node.
    feature_index :
        Positional index of the feature used for the split; ``None`` on a leaf.
    """

    def __init__(self, left, right, reference_value, output, feature_index):
        # Children first, then the split description, then the leaf payload.
        self.left, self.right = left, right
        self.reference_value, self.feature_index = reference_value, feature_index
        self.output = output


class DecisionTree(BaseEstimator, ClassifierMixin):
    """Binary classification tree grown greedily on weighted Gini impurity.

    Every ``DecisionTree`` owns exactly one ``DecisionTreeNode`` in ``root``.
    An internal node stores the split (``feature_index``, ``reference_value``)
    plus two child ``DecisionTree`` objects: rows whose feature value is
    ``>= reference_value`` go to ``left``, every other row (missing values
    included) goes to ``right``. A leaf stores the majority class in ``output``.

    Parameters
    ----------
    max_depth : int
        A node whose ``depth`` has reached this value becomes a leaf.
    min_split : int
        A node holding fewer samples than this becomes a leaf.
    depth : int, default 0
        Depth of this (sub)tree; children are created with ``depth + 1``.
    """

    def __init__(self, max_depth, min_split, depth = 0):
        self.max_depth = max_depth
        self.min_split = min_split
        self.depth = depth
        # Filled in by fit(); stays None until the tree has been trained.
        self.root = None

    @staticmethod
    def _is_empty(part):
        """Return True when ``part`` is ``None`` or an empty pandas object."""
        return part is None or part.empty

    def fit(self, X, y):
        """Grow the tree on ``X`` / ``y`` and return ``self``.

        Parameters
        ----------
        X : DataFrame or 2-D array-like
            Numeric feature matrix; columns are addressed by position.
        y : Series, single-column DataFrame or 1-D array-like
            Integer class labels, one per row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length or contain no rows.
        """
        # Positional indexes keep the boolean row masks valid regardless of the
        # index the caller's objects carry (e.g. after a train/test split).
        X = pd.DataFrame(X).reset_index(drop=True)
        y = pd.DataFrame(y).reset_index(drop=True)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(X)} and {len(y)}."
            )
        if len(X) == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty data set.")

        self.X = X
        self.y = y
        self.root = self.find_lowest_gini(X, y)
        # Returning self follows the scikit-learn estimator convention.
        return self

    def find_lowest_gini(self, X, y):
        """Search every feature for the split with the lowest weighted Gini.

        Candidate thresholds are the midpoints between consecutive distinct
        values of a feature, so a feature with a single distinct value offers
        no candidate and a two-valued feature offers exactly one. Ties keep
        the first split found. Returns the ``DecisionTreeNode`` for this node.
        """
        X = pd.DataFrame(X)
        y = pd.DataFrame(y)

        lowest_gini = 1
        best_mask = None
        best_reference_value = None
        best_feature_index = None

        for feature_index in range(X.shape[1]):
            column = X.iloc[:, feature_index]
            # np.unique returns the distinct values already sorted.
            distinct = np.unique(column.dropna().to_numpy())
            if len(distinct) < 2:
                continue

            for reference_value in (distinct[:-1] + distinct[1:]) / 2:
                goes_left = (column >= reference_value).to_numpy(dtype=bool, na_value=False)
                # Floating-point rounding can make a midpoint collapse onto a
                # neighbour; a split with an empty side is useless.
                if goes_left.all() or not goes_left.any():
                    continue

                gini_score = self.gini(y.loc[goes_left], y.loc[~goes_left])
                if gini_score < lowest_gini:
                    lowest_gini = gini_score
                    best_mask = goes_left
                    best_reference_value = reference_value
                    best_feature_index = feature_index

        if best_mask is None:
            # No feature can separate the rows: the node will become a leaf.
            best_x_left = best_x_right = best_y_left = best_y_right = None
        else:
            best_x_left = X.loc[best_mask].reset_index(drop=True)
            best_x_right = X.loc[~best_mask].reset_index(drop=True)
            best_y_left = y.loc[best_mask].reset_index(drop=True)
            best_y_right = y.loc[~best_mask].reset_index(drop=True)

        return self.append_tree_check(best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value, best_feature_index, lowest_gini, y)

    def append_tree_check(self, best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value, best_feature_index, lowest_gini, y):
        """Collect all samples of the node, then delegate to ``append_tree``.

        When one side of the split is missing the other side is the whole
        node; when both are missing (no split was found) ``y`` is used.
        """
        left_missing = self._is_empty(best_x_left) or self._is_empty(best_y_left)
        right_missing = self._is_empty(best_x_right) or self._is_empty(best_y_right)

        if left_missing and right_missing:
            y_overall = pd.DataFrame(y)
            # Only the row count of x_overall is needed downstream.
            x_overall = pd.DataFrame(index=y_overall.index)
        elif left_missing:
            x_overall = pd.DataFrame(best_x_right)
            y_overall = pd.DataFrame(best_y_right)
        elif right_missing:
            x_overall = pd.DataFrame(best_x_left)
            y_overall = pd.DataFrame(best_y_left)
        else:
            x_overall = pd.concat([best_x_left, best_x_right])
            y_overall = pd.concat([best_y_left, best_y_right])

        total_data = len(x_overall)
        return self.append_tree(x_overall, y_overall, total_data, best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value, best_feature_index, lowest_gini, y)

    def append_tree(self, x_overall, y_overall, total_data, best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value,best_feature_index, lowest_gini, y):
        """Create either a leaf or an internal node with two fitted subtrees.

        A leaf is produced when the node is too small (``min_split``), too
        deep (``max_depth``), already pure, or has no usable split. The leaf
        predicts the most frequent label of the node, converted with ``int``.

        ``lowest_gini`` and ``y`` are accepted for backward compatibility.
        Purity is tested on the node's own labels: a split impurity of 0 means
        the split separates the classes perfectly and must be kept.
        """
        label_frame = pd.DataFrame(y_overall)
        if label_frame.empty:
            raise ValueError("Cannot build a tree node without any training labels.")
        # The label column is taken by position so the Series/column name is irrelevant.
        labels = label_frame.iloc[:, 0]

        no_split = any(
            self._is_empty(part)
            for part in (best_x_left, best_x_right, best_y_left, best_y_right)
        )
        is_leaf = (
            total_data < self.min_split
            or self.depth >= self.max_depth
            or labels.nunique() <= 1
            or no_split
        )

        if is_leaf:
            final_output = int(labels.mode().iloc[0])
            return DecisionTreeNode(left = None, right = None, reference_value = None, feature_index = None, output = final_output)

        left_node = DecisionTree(max_depth = self.max_depth, min_split = self.min_split, depth = self.depth + 1)
        right_node = DecisionTree(max_depth = self.max_depth, min_split = self.min_split, depth = self.depth + 1)
        left_node.fit(best_x_left, best_y_left)
        right_node.fit(best_x_right, best_y_right)
        return DecisionTreeNode(left = left_node, right = right_node, reference_value = best_reference_value, feature_index = best_feature_index, output = None)

    def gini(self, y_left, y_right):
        """Return the size-weighted Gini impurity of two label collections.

        An empty side contributes nothing; if both sides are empty the result
        is ``0.0``.
        """
        weighted_sum = 0.0
        total = 0
        for part in (y_left, y_right):
            labels = pd.DataFrame(part)
            count = len(labels)
            if count == 0:
                continue
            shares = labels.value_counts().to_numpy() / count
            weighted_sum += (1.0 - float(np.sum(shares ** 2))) * count
            total += count

        return weighted_sum / total if total else 0.0

    def predict(self, X):
        """Predict one class per row of ``X`` and return them as a NumPy array.

        Features are read by position, matching how the tree was trained.

        Raises
        ------
        AttributeError
            If the tree has not been fitted yet.
        """
        if self.root is None:
            raise AttributeError("This DecisionTree is not fitted yet; call fit() first.")

        predictions = []
        for row in pd.DataFrame(X).itertuples(index=False, name=None):
            current_node = self.root
            while current_node.feature_index is not None:
                if row[current_node.feature_index] >= current_node.reference_value:
                    current_node = current_node.left.root
                else:
                    # Also taken for NaN, so the walk always terminates.
                    current_node = current_node.right.root
            predictions.append(current_node.output)
        return np.array(predictions)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``.

        ``y`` may be a Series, a single-column DataFrame or a 1-D array-like.
        """
        pred = self.predict(X)
        truth = np.asarray(y).reshape(-1)
        if len(truth) != len(pred):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(pred)} and {len(truth)}."
            )
        accuracy = (pred == truth).mean()
        return accuracy



Stage 1 Results

In [None]:
# Step 1: Load the records (the workbook uses ',' as the decimal mark)
df = pd.read_excel("final_data492_collab.xlsx", decimal=',')

# Step 2: Remove the columns that are not used here, then split label and features
df_dropped = df.drop(columns=['interview_score', 'result_2', 'year'])
y = df_dropped['result_1']
X = df_dropped.drop(columns=['result_1'])

# Step 3: Build a standardised copy of the features
# NOTE(review): X_scaled is computed here, but the cross-validation below is
# run on the unscaled X.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X))

# Step 4: 5-fold cross-validation of the stage-1 tree
stage_1_tree = DecisionTree(max_depth=14, min_split=9, depth=0)
cv_scores = cross_val_score(stage_1_tree, X, y, cv=5)

print(cv_scores)
print("Mean cv score: ", cv_scores.mean())


[0.87850467 0.85046729 0.89622642 0.88679245 0.81132075]
Mean cv score:  0.8646623170516664


In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

class DecisionTreeNode:
    """Plain record describing one node of a ``DecisionTree``.

    Attributes
    ----------
    left, right :
        Child subtrees of an internal node; ``None`` on a leaf.
    reference_value :
        Threshold the selected feature is compared against; ``None`` on a leaf.
    output :
        Class predicted by a leaf; ``None`` on an internal node.
    feature_index :
        Positional index of the feature used for the split; ``None`` on a leaf.
    """

    def __init__(self, left, right, reference_value, output, feature_index):
        # Children first, then the split description, then the leaf payload.
        self.left, self.right = left, right
        self.reference_value, self.feature_index = reference_value, feature_index
        self.output = output


class DecisionTree(BaseEstimator, ClassifierMixin):
    """Binary classification tree grown greedily on weighted Gini impurity.

    Every ``DecisionTree`` owns exactly one ``DecisionTreeNode`` in ``root``.
    An internal node stores the split (``feature_index``, ``reference_value``)
    plus two child ``DecisionTree`` objects: rows whose feature value is
    ``>= reference_value`` go to ``left``, every other row (missing values
    included) goes to ``right``. A leaf stores the majority class in ``output``.

    Note that ``score`` in this variant returns ``(accuracy, predictions)``
    instead of a single float.

    Parameters
    ----------
    max_depth : int
        A node whose ``depth`` has reached this value becomes a leaf.
    min_split : int
        A node holding fewer samples than this becomes a leaf.
    depth : int, default 0
        Depth of this (sub)tree; children are created with ``depth + 1``.
    """

    def __init__(self, max_depth, min_split, depth = 0):
        self.max_depth = max_depth
        self.min_split = min_split
        self.depth = depth
        # Filled in by fit(); stays None until the tree has been trained.
        self.root = None

    @staticmethod
    def _is_empty(part):
        """Return True when ``part`` is ``None`` or an empty pandas object."""
        return part is None or part.empty

    def fit(self, X, y):
        """Grow the tree on ``X`` / ``y`` and return ``self``.

        Parameters
        ----------
        X : DataFrame or 2-D array-like
            Numeric feature matrix; columns are addressed by position.
        y : Series, single-column DataFrame or 1-D array-like
            Integer class labels, one per row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length or contain no rows.
        """
        # Positional indexes keep the boolean row masks valid regardless of the
        # index the caller's objects carry (e.g. after a train/test split).
        X = pd.DataFrame(X).reset_index(drop=True)
        y = pd.DataFrame(y).reset_index(drop=True)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(X)} and {len(y)}."
            )
        if len(X) == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty data set.")

        self.X = X
        self.y = y
        self.root = self.find_lowest_gini(X, y)
        # Returning self follows the scikit-learn estimator convention.
        return self

    def find_lowest_gini(self, X, y):
        """Search every feature for the split with the lowest weighted Gini.

        Candidate thresholds are the midpoints between consecutive distinct
        values of a feature, so a feature with a single distinct value offers
        no candidate and a two-valued feature offers exactly one. Ties keep
        the first split found. Returns the ``DecisionTreeNode`` for this node.
        """
        X = pd.DataFrame(X)
        y = pd.DataFrame(y)

        lowest_gini = 1
        best_mask = None
        best_reference_value = None
        best_feature_index = None

        for feature_index in range(X.shape[1]):
            column = X.iloc[:, feature_index]
            # np.unique returns the distinct values already sorted.
            distinct = np.unique(column.dropna().to_numpy())
            if len(distinct) < 2:
                continue

            for reference_value in (distinct[:-1] + distinct[1:]) / 2:
                goes_left = (column >= reference_value).to_numpy(dtype=bool, na_value=False)
                # Floating-point rounding can make a midpoint collapse onto a
                # neighbour; a split with an empty side is useless.
                if goes_left.all() or not goes_left.any():
                    continue

                gini_score = self.gini(y.loc[goes_left], y.loc[~goes_left])
                if gini_score < lowest_gini:
                    lowest_gini = gini_score
                    best_mask = goes_left
                    best_reference_value = reference_value
                    best_feature_index = feature_index

        if best_mask is None:
            # No feature can separate the rows: the node will become a leaf.
            best_x_left = best_x_right = best_y_left = best_y_right = None
        else:
            best_x_left = X.loc[best_mask].reset_index(drop=True)
            best_x_right = X.loc[~best_mask].reset_index(drop=True)
            best_y_left = y.loc[best_mask].reset_index(drop=True)
            best_y_right = y.loc[~best_mask].reset_index(drop=True)

        return self.append_tree_check(best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value, best_feature_index, lowest_gini, y)

    def append_tree_check(self, best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value, best_feature_index, lowest_gini, y):
        """Collect all samples of the node, then delegate to ``append_tree``.

        When one side of the split is missing the other side is the whole
        node; when both are missing (no split was found) ``y`` is used.
        """
        left_missing = self._is_empty(best_x_left) or self._is_empty(best_y_left)
        right_missing = self._is_empty(best_x_right) or self._is_empty(best_y_right)

        if left_missing and right_missing:
            y_overall = pd.DataFrame(y)
            # Only the row count of x_overall is needed downstream.
            x_overall = pd.DataFrame(index=y_overall.index)
        elif left_missing:
            x_overall = pd.DataFrame(best_x_right)
            y_overall = pd.DataFrame(best_y_right)
        elif right_missing:
            x_overall = pd.DataFrame(best_x_left)
            y_overall = pd.DataFrame(best_y_left)
        else:
            x_overall = pd.concat([best_x_left, best_x_right])
            y_overall = pd.concat([best_y_left, best_y_right])

        total_data = len(x_overall)
        return self.append_tree(x_overall, y_overall, total_data, best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value, best_feature_index, lowest_gini, y)

    def append_tree(self, x_overall, y_overall, total_data, best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value,best_feature_index, lowest_gini, y):
        """Create either a leaf or an internal node with two fitted subtrees.

        A leaf is produced when the node is too small (``min_split``), too
        deep (``max_depth``), already pure, or has no usable split. The leaf
        predicts the most frequent label of the node, converted with ``int``.

        ``lowest_gini`` and ``y`` are accepted for backward compatibility.
        Purity is tested on the node's own labels: a split impurity of 0 means
        the split separates the classes perfectly and must be kept.
        """
        label_frame = pd.DataFrame(y_overall)
        if label_frame.empty:
            raise ValueError("Cannot build a tree node without any training labels.")
        # The label column is taken by position so its name is irrelevant.
        labels = label_frame.iloc[:, 0]

        no_split = any(
            self._is_empty(part)
            for part in (best_x_left, best_x_right, best_y_left, best_y_right)
        )
        is_leaf = (
            total_data < self.min_split
            or self.depth >= self.max_depth
            or labels.nunique() <= 1
            or no_split
        )

        if is_leaf:
            final_output = int(labels.mode().iloc[0])
            return DecisionTreeNode(left = None, right = None, reference_value = None, feature_index = None, output = final_output)

        left_node = DecisionTree(max_depth = self.max_depth, min_split = self.min_split, depth = self.depth + 1)
        right_node = DecisionTree(max_depth = self.max_depth, min_split = self.min_split, depth = self.depth + 1)
        left_node.fit(best_x_left, best_y_left)
        right_node.fit(best_x_right, best_y_right)
        return DecisionTreeNode(left = left_node, right = right_node, reference_value = best_reference_value, feature_index = best_feature_index, output = None)

    def gini(self, y_left, y_right):
        """Return the size-weighted Gini impurity of two label collections.

        An empty side contributes nothing; if both sides are empty the result
        is ``0.0``.
        """
        weighted_sum = 0.0
        total = 0
        for part in (y_left, y_right):
            labels = pd.DataFrame(part)
            count = len(labels)
            if count == 0:
                continue
            shares = labels.value_counts().to_numpy() / count
            weighted_sum += (1.0 - float(np.sum(shares ** 2))) * count
            total += count

        return weighted_sum / total if total else 0.0

    def predict(self, X):
        """Predict one class per row of ``X`` and return them as a NumPy array.

        Features are read by position, matching how the tree was trained.

        Raises
        ------
        AttributeError
            If the tree has not been fitted yet.
        """
        if self.root is None:
            raise AttributeError("This DecisionTree is not fitted yet; call fit() first.")

        predictions = []
        for row in pd.DataFrame(X).itertuples(index=False, name=None):
            current_node = self.root
            while current_node.feature_index is not None:
                if row[current_node.feature_index] >= current_node.reference_value:
                    current_node = current_node.left.root
                else:
                    # Also taken for NaN, so the walk always terminates.
                    current_node = current_node.right.root
            predictions.append(current_node.output)
        return np.array(predictions)

    def score(self, X, y):
        """Return ``(accuracy, predictions)`` for ``X`` against the labels ``y``.

        ``y`` may be a Series, a single-column DataFrame or a 1-D array-like.
        The tuple return value is kept for existing callers; it is not the
        single float that scikit-learn's scoring utilities expect.
        """
        pred = self.predict(X)
        truth = np.asarray(y).reshape(-1)
        if len(truth) != len(pred):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(pred)} and {len(truth)}."
            )
        accuracy = (pred == truth).mean()
        return accuracy, pred

In [None]:
# Step 1: Read the data, specifying the decimal separator
df = pd.read_excel("final_data492_collab.xlsx", decimal=',')

# Step 2: Separate features and target variable
X = df.drop(columns=['result_1'])
y = df['result_1'].values


# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Step 4: Store the dropped columns and drop them from the training and testing sets
dropped_columns_train = X_train[['year', 'result_2', 'interview_score']]
dropped_columns_test = X_test[['year', 'result_2',  'interview_score']]

# Re-assign instead of dropping in place: the split results are derived from X,
# and mutating them in place can raise pandas' SettingWithCopyWarning.
X_train = X_train.drop(columns=['year', 'result_2', 'interview_score'])
X_test = X_test.drop(columns=['year', 'result_2', 'interview_score'])

# Step 5: Scale the features using StandardScaler
# NOTE(review): the scaled frames are computed here but the tree below is
# fitted and evaluated on the unscaled X_train / X_test.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train))
X_test_scaled = pd.DataFrame(scaler.transform(X_test))


# Step 6: Fit the model, make predictions, and calculate the accuracy
stage_1_tree = DecisionTree(min_split = 9, max_depth = 14, depth = 0)
stage_1_tree.fit(X_train, y_train)
# predict() + accuracy_score gives the same numbers as the tree's score(), but
# does not depend on which DecisionTree definition (float vs. tuple return
# from score) was executed last in the kernel.
pred = stage_1_tree.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print("Accuracy:", accuracy)

# Generate classification report
report = classification_report(y_test, pred)

# Print the report
print("Classification Report:")
print(report)

# Step 7: Merge predictions, actual labels, and features with dropped columns
results_df = pd.DataFrame({'Predicted_Result_1': pred, 'Actual_Result_1': y_test})
X_test_df = pd.DataFrame(X_test, columns=X_test.columns)  # X_test is already a DataFrame; this just gives it its own name
results_df = pd.concat([results_df, X_test_df.reset_index(drop=True), dropped_columns_test.reset_index(drop=True)], axis=1)

# Step 8: Save to Excel
results_df.to_excel('final_data_with_predictions_stage_1.xlsx', index=False)

4
91.71000289916992
DecisionTree(depth=4, max_depth=14, min_split=9)
DecisionTree(depth=4, max_depth=14, min_split=9)
4
94.75
DecisionTree(depth=5, max_depth=14, min_split=9)
DecisionTree(depth=5, max_depth=14, min_split=9)
3
3.515
DecisionTree(depth=4, max_depth=14, min_split=9)
DecisionTree(depth=4, max_depth=14, min_split=9)
5
437.51181
DecisionTree(depth=3, max_depth=14, min_split=9)
DecisionTree(depth=3, max_depth=14, min_split=9)
3
2.865
DecisionTree(depth=6, max_depth=14, min_split=9)
DecisionTree(depth=6, max_depth=14, min_split=9)
3
2.625
DecisionTree(depth=5, max_depth=14, min_split=9)
DecisionTree(depth=5, max_depth=14, min_split=9)
4
93.292135
DecisionTree(depth=4, max_depth=14, min_split=9)
DecisionTree(depth=4, max_depth=14, min_split=9)
5
480.320915
DecisionTree(depth=3, max_depth=14, min_split=9)
DecisionTree(depth=3, max_depth=14, min_split=9)
3
2.915
DecisionTree(depth=2, max_depth=14, min_split=9)
DecisionTree(depth=2, max_depth=14, min_split=9)
3
3.0
DecisionTree(de

Stage 2

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

class DecisionTreeNode:
    """Plain record describing one node of a ``DecisionTree``.

    Attributes
    ----------
    left, right :
        Child subtrees of an internal node; ``None`` on a leaf.
    reference_value :
        Threshold the selected feature is compared against; ``None`` on a leaf.
    output :
        Class predicted by a leaf; ``None`` on an internal node.
    feature_index :
        Positional index of the feature used for the split; ``None`` on a leaf.
    """

    def __init__(self, left, right, reference_value, output, feature_index):
        # Children first, then the split description, then the leaf payload.
        self.left, self.right = left, right
        self.reference_value, self.feature_index = reference_value, feature_index
        self.output = output


class DecisionTree(BaseEstimator, ClassifierMixin):
    """Binary classification tree grown greedily on weighted Gini impurity.

    Every ``DecisionTree`` owns exactly one ``DecisionTreeNode`` in ``root``.
    An internal node stores the split (``feature_index``, ``reference_value``)
    plus two child ``DecisionTree`` objects: rows whose feature value is
    ``>= reference_value`` go to ``left``, every other row (missing values
    included) goes to ``right``. A leaf stores the majority class in ``output``.

    Parameters
    ----------
    max_depth : int
        A node whose ``depth`` has reached this value becomes a leaf.
    min_split : int
        A node holding fewer samples than this becomes a leaf.
    depth : int, default 0
        Depth of this (sub)tree; children are created with ``depth + 1``.
    """

    def __init__(self, max_depth, min_split, depth = 0):
        self.max_depth = max_depth
        self.min_split = min_split
        self.depth = depth
        # Filled in by fit(); stays None until the tree has been trained.
        self.root = None

    @staticmethod
    def _is_empty(part):
        """Return True when ``part`` is ``None`` or an empty pandas object."""
        return part is None or part.empty

    def fit(self, X, y):
        """Grow the tree on ``X`` / ``y`` and return ``self``.

        Parameters
        ----------
        X : DataFrame or 2-D array-like
            Numeric feature matrix; columns are addressed by position.
        y : Series, single-column DataFrame or 1-D array-like
            Integer class labels, one per row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length or contain no rows.
        """
        # Positional indexes keep the boolean row masks valid regardless of the
        # index the caller's objects carry (e.g. after a train/test split).
        X = pd.DataFrame(X).reset_index(drop=True)
        y = pd.DataFrame(y).reset_index(drop=True)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(X)} and {len(y)}."
            )
        if len(X) == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty data set.")

        self.X = X
        self.y = y
        self.root = self.find_lowest_gini(X, y)
        # Returning self follows the scikit-learn estimator convention.
        return self

    def find_lowest_gini(self, X, y):
        """Search every feature for the split with the lowest weighted Gini.

        Candidate thresholds are the midpoints between consecutive distinct
        values of a feature, so a feature with a single distinct value offers
        no candidate and a two-valued feature offers exactly one. Ties keep
        the first split found. Returns the ``DecisionTreeNode`` for this node.
        """
        X = pd.DataFrame(X)
        y = pd.DataFrame(y)

        lowest_gini = 1
        best_mask = None
        best_reference_value = None
        best_feature_index = None

        for feature_index in range(X.shape[1]):
            column = X.iloc[:, feature_index]
            # np.unique returns the distinct values already sorted.
            distinct = np.unique(column.dropna().to_numpy())
            if len(distinct) < 2:
                continue

            for reference_value in (distinct[:-1] + distinct[1:]) / 2:
                goes_left = (column >= reference_value).to_numpy(dtype=bool, na_value=False)
                # Floating-point rounding can make a midpoint collapse onto a
                # neighbour; a split with an empty side is useless.
                if goes_left.all() or not goes_left.any():
                    continue

                gini_score = self.gini(y.loc[goes_left], y.loc[~goes_left])
                if gini_score < lowest_gini:
                    lowest_gini = gini_score
                    best_mask = goes_left
                    best_reference_value = reference_value
                    best_feature_index = feature_index

        if best_mask is None:
            # No feature can separate the rows: the node will become a leaf.
            best_x_left = best_x_right = best_y_left = best_y_right = None
        else:
            best_x_left = X.loc[best_mask].reset_index(drop=True)
            best_x_right = X.loc[~best_mask].reset_index(drop=True)
            best_y_left = y.loc[best_mask].reset_index(drop=True)
            best_y_right = y.loc[~best_mask].reset_index(drop=True)

        return self.append_tree_check(best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value, best_feature_index, lowest_gini, y)

    def append_tree_check(self, best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value, best_feature_index, lowest_gini, y):
        """Collect all samples of the node, then delegate to ``append_tree``.

        When one side of the split is missing the other side is the whole
        node; when both are missing (no split was found) ``y`` is used.
        """
        left_missing = self._is_empty(best_x_left) or self._is_empty(best_y_left)
        right_missing = self._is_empty(best_x_right) or self._is_empty(best_y_right)

        if left_missing and right_missing:
            y_overall = pd.DataFrame(y)
            # Only the row count of x_overall is needed downstream.
            x_overall = pd.DataFrame(index=y_overall.index)
        elif left_missing:
            x_overall = pd.DataFrame(best_x_right)
            y_overall = pd.DataFrame(best_y_right)
        elif right_missing:
            x_overall = pd.DataFrame(best_x_left)
            y_overall = pd.DataFrame(best_y_left)
        else:
            x_overall = pd.concat([best_x_left, best_x_right])
            y_overall = pd.concat([best_y_left, best_y_right])

        total_data = len(x_overall)
        return self.append_tree(x_overall, y_overall, total_data, best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value, best_feature_index, lowest_gini, y)

    def append_tree(self, x_overall, y_overall, total_data, best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value,best_feature_index, lowest_gini, y):
        """Create either a leaf or an internal node with two fitted subtrees.

        A leaf is produced when the node is too small (``min_split``), too
        deep (``max_depth``), already pure, or has no usable split (one side
        would be empty). The leaf predicts the most frequent label of the
        node, converted with ``int``.

        ``lowest_gini`` and ``y`` are accepted for backward compatibility.
        Purity is tested on the node's own labels: a split impurity of 0 means
        the split separates the classes perfectly and must be kept.
        """
        label_frame = pd.DataFrame(y_overall)
        if label_frame.empty:
            raise ValueError("Cannot build a tree node without any training labels.")
        # The label column is taken by position so its name is irrelevant.
        labels = label_frame.iloc[:, 0]

        # Guarding against an empty side prevents fitting a child on no data.
        no_split = any(
            self._is_empty(part)
            for part in (best_x_left, best_x_right, best_y_left, best_y_right)
        )
        is_leaf = (
            total_data < self.min_split
            or self.depth >= self.max_depth
            or labels.nunique() <= 1
            or no_split
        )

        if is_leaf:
            final_output = int(labels.mode().iloc[0])
            return DecisionTreeNode(left = None, right = None, reference_value = None, feature_index = None, output = final_output)

        left_node = DecisionTree(max_depth = self.max_depth, min_split = self.min_split, depth = self.depth + 1)
        right_node = DecisionTree(max_depth = self.max_depth, min_split = self.min_split, depth = self.depth + 1)
        left_node.fit(best_x_left, best_y_left)
        right_node.fit(best_x_right, best_y_right)
        return DecisionTreeNode(left = left_node, right = right_node, reference_value = best_reference_value, feature_index = best_feature_index, output = None)

    def gini(self, y_left, y_right):
        """Return the size-weighted Gini impurity of two label collections.

        An empty side contributes nothing; if both sides are empty the result
        is ``0.0``.
        """
        weighted_sum = 0.0
        total = 0
        for part in (y_left, y_right):
            labels = pd.DataFrame(part)
            count = len(labels)
            if count == 0:
                continue
            shares = labels.value_counts().to_numpy() / count
            weighted_sum += (1.0 - float(np.sum(shares ** 2))) * count
            total += count

        return weighted_sum / total if total else 0.0

    def predict(self, X):
        """Predict one class per row of ``X`` and return them as a NumPy array.

        Features are read by position, matching how the tree was trained.

        Raises
        ------
        AttributeError
            If the tree has not been fitted yet.
        """
        if self.root is None:
            raise AttributeError("This DecisionTree is not fitted yet; call fit() first.")

        predictions = []
        for row in pd.DataFrame(X).itertuples(index=False, name=None):
            current_node = self.root
            while current_node.feature_index is not None:
                if row[current_node.feature_index] >= current_node.reference_value:
                    current_node = current_node.left.root
                else:
                    # Also taken for NaN, so the walk always terminates.
                    current_node = current_node.right.root
            predictions.append(current_node.output)
        return np.array(predictions)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``.

        For a DataFrame the column labelled ``0`` is used when present,
        otherwise the first column. A Series or 1-D array-like is used as is.
        """
        pred = self.predict(X)

        if isinstance(y, pd.DataFrame):
            truth = y[0] if 0 in y.columns else y.iloc[:, 0]
        else:
            truth = y
        truth = np.asarray(truth).reshape(-1)
        if len(truth) != len(pred):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(pred)} and {len(truth)}."
            )

        accuracy = (pred == truth).mean()
        return accuracy

In [None]:
# Load the records (the workbook uses ',' as the decimal mark)
df = pd.read_excel("final_data492_collab.xlsx", decimal=',')

# Keep the rows with result_1 == 2, discard incomplete rows and remove the
# columns that are not used as stage-2 inputs
df_interview = (
    df[df["result_1"] == 2]
    .dropna()
    .drop(columns=['result_1', 'year'])
)

# Features and label as freshly indexed frames with positional column names
X = pd.DataFrame(df_interview.drop(columns=['result_2']).values)
y = pd.DataFrame(df_interview['result_2'].values)

# Standardise the features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X))

# 5-fold cross-validation of the stage-2 tree
stage_2_tree = DecisionTree(max_depth=14, min_split=9, depth=0)
cv_scores = cross_val_score(stage_2_tree, X_scaled, y, cv=5)

print(cv_scores)
print("Mean CV Score:", cv_scores.mean())

[0.9375     0.9375     0.77083333 0.91666667 0.8125    ]
Mean CV Score: 0.875


In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


class DecisionTreeNode:
    """Plain record describing one node of a ``DecisionTree``.

    Attributes
    ----------
    left, right :
        Child subtrees of an internal node; ``None`` on a leaf.
    reference_value :
        Threshold the selected feature is compared against; ``None`` on a leaf.
    output :
        Class predicted by a leaf; ``None`` on an internal node.
    feature_index :
        Positional index of the feature used for the split; ``None`` on a leaf.
    """

    def __init__(self, left, right, reference_value, output, feature_index):
        # Children first, then the split description, then the leaf payload.
        self.left, self.right = left, right
        self.reference_value, self.feature_index = reference_value, feature_index
        self.output = output


class DecisionTree(BaseEstimator, ClassifierMixin):
    """Binary decision-tree classifier that chooses splits by weighted Gini impurity.

    The tree is stored recursively: ``root`` is a ``DecisionTreeNode``. On an
    internal node, ``left`` and ``right`` are themselves ``DecisionTree``
    instances (created with ``depth + 1``) whose own ``root`` is the child
    node. Rows with ``feature >= reference_value`` go left, all others go right.
    Features are always addressed by column position.

    Parameters
    ----------
    max_depth : int
        A (sub)tree whose ``depth`` has reached this value becomes a leaf.
    min_split : int
        A node holding fewer rows than this becomes a leaf.
    depth : int, default 0
        Depth of this (sub)tree.
    """

    def __init__(self, max_depth, min_split, depth = 0):
        self.max_depth = max_depth
        self.min_split = min_split
        self.depth = depth
        self.root = None

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _label_column(y):
        """Return the label column of ``y`` as a Series.

        The column labelled ``0`` is used when present (this is what
        ``pd.DataFrame(array)`` produces); otherwise the first column is used.
        """
        frame = pd.DataFrame(y)
        return frame[0] if 0 in frame.columns else frame.iloc[:, 0]

    @staticmethod
    def _impurity(labels):
        """Gini impurity ``1 - sum(p_k ** 2)`` of one group; 0.0 for an empty group."""
        n_rows = len(labels)
        if n_rows == 0:
            return 0.0
        proportions = labels.value_counts() / n_rows
        return 1 - np.sum(proportions ** 2)

    def _should_stop(self, n_rows, labels):
        """True when a node with ``n_rows`` rows and these labels must be a leaf."""
        return (
            n_rows < self.min_split
            or self.depth >= self.max_depth
            # A pure node cannot be improved by any split.
            or labels.nunique() <= 1
        )

    def _make_leaf(self, labels):
        """Build a leaf node predicting the most frequent label (cast to int)."""
        final_output = int(labels.mode().iloc[0])
        return DecisionTreeNode(left = None, right = None, reference_value = None, feature_index = None, output = final_output)

    # --------------------------------------------------------------- public API

    def fit(self, X, y):
        """Grow the tree on ``X`` (rows x features) and labels ``y``.

        ``X`` and ``y`` may be DataFrames or array-likes; they are matched row by
        row in positional order. The normalised frames are kept on ``self.X`` and
        ``self.y``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have different numbers of rows, or are empty.
        """
        X = pd.DataFrame(X).reset_index(drop=True)
        y = pd.DataFrame(y).reset_index(drop=True)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(X)} and {len(y)}."
            )

        self.X = X
        self.y = y
        self.root = self.find_lowest_gini(X, y)
        return self

    def find_lowest_gini(self, X, y):
        """Find the best split of ``(X, y)`` and return the resulting node.

        Every midpoint between two neighbouring distinct values of every feature
        is tried as a threshold; the candidate with the lowest weighted Gini
        impurity wins (the first one found wins ties). A leaf is returned when the
        stopping rules apply or when no feature can separate the rows.

        Raises
        ------
        ValueError
            If there are no rows to build a node from.
        """
        X = pd.DataFrame(X)
        y = pd.DataFrame(y)
        if len(X) == 0 or len(y) == 0:
            raise ValueError("Cannot build a tree node from an empty data set.")

        labels = self._label_column(y)
        # Terminal nodes do not need a split search at all.
        if self._should_stop(len(X), labels):
            return self._make_leaf(labels)

        lowest_gini = 1
        best_goes_left = None
        best_goes_right = None
        best_reference_value = None
        best_feature_index = None

        n_features = X.shape[1]

        for feature_index in range(n_features):
            feature_values = X.iloc[:, feature_index].to_numpy()
            distinct_values = sorted(pd.unique(feature_values))

            # A constant feature yields no pair and is skipped. A two-valued
            # feature yields exactly one midpoint, so it can actually split.
            for lower, upper in zip(distinct_values, distinct_values[1:]):
                reference_value = (lower + upper) / 2

                goes_left = feature_values >= reference_value
                goes_right = feature_values < reference_value

                # Ignore thresholds that leave one side empty (possible when the
                # midpoint rounds onto one of the two values).
                if not goes_left.any() or not goes_right.any():
                    continue

                # Only the labels are needed to rate a candidate; X is sliced
                # once, for the winner, after the search.
                gini_score = self.gini(y[goes_left], y[goes_right])

                if gini_score < lowest_gini:
                    lowest_gini = gini_score
                    best_goes_left = goes_left
                    best_goes_right = goes_right
                    best_reference_value = reference_value
                    best_feature_index = feature_index

        if best_feature_index is None:
            # Every feature is constant on this node: nothing can be separated.
            return self._make_leaf(labels)

        best_x_left = X[best_goes_left].reset_index(drop=True)
        best_x_right = X[best_goes_right].reset_index(drop=True)
        best_y_left = y[best_goes_left].reset_index(drop=True)
        best_y_right = y[best_goes_right].reset_index(drop=True)

        return self.append_tree_check(best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value, best_feature_index, lowest_gini, y)

    def append_tree_check(self, best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value, best_feature_index, lowest_gini, y):
        """Reassemble the node's full data from both sides and delegate to ``append_tree``.

        When one side is empty the other side alone is the node's data;
        otherwise the two sides are concatenated.
        """
        if best_x_left.empty or best_y_left.empty:
            x_overall = pd.DataFrame(best_x_right)
            y_overall = pd.DataFrame(best_y_right)
        elif best_x_right.empty or best_y_right.empty:
            x_overall = pd.DataFrame(best_x_left)
            y_overall = pd.DataFrame(best_y_left)
        else:
            x_overall = pd.concat([best_x_left, best_x_right])
            y_overall = pd.concat([best_y_left, best_y_right])

        total_data = len(x_overall)

        return self.append_tree(x_overall, y_overall, total_data, best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value, best_feature_index, lowest_gini, y)

    def append_tree(self, x_overall, y_overall, total_data, best_x_left, best_x_right, best_y_left, best_y_right, best_reference_value,best_feature_index, lowest_gini, y):
        """Create either a leaf or an internal node with two fitted child trees.

        A leaf is produced when the node has fewer than ``min_split`` rows, the
        depth limit is reached, the node's labels are already pure, or the split
        leaves one side empty. ``lowest_gini`` and ``y`` are accepted for
        signature compatibility but are not used here.
        """
        labels = self._label_column(y_overall)

        # Stop on the purity of the node itself, not on ``lowest_gini == 0``:
        # a split with weighted Gini 0 separates the classes perfectly and is
        # exactly the split that has to be kept.
        one_sided = pd.DataFrame(best_x_left).empty or pd.DataFrame(best_x_right).empty
        if one_sided or self._should_stop(total_data, labels):
            return self._make_leaf(labels)

        left_node = DecisionTree(max_depth = self.max_depth, min_split = self.min_split, depth = self.depth + 1)
        right_node = DecisionTree(max_depth = self.max_depth, min_split = self.min_split, depth = self.depth + 1)
        left_node.fit(best_x_left, best_y_left)
        right_node.fit(best_x_right, best_y_right)
        return DecisionTreeNode(left = left_node, right = right_node, reference_value = best_reference_value, feature_index = best_feature_index, output = None)

    def gini(self, y_left, y_right):
        """Size-weighted average of the Gini impurities of the two label groups.

        An empty side contributes nothing; if both sides are empty 0.0 is
        returned instead of dividing by zero.
        """
        y_left = pd.DataFrame(y_left)
        y_right = pd.DataFrame(y_right)

        n_students_left = len(y_left)
        n_students_right = len(y_right)
        n_students = n_students_left + n_students_right
        if n_students == 0:
            return 0.0

        gini_left = self._impurity(y_left)
        gini_right = self._impurity(y_right)

        return (gini_left * n_students_left + gini_right * n_students_right) / n_students

    def predict(self, X):
        """Predict one label per row of ``X`` (DataFrame or array-like).

        Each row is routed from the root: left when the tested feature is
        ``>= reference_value``, right otherwise. A value that compares false
        against the threshold (e.g. NaN) therefore goes right instead of
        leaving the walk stuck on the same node.

        Returns
        -------
        numpy.ndarray of the leaf outputs, in row order.
        """
        # Plain positional rows: training selects features with ``iloc``, so
        # prediction has to address them by position as well.
        rows = pd.DataFrame(X).to_numpy()

        predictions = []
        for row in rows:
            current_node = self.root
            while current_node.feature_index is not None:
                if row[current_node.feature_index] >= current_node.reference_value:
                    current_node = current_node.left.root
                else:
                    current_node = current_node.right.root

            predictions.append(current_node.output)
        return np.array(predictions)

    def score(self, X, y):
        """Return ``(accuracy, pred)`` for the rows of ``X`` against labels ``y``.

        ``y`` may be a one-column DataFrame, a Series or a 1-D array-like; the
        comparison is positional.

        NOTE: this returns a tuple, not the single number that scikit-learn's
        scoring helpers expect from ``score`` — presumably it cannot be used with
        ``cross_val_score`` as is; confirm before doing so.

        Raises
        ------
        ValueError
            If the number of predictions differs from the number of labels.
        """
        pred = self.predict(X)
        y_true = self._label_column(y).to_numpy()

        if len(pred) != len(y_true):
            raise ValueError(
                f"Number of predictions ({len(pred)}) does not match number of labels ({len(y_true)})."
            )

        accuracy = (pred == y_true).mean()

        return accuracy, pred

In [None]:
# Read the source spreadsheet (decimal=',' is forwarded to the reader).
df = pd.read_excel("final_data492_collab.xlsx", decimal=',')

# Keep the rows where result_1 == 2, discard rows with missing values and
# remove the two columns that are not used below.
df_interview = (
    df[df["result_1"] == 2]
    .dropna()
    .drop(columns=['result_1', 'year'])
)

# Going through .values discards the original column labels, so both frames
# end up with positional (0, 1, 2, ...) column names.
X = pd.DataFrame(df_interview.drop(columns=['result_2']).values)
y = pd.DataFrame(df_interview['result_2'].values)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable. Each part is wrapped in a DataFrame as before.
X_train, X_test, y_train, y_test = (
    pd.DataFrame(part)
    for part in train_test_split(X, y, test_size=0.2, random_state=42)
)

stage_2_tree = DecisionTree(max_depth = 14, min_split = 9, depth = 0)
stage_2_tree.fit(X_train, y_train)

# score() of this DecisionTree hands back the accuracy together with the predictions.
accuracy, pred = stage_2_tree.score(X_test, y_test)

# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_test, pred)

print("Classification Report:")
print(report)
print(accuracy)

# Put predictions, true labels and the test features side by side. The first
# frame takes its index from y_test[0], which X_test shares, so the column-wise
# concat lines the rows up.
X_test_df = pd.DataFrame(X_test, columns=X_test.columns)
results_df = pd.concat(
    [
        pd.DataFrame({'Predicted_Result_2': pred, 'Actual_Result_2': y_test[0]}),
        X_test_df,
    ],
    axis=1,
)

# Write the combined table to Excel without the index column.
results_df.to_excel('final_data_with_predictions_stage_2.xlsx', index=False)

Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.90      0.88        21
         1.0       0.92      0.89      0.91        27

    accuracy                           0.90        48
   macro avg       0.89      0.90      0.89        48
weighted avg       0.90      0.90      0.90        48

0.8958333333333334
