veri görselleştirme https://batuhandaz.medium.com/decision-tree-algoritmas%C4%B1-karar-a%C4%9Fac%C4%B1-machine-learning-78d856b1f457
https://ece-akdagli.medium.com/makine-%C3%B6%C4%9Frenmesinde-decision-tree-42a86502ee75
https://erdincuzun.com/makine_ogrenmesi/decision-tree-karar-agaci-id3-algoritmasi-classification-siniflama/

### Import Libraries and Dataset

In [None]:
import warnings
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Display settings: silence all warnings and raise the pandas display limits.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

# target_url = ("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data")
target_url = ("datasets/abalone.data")
headers = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']

# header=None + names=headers keeps the first record as data. Without it,
# read_csv treats the first line as column names and that record is lost once
# the columns are renamed.
# NOTE(review): assumes the local copy matches the UCI file, which has no
# header row -- confirm before relying on the row count.
abalone_df = pd.read_csv(target_url, header=None, names=headers)

### Analyze the Data

In [None]:
abalone_df.head()

In [None]:
abalone_df.info()

**Get target value**

In [None]:
# If you want the target values to be categorical rather than numeric, this process should be applied.

# for ix in abalone_df.index:
#     row = abalone_df.loc[ix]
#     if row["Rings"] <= 8:
#         abalone_df.loc[ix, 'Rings'] = 'Young'
#     elif row["Rings"] >= 11:
#         abalone_df.loc[ix, 'Rings'] = 'Old'
#     elif 9 <= row["Rings"] <= 10:
#         abalone_df.loc[ix, 'Rings'] = 'Medium'
        
# X = abalone_df.drop(columns="Rings")
# y = abalone_df["Rings"]
# X['Sex'] = abalone_df['Sex'].replace({'F':0,'M':1,'I':2})

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
def is_numeric_value(x):
    """Return True when ``x`` is an integer or floating-point number.

    Accepts Python ``int``/``float`` (including subclasses) and NumPy integer
    and floating scalars, so values taken straight from an array are treated
    the same as values that went through ``tolist()``.

    ``bool`` is deliberately rejected even though it subclasses ``int``: a
    True/False cell is categorical and should be compared with ``==``.
    """
    if isinstance(x, bool):
        return False
    return isinstance(x, (int, float, np.integer, np.floating))

In [None]:
# Turn the DataFrame into plain Python rows and split them by position:
# the first 30% of the rows become `test`, the remainder becomes `train`.
abalone_list = abalone_df.values.tolist()
split_index = int(len(abalone_list) * 0.3)

test, train = abalone_list[:split_index], abalone_list[split_index:]

In [None]:
class MyQuestioner:
    """A yes/no question that tests one column of a row against a value.

    Attributes:
        column: Index of the column the question looks at.
        value: Value the column is tested against.

    Numeric question values are tested with ``>=``; everything else is tested
    with ``==``.
    """

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def __repr__(self):
        # The operator shown depends on the type of the question value.
        status = ">=" if is_numeric_value(self.value) else "=="
        return f"Is {headers[self.column]} {status} {self.value}"

    def compare(self, compared):
        """Return whether the row ``compared`` answers this question with yes.

        ``>=`` is used only when both the row's cell and the question value
        are numeric. Any other combination falls back to ``==``, so a numeric
        cell tested against a string value yields False instead of raising
        TypeError.
        """
        val = compared[self.column]
        if is_numeric_value(val) and is_numeric_value(self.value):
            return val >= self.value
        return val == self.value

In [None]:
# Print a few questions to see how __repr__ renders them: the operator shown
# is ">=" when the question value is an int/float and "==" otherwise.
print(MyQuestioner(0, "F"))
print(MyQuestioner(0, 19))

print(MyQuestioner(2, 19))
print(MyQuestioner(2, "19"))

In [None]:
# Ask two questions about the first training row, both on column 0.
compared = train[0]
q = MyQuestioner(0, "M")
print(q.compare(compared))

# Same column, this time with an int as the question value.
q = MyQuestioner(0, 12)
print(q.compare(compared))

In [None]:
def partitioner(rows, myQuestion):
    """Split ``rows`` into those that satisfy ``myQuestion`` and those that do not.

    Args:
        rows: Iterable of rows.
        myQuestion: Object exposing ``compare(row)``; its result is used as a
            truth value.

    Returns:
        A ``(true_rows, false_rows)`` tuple of lists, each in input order.
    """
    buckets = {True: [], False: []}
    for row in rows:
        buckets[bool(myQuestion.compare(row))].append(row)
    return buckets[True], buckets[False]

In [None]:
# Partition the training rows with a question on column 0 and value 'M',
# then display the rows for which the question answered True.
true_rows, false_rows = partitioner(train, MyQuestioner(0, 'M'))
true_rows

In [None]:
false_rows

In [None]:
def class_counts(rows):
    """Count how many rows carry each label.

    The label of a row is its last element.

    Returns:
        A dict mapping label -> number of rows with that label, with keys in
        order of first appearance.
    """
    tally = {}
    for row in rows:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    return tally

In [None]:
# Label counts over the training rows.
counts = class_counts(train)
print(counts)

In [None]:
def gini_impurity(rows):
    """Return the Gini impurity of ``rows``: 1 minus the sum of squared label shares.

    Entropy is an alternative measure; Gini impurity is the one used here.
    Background:
    https://medium.com/machine-learning-t%C3%BCrkiye/karar-agaclari-algoritmasi-b823c23997d0#:~:text=Gini%20impurity%20nedir,%C3%B6l%C3%A7mek%20i%C3%A7in%20kullan%C4%B1labilir.

    When ``rows`` yields no labels the loop does not run and 1 is returned.
    """
    label_counts = class_counts(rows)
    n_rows = float(len(rows))

    impurity = 1
    for count in label_counts.values():
        share = count / n_rows
        impurity -= share ** 2
    return impurity

In [None]:
# Gini impurity of the whole training set; used below as the starting uncertainty.
current_uncertainty = gini_impurity(train)
current_uncertainty

In [None]:
def information_gain(left, right, current_uncertainty):
    """Return the impurity reduction obtained by splitting into ``left`` and ``right``.

    The gain is the parent's uncertainty minus the Gini impurity of each
    child, each weighted by the child's share of the rows.

    Args:
        left: Rows of one child.
        right: Rows of the other child.
        current_uncertainty: Impurity of the node before the split.
    """
    left_weight = float(len(left)) / (len(left) + len(right))
    right_weight = 1 - left_weight
    return current_uncertainty - right_weight * gini_impurity(right) - left_weight * gini_impurity(left)

In [None]:
# How much information do we gain by partioning on 'F (Female)' ?
true_rows, false_rows = partitioner(train, MyQuestioner(0, "F"))
information_gain(true_rows, false_rows, current_uncertainty)

In [None]:
def find_the_best_split(rows):
    """Find the question with the highest information gain for ``rows``.

    Every column except the last (the label) is tried with every distinct
    value that column takes in ``rows``.

    Args:
        rows: Sequence of rows whose last element is the label.

    Returns:
        A ``(best_gain, best_question)`` tuple. ``(0, None)`` is returned when
        ``rows`` is empty or when no candidate question both divides the rows
        and yields a positive gain.
    """
    best_gain = 0  # best information gain seen so far
    best_question = None  # the question that produced it

    # Nothing to split; also avoids an IndexError on rows[0] below.
    # len() rather than truthiness so array-like inputs work too.
    if len(rows) == 0:
        return best_gain, best_question

    n_features = len(rows[0]) - 1  # every column except the label
    current_uncertainty = gini_impurity(rows)

    for col in range(n_features):
        # Each distinct value in the column is a candidate threshold/category.
        for val in {row[col] for row in rows}:
            question = MyQuestioner(col, val)
            true_rows, false_rows = partitioner(rows, question)

            # A question that leaves one side empty does not divide the data.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            gain = information_gain(true_rows, false_rows, current_uncertainty)

            # '>=' would also work; '>' keeps the first of several equally
            # good questions, which determines how the printed tree looks.
            if gain > best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [None]:
# Best first question for the full training set.
best_gain, best_question = find_the_best_split(train)
best_question

In [None]:
class Leaf:
    """Terminal node of the tree.

    ``predictions`` maps each label to the number of training rows with that
    label that reached this leaf.
    """

    def __init__(self, rows):
        label_tally = class_counts(rows)
        self.predictions = label_tally

In [None]:
class Decision_Node:
    """Internal node of the tree.

    Stores the question asked at this node and the two subtrees: the one
    followed when the question is answered True and the one followed when it
    is answered False.
    """

    def __init__(self, question, true_branch, false_branch):
        self.false_branch = false_branch
        self.true_branch = true_branch
        self.question = question

In [None]:
def build_my_tree(rows, max_depth, attribute_types, curr_depth=0):
    """Recursively build a decision tree from ``rows``.

    Args:
        rows: Training rows that reached this node; the last element of each
            row is the label.
        max_depth: Deepest level at which a node may still be split. Nodes at
            depths 0..max_depth are split when a useful question exists, so
            leaves can sit as deep as ``max_depth + 1``.
        attribute_types: Passed on unchanged to the recursive calls; not
            otherwise used here.
        curr_depth: Depth of the node being built (0 for the root).

    Returns:
        A ``Leaf`` when the depth limit is exceeded or no question yields any
        information gain; otherwise a ``Decision_Node`` holding the best
        question and the two recursively built subtrees.
    """
    # Past the depth limit: stop asking questions.
    if curr_depth > max_depth:
        return Leaf(rows)

    # Pick the question with the highest information gain for these rows.
    gain, question = find_the_best_split(rows)

    # Base case: no question improves on the current node.
    if gain == 0:
        return Leaf(rows)

    matched, unmatched = partitioner(rows, question)
    next_depth = curr_depth + 1

    # The True subtree is built first, then the False subtree.
    return Decision_Node(
        question,
        build_my_tree(matched, max_depth, attribute_types, next_depth),
        build_my_tree(unmatched, max_depth, attribute_types, next_depth),
    )

In [None]:
def print_tree(node, spacing=""):
    """Print the subtree rooted at ``node``, indenting each level by two spaces.

    A leaf is printed as ``Predict`` followed by its label counts. A decision
    node is printed as its question, followed by its True subtree and then
    its False subtree.
    """
    if isinstance(node, Leaf):
        print(spacing + "Predict", node.predictions)
        return

    print(spacing + str(node.question))

    child_spacing = spacing + "  "
    branches = (
        ('--> True:', node.true_branch),
        ('--> False:', node.false_branch),
    )
    for heading, child in branches:
        print(spacing + heading)
        print_tree(child, child_spacing)

In [None]:
def build_dt(X, y, attribute_types, options):
    """Build a decision tree from a feature matrix and its labels.

    Args:
        X: 2-D array-like of features, one row per sample.
        y: Labels, either 1-D (one per sample) or 2-D with one row per
            sample. A 1-D input is reshaped to a single column so it can be
            appended to ``X``.
        attribute_types: Forwarded unchanged to ``build_my_tree``.
        options: Dict that must contain ``"max_depth"``.

    Returns:
        The root node returned by ``build_my_tree``.
    """
    labels = np.asarray(y)
    if labels.ndim == 1:
        # np.concatenate along axis=1 needs a column, not a flat vector.
        labels = labels.reshape(-1, 1)

    # Append the labels as the last column(s) and convert to plain lists,
    # the row format the tree-building functions work on.
    rows = np.concatenate((X, labels), axis=1).tolist()
    print(options["max_depth"])
    return build_my_tree(rows, options["max_depth"], attribute_types)

In [0]:
class_counts(train)

In [None]:
def classify(row, node):
    """Route ``row`` down the tree from ``node`` and return the leaf's predictions.

    At each decision node the stored question is applied to the row: a True
    answer follows the true branch, a False answer the false branch. The
    label-count dict of the leaf that is reached is returned.
    """
    current = node
    while not isinstance(current, Leaf):
        if current.question.compare(row):
            current = current.true_branch
        else:
            current = current.false_branch
    return current.predictions

In [None]:
def print_leaf(counts):
    """Convert a leaf's label counts into percentage strings.

    Each count is divided by the total, multiplied by 100 and truncated to an
    int, so the percentages may add up to less than 100.

    Returns:
        A dict mapping label -> string such as ``"33%"``.
    """
    total = float(sum(counts.values()))
    return {label: f"{int(count / total * 100)}%" for label, count in counts.items()}

In [None]:
# Tree settings and the dtype name of every DataFrame column.
options = {"max_depth": 2}
attribute_types = [str(dtype) for dtype in abalone_df.dtypes]

# Features are all columns but the last; the last column is the target,
# reshaped to a single column (n, 1).
X = abalone_df.iloc[:, :-1].values
y = abalone_df.iloc[:, -1].values.reshape(-1, 1)

# Hold out 20% of the rows, with a fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=41)

# Fit the tree on the training split and print its structure.
my_tree = build_dt(X_train, y_train, attribute_types, options)
print_tree(my_tree)

In [None]:
# Evaluate on the held-out split from train_test_split. The tree was fitted on
# X_train/y_train, so rows from the earlier positional `test` slice may also
# have been used for training.
held_out_rows = np.concatenate((X_test, y_test), axis=1).tolist()
for row in held_out_rows:
    print(f"Actual: {row[-1]}. Predicted: {print_leaf(classify(row, my_tree))}")