In [1]:
import re
import numpy as np
from collections import Counter

class TreeNode:
    """One node of a binary decision tree that splits on words.

    Attributes:
        word: Word tested at this node; ``None`` by default.
        left: Child node stored as the left subtree; ``None`` by default.
        right: Child node stored as the right subtree; ``None`` by default.
        label: Class label carried by the node; ``None`` by default.
    """

    def __init__(self, word=None, left=None, right=None, label=None):
        # Split description: the tested word and the two subtrees.
        self.word, self.left, self.right = word, left, right
        # Prediction payload.
        self.label = label

class DecisionTreeTextClassifier:
    """Binary decision tree that classifies texts by word presence.

    Every internal node tests whether a single vocabulary word occurs in a
    text: texts containing the word go to the left subtree, all others go to
    the right. Splits are chosen greedily by minimum weighted Gini impurity.

    Args:
        min_samples: A node holding fewer samples than this becomes a leaf.
        max_depth: A node at this depth (the root is depth 0) becomes a leaf.
    """

    # Compiled once instead of on every call. It is applied to lower-cased
    # text, so only runs of ASCII letters a-z are recognised as tokens.
    _TOKEN_PATTERN = re.compile(r'\b[a-z]+\b')

    def __init__(self, min_samples=1, max_depth=10):
        self.min_samples = min_samples
        self.max_depth = max_depth
        self.vocab = set()  # Words seen by build_vocab()
        self.root = None  # Root TreeNode, set by fit()

    def preprocess(self, text):
        """Lower-case ``text`` and return its list of a-z word tokens."""
        return self._TOKEN_PATTERN.findall(text.lower())

    def vectorize(self, text):
        """Return a ``{word: count}`` dict covering every vocabulary word.

        Words of the vocabulary that do not occur in ``text`` map to 0;
        tokens outside the vocabulary are ignored.
        """
        # Count once rather than calling list.count() per vocabulary word.
        counts = Counter(self.preprocess(text))
        return {word: counts[word] for word in self.vocab}

    def build_vocab(self, texts):
        """Add the tokens of every text in ``texts`` to ``self.vocab``."""
        for text in texts:
            self.vocab.update(self.preprocess(text))

    def gini_impurity(self, labels):
        """Return the Gini impurity of ``labels`` (0 for an empty collection)."""
        total = len(labels)
        if total == 0:
            return 0
        probs = [count / total for count in Counter(labels).values()]
        return 1 - sum(p**2 for p in probs)

    def best_split(self, texts, labels):
        """Find the vocabulary word whose presence best separates the labels.

        Args:
            texts: One container of tokens per sample; membership is tested
                with ``word in text``.
            labels: Class labels aligned with ``texts``.

        Returns:
            ``(word, (left_texts, left_labels, right_texts, right_labels))``
            for the split with the lowest weighted Gini impurity, where the
            left group holds the samples containing ``word``. Returns
            ``(None, None)`` when no word separates the samples into two
            non-empty groups (including when the vocabulary is empty).
        """
        best_word, best_score, best_groups = None, float('inf'), None
        total = len(labels)

        # Iterate in sorted order: set iteration order for strings varies
        # between interpreter runs, which made tie-breaking (and therefore
        # the fitted tree and its predictions) non-deterministic.
        for word in sorted(self.vocab):
            left_texts, left_labels, right_texts, right_labels = [], [], [], []

            for text, label in zip(texts, labels):
                if word in text:
                    left_texts.append(text)
                    left_labels.append(label)
                else:
                    right_texts.append(text)
                    right_labels.append(label)

            # A word present in all or none of the samples does not split
            # anything; letting it compete could win a tie against a real
            # split and end tree growth early.
            if not left_labels or not right_labels:
                continue

            gini = (len(left_labels) * self.gini_impurity(left_labels) +
                    len(right_labels) * self.gini_impurity(right_labels)) / total

            if gini < best_score:
                best_word, best_score = word, gini
                best_groups = (left_texts, left_labels, right_texts, right_labels)

        return best_word, best_groups

    def _make_leaf(self, labels):
        """Return a leaf node labelled with the most common label."""
        return TreeNode(label=Counter(labels).most_common(1)[0][0])

    def build_tree(self, texts, labels, depth=0):
        """Recursively build the subtree for the given samples.

        Growth stops, and a majority-label leaf is returned, when the labels
        are pure, fewer than ``min_samples`` samples remain, ``max_depth`` is
        reached, or no word splits the samples into two non-empty groups.
        """
        if (len(set(labels)) == 1 or len(labels) < self.min_samples
                or depth >= self.max_depth):
            return self._make_leaf(labels)

        word, groups = self.best_split(texts, labels)

        # best_split() reports "no usable split" as (None, None); check
        # before unpacking so this case yields a leaf instead of a TypeError.
        if word is None:
            return self._make_leaf(labels)

        left_texts, left_labels, right_texts, right_labels = groups
        left = self.build_tree(left_texts, left_labels, depth + 1)
        right = self.build_tree(right_texts, right_labels, depth + 1)
        return TreeNode(word=word, left=left, right=right)

    def fit(self, X_train, y_train):
        """Train the tree on raw texts and their labels.

        The vocabulary is rebuilt from ``X_train`` on every call, so
        refitting does not carry over words from an earlier fit.

        Raises:
            ValueError: If ``X_train`` is empty or its length differs from
                that of ``y_train``.
        """
        texts = list(X_train)
        labels = list(y_train)
        if not texts:
            raise ValueError("X_train must contain at least one text")
        if len(texts) != len(labels):
            raise ValueError(
                "X_train and y_train must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )

        self.vocab = set()
        self.build_vocab(texts)
        # Only word presence matters for splitting, so keep each document as
        # a set: the membership test in best_split() becomes O(1).
        documents = [frozenset(self.preprocess(text)) for text in texts]
        self.root = self.build_tree(documents, labels)

    def predict_one(self, text):
        """Predict the class of one text by walking the tree from the root."""
        node = self.root
        words = set(self.preprocess(text))

        # Leaves are the nodes without a split word.
        while node.word is not None:
            node = node.left if node.word in words else node.right

        return node.label

    def predict(self, X_test):
        """Predict a class label for every text in ``X_test``."""
        return [self.predict_one(text) for text in X_test]


# Toy corpus: two technology snippets followed by two pet-care snippets
X_train = [
    "AI revolution in technology",
    "Latest machine learning update",
    "Dog food and health",
    "Best pet care tips",
]
y_train = ["tech", "tech", "pet", "pet"]

# Unseen texts to classify
X_test = [
    "AI model advancements",
    "Healthy dog food choices",
]

# Fit the word-presence tree, then label the unseen texts
tree_model = DecisionTreeTextClassifier()
tree_model.fit(X_train, y_train)
predictions = tree_model.predict(X_test)

print("Predictions:", predictions)

Predictions: ['tech', 'pet']


In [None]:
import numpy as np

class DecisionTree:
    def __init__(self, max_depth=5):
        self.max_depth = max_depth
    
    def gini_impurity(self, y):
        classes, counts = np.unique(y, return_counts=True)
        impurity = 1
        for count in counts:
            prob = count / len(y)
            impurity -= prob ** 2
        return impurity
    
    def split_data(self, X, y, feature_idx, threshold):
        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]
    
    def find_best_split(self, X, y):
        best_gini = float('inf')
        best_feature = None
        best_threshold = None
        
        for feature_idx in range(X.shape[1]):
            thresholds = np.unique(X[:, feature_idx])
            
            for threshold in thresholds:
                X_left, X_right, y_left, y_right = self.split_data(X, y, feature_idx, threshold)
                
                if len(y_left) == 0 or len(y_right) == 0:
                    continue
                
                gini = (len(y_left) * self.gini_impurity(y_left) + 
                       len(y_right) * self.gini_impurity(y_right)) / len(y)
                
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature_idx
                    best_threshold = threshold
        
        return best_feature, best_threshold