In [1]:
# 1) IMPORTS
import pandas as pd
import numpy as np
import pickle, string, math
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from collections import Counter

# 2) FEATURE EXTRACTION FUNCTIONS
def shannon_entropy(domain):
    """Return the Shannon entropy of ``domain`` in bits per character.

    Each distinct character contributes ``p * log2(p)``, where ``p`` is that
    character's share of the string; the negated sum is returned.  An empty
    input has no characters to sum over and therefore yields ``0``.
    """
    total = len(domain)
    terms = []
    # Counter preserves first-seen order, so the terms are accumulated in the
    # order in which each character first appears in the string.
    for occurrences in Counter(domain).values():
        share = float(occurrences) / total
        terms.append(share * math.log2(share))
    return -sum(terms)

def extract_features(domain):
    """Convert a domain string into a fixed-order list of 23 numeric features.

    The features are, in order: length, num_digits, num_letters, num_hyphens,
    num_dots, entropy, digit_ratio, letter_ratio, num_levels, max_segment_len,
    min_segment_len, num_vowels, vowel_ratio, starts_num, ends_num,
    repeating_char_ratio, alpha_numeric_ratio, special_char_ratio,
    mean_pos_digit, mean_pos_letter, std_pos_letter, std_pos_digit, is_ip_like.

    Parameters
    ----------
    domain : str
        The domain name to featurise.  An empty string is accepted and yields
        all-zero features except ``num_levels`` (which is 1).

    Returns
    -------
    list
        Feature values in the order listed above.
    """
    eps = 1e-5  # keeps every ratio finite when the denominator would be 0
    length = len(domain)
    denom = length + eps

    # Single pass over the string for character counts (the count is reused
    # for the repeating-character feature instead of recounting per character).
    char_counts = Counter(domain)
    digit_positions = [i for i, c in enumerate(domain) if c.isdigit()]
    letter_positions = [i for i, c in enumerate(domain) if c.isalpha()]
    num_digits = len(digit_positions)
    num_letters = len(letter_positions)

    segment_lengths = [len(s) for s in domain.split('.')]
    vowels = set('aeiou')
    num_vowels = sum(c in vowels for c in domain.lower())

    # Number of character positions whose character occurs more than once.
    repeated_positions = sum(n for n in char_counts.values() if n > 1)

    ip_chars = set(string.digits + '.')

    features = {}
    features['length'] = length
    features['num_digits'] = num_digits
    features['num_letters'] = num_letters
    features['num_hyphens'] = domain.count('-')
    features['num_dots'] = domain.count('.')
    features['entropy'] = shannon_entropy(domain)
    features['digit_ratio'] = num_digits / denom
    features['letter_ratio'] = num_letters / denom
    features['num_levels'] = features['num_dots'] + 1
    features['max_segment_len'] = max(segment_lengths)
    features['min_segment_len'] = min(segment_lengths)
    features['num_vowels'] = num_vowels
    features['vowel_ratio'] = num_vowels / denom
    # Slicing instead of indexing so an empty string gives 0 rather than IndexError.
    features['starts_num'] = int(domain[:1].isdigit())
    features['ends_num'] = int(domain[-1:].isdigit())
    features['repeating_char_ratio'] = repeated_positions / (len(char_counts) + eps)
    features['alpha_numeric_ratio'] = (num_digits + num_letters) / denom
    features['special_char_ratio'] = (length - (num_letters + num_digits)) / denom
    # A 0 is always appended so the position lists are never empty.  Note that
    # this also pulls the mean/std toward 0 even when positions exist; it is
    # kept as-is so feature values stay identical for non-empty input.
    features['mean_pos_digit'] = np.mean(digit_positions + [0])
    features['mean_pos_letter'] = np.mean(letter_positions + [0])
    features['std_pos_letter'] = np.std(letter_positions + [0])
    features['std_pos_digit'] = np.std(digit_positions + [0])
    # Require at least one character: all() over an empty string is vacuously True.
    features['is_ip_like'] = int(bool(domain) and all(c in ip_chars for c in domain))
    return list(features.values())

# 3) SELECTIVE DEEP FOREST CLASS
class SelectiveDeepForest:
    """Cascade of random-forest layers that keeps only the best trees per layer.

    Each layer fits a ``RandomForestClassifier``, ranks its individual trees by
    ROC AUC, keeps the top ``top_ratio`` fraction, and appends the averaged
    class probabilities of those kept trees to the feature matrix that the
    next layer is trained on.  Only binary targets are supported, because
    tree ranking and ``predict`` both read probability column 1.

    NOTE(review): trees are ranked on the same data the forest was fitted on,
    so the AUC ranking is an in-sample estimate; a held-out set would give a
    less optimistic selection.

    Parameters
    ----------
    n_layers : int
        Number of cascade layers.
    n_estimators : int
        Trees grown per layer before selection.
    top_ratio : float
        Fraction of each layer's trees to keep (at least one is always kept).
    random_state : int, RandomState instance or None
        Seed for the per-layer forests.  An integer seed is offset by the
        layer index so layers are reproducible but not identical.
    """

    def __init__(self, n_layers=5, n_estimators=100, top_ratio=0.7, random_state=None):
        self.n_layers = n_layers
        self.n_estimators = n_estimators
        self.top_ratio = top_ratio
        self.random_state = random_state
        self.layers = []

    @staticmethod
    def _layer_proba(trees, X):
        """Average the ``predict_proba`` outputs of ``trees`` on ``X``."""
        return np.mean([t.predict_proba(X) for t in trees], axis=0)

    def _select_trees(self, forest, X, y):
        """Return the trees of ``forest`` with the highest ROC AUC on ``(X, y)``.

        Trees are scored on probability column 1 and sorted best-first; ties
        are broken by the higher tree index.
        """
        aucs = []
        for i, tree in enumerate(forest.estimators_):
            prob = tree.predict_proba(X)[:, 1]
            aucs.append((roc_auc_score(y, prob), i))
        aucs.sort(reverse=True)
        # Never return an empty layer: averaging zero trees is undefined.
        top_k = max(1, int(len(aucs) * self.top_ratio))
        return [forest.estimators_[i] for (_, i) in aucs[:top_k]]

    def fit(self, X, y):
        """Train the cascade from scratch on ``(X, y)`` and return ``self``.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two distinct classes.
        """
        Xc = np.asarray(X)
        y = np.asarray(y)
        classes = np.unique(y)
        if len(classes) != 2:
            raise ValueError(
                "SelectiveDeepForest supports binary targets only; got %d class(es)."
                % len(classes)
            )

        # getattr keeps instances created before random_state existed usable.
        base_seed = getattr(self, "random_state", None)
        layers = []
        for depth in range(self.n_layers):
            seed = base_seed
            if isinstance(seed, (int, np.integer)):
                seed = int(seed) + depth
            rf = RandomForestClassifier(n_estimators=self.n_estimators, random_state=seed)
            rf.fit(Xc, y)
            selected = self._select_trees(rf, Xc, y)
            layers.append(selected)
            # Augment with the *selected* trees' average, exactly as
            # predict_proba does, so later layers see the same kind of
            # features at training and at inference time.
            Xc = np.hstack((Xc, self._layer_proba(selected, Xc)))

        # Replace (not extend) any previous fit so repeated calls stay consistent.
        self.layers = layers
        self.classes_ = classes
        return self

    def predict_proba(self, X):
        """Return the last layer's averaged class probabilities for ``X``.

        Raises
        ------
        RuntimeError
            If the model has no fitted layers.
        """
        if not self.layers:
            raise RuntimeError("SelectiveDeepForest is not fitted yet; call fit() first.")
        Xc = np.asarray(X)
        for trees in self.layers:
            probs = self._layer_proba(trees, Xc)
            Xc = np.hstack((Xc, probs))
        return probs

    def predict(self, X):
        """Predict a class label per row using a 0.5 threshold on column 1.

        Labels are mapped back through ``classes_`` (the sorted labels seen in
        ``fit``).  Instances lacking ``classes_`` fall back to returning the
        0/1 column index.
        """
        idx = (self.predict_proba(X)[:, 1] >= 0.5).astype(int)
        classes = getattr(self, "classes_", None)
        if classes is None:
            return idx
        return np.asarray(classes)[idx]

In [2]:
# Seed NumPy's global RNG: scikit-learn estimators created without an explicit
# random_state draw from it, so this makes the training run repeatable.
np.random.seed(42)

df = pd.read_csv("domain_classification_dataset.csv")
# Rows with a missing domain or label cannot be featurised or trained on;
# drop them here instead of failing part-way through feature extraction.
df = df.dropna(subset=["Domain", "Label"])
X = np.vstack(df['Domain'].astype(str).apply(extract_features).values)
y = df['Label'].values

model = SelectiveDeepForest(n_layers=5,n_estimators=100,top_ratio=0.7)
model.fit(X,y)

# The context manager closes (and flushes) the file even if pickling fails.
# NOTE: the pickle references SelectiveDeepForest by name, so whatever loads
# model.pkl must have the same class definition importable.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
print("Training complete & model.pkl saved ✔")

Training complete & model.pkl saved ✔
