In [1]:
# Clone the EMBER repository and install it together with its requirements.
# The whole chain is skipped when an "ember" directory already exists in the
# current working directory.
![ ! -d "ember" ] && git clone https://github.com/elastic/ember && cd ember && pip install -r requirements.txt && python setup.py install

# Restart the kernel now so that the newly installed ember package can be imported.

In [2]:
import ember

In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
## Root directory for everything this notebook downloads (dataset) or generates (models).
## Change only `main_data_dir`; the dataset directory below is derived from it.
main_data_dir = "ember/data"
os.makedirs(main_data_dir, exist_ok=True)
# The dataset archive is unpacked into the parent of `data_dir`, so `data_dir`
# has to live directly under the root chosen above.
data_dir = os.path.join(main_data_dir, "ember2018")

In [5]:
## IMPORTANT
## Downloads the dataset unless the vectorized training features already exist.
## The archive is ~10GB, so the download may take a long time.

import requests, tarfile
if not os.path.exists(os.path.join(data_dir, "X_train.dat")):
    dataset_link = "https://ember.elastic.co/ember_dataset_2018_2.tar.bz2"
    target_file = os.path.join(os.path.dirname(data_dir), "ember_dataset_2018_2.tar.bz2")

    # stream=True defers fetching the body; the context manager releases the
    # connection even if writing the file fails half-way.
    with requests.get(dataset_link, stream=True) as res:
        downloaded = res.status_code == 200
        if downloaded:
            with open(target_file, 'wb') as f:
                # Write in 1 MiB chunks instead of buffering the whole archive in memory.
                for chunk in res.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

    if downloaded:
        # NOTE(review): extractall() trusts the member paths stored in the archive;
        # only use this with an archive from a trusted source.
        with tarfile.open(target_file, "r:bz2") as tar:
            tar.extractall(os.path.dirname(data_dir))

        # Build the .dat feature files and the metadata from the extracted data.
        ember.create_vectorized_features(data_dir)
        _ = ember.create_metadata(data_dir)
    else:
        print("Unable to download zip data file")

In [6]:
# Read the generated vectorized features (stored in .dat files) as numpy memmap objects, so the large files are not loaded into memory all at once.

def read_vectorized_features(data_dir, subset=None, feature_version=2):
    """
    Open the vectorized feature files in `data_dir` as read-only numpy memmaps.

    Parameters:
        data_dir: directory holding `X_<split>.dat` and `y_<split>.dat`.
        subset: "train", "test", or None for both splits.
        feature_version: accepted but not used by this function; the feature
            width is fixed at 2381 columns.

    Returns:
        (X, y) for a single split, (X_train, y_train, X_test, y_test) when
        `subset` is None, or None when `subset` is any other value.
    """
    load_all = subset is None
    if not load_all and subset not in ("train", "test"):
        return None

    n_features = 2381

    def _open_split(split):
        # The label file is opened first: its length gives the row count
        # needed to shape the flat feature file.
        labels = np.memmap(os.path.join(data_dir, f"y_{split}.dat"), dtype=np.float32, mode="r")
        n_rows = labels.shape[0]
        features = np.memmap(
            os.path.join(data_dir, f"X_{split}.dat"),
            dtype=np.float32,
            mode="r",
            shape=(n_rows, n_features),
        )
        return features, labels

    if not load_all:
        return _open_split(subset)

    X_train, y_train = _open_split("train")
    X_test, y_test = _open_split("test")
    return X_train, y_train, X_test, y_test

In [7]:
# Open both splits as memory-mapped arrays (subset defaults to None, which
# returns the train and test arrays together).
X_train, Y_train, X_test, Y_test = read_vectorized_features(data_dir)

In [8]:
# Keep only the training rows that carry a known label (-1 marks "unknown").

train_rows = Y_train != -1
X_train, Y_train = X_train[train_rows], Y_train[train_rows]
print(X_train.shape, Y_train.shape)

In [None]:
# Keep only 1/r of the rows of each split to reduce the training load.
# NOTE: this cell overwrites X_train/Y_train/X_test/Y_test, so running it again
# shrinks the data by another factor of r.

from sklearn.utils import shuffle
r = 3               # keep one r-th of the rows
random_seed = 42    # fixed seed so the random subset is the same on every run

do_shuffle = True   # True: random subset; False: the leading rows of each split
n_train = X_train.shape[0] // r
n_test = X_test.shape[0] // r
if not do_shuffle:
    # Both branches shrink the train AND the test split, so the shapes printed
    # below do not depend on `do_shuffle`.
    X_train, Y_train = X_train[:n_train], Y_train[:n_train]
    X_test, Y_test = X_test[:n_test], Y_test[:n_test]
else:
    X_train, Y_train = shuffle(X_train, Y_train, n_samples=n_train, random_state=random_seed)
    X_test, Y_test = shuffle(X_test, Y_test, n_samples=n_test, random_state=random_seed)
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Candidate models, keyed by display name. Each value is a factory that builds
# a fresh estimator and returns it fitted on the given training data, so no
# model is constructed or trained until its entry is called.
clfs = {
    "ridge": lambda X_train, Y_train: RidgeClassifier().fit(X_train, Y_train),
    "K-neighbour": lambda X_train, Y_train: KNeighborsClassifier(1).fit(X_train, Y_train),
    "SVC": lambda X_train, Y_train: SVC(gamma=2, C=1).fit(X_train, Y_train),
    "GPC": lambda X_train, Y_train: GaussianProcessClassifier(1.0 * RBF(1.0)).fit(X_train, Y_train),
    "Decision Tree": lambda X_train, Y_train: DecisionTreeClassifier(max_depth=10).fit(X_train, Y_train),
    "Random Forest": lambda X_train, Y_train: RandomForestClassifier(max_depth=20, n_estimators=200).fit(X_train, Y_train),
    "MLP": lambda X_train, Y_train: MLPClassifier(alpha=1, max_iter=100).fit(X_train, Y_train),
    "ADA Boost": lambda X_train, Y_train: AdaBoostClassifier().fit(X_train, Y_train),
    # Must be GaussianNB: this entry previously trained a second AdaBoost model
    # under the naive Bayes name.
    "Gaussian Naive Bayesian": lambda X_train, Y_train: GaussianNB().fit(X_train, Y_train),
    "Quadratic Discriminant Analysis": lambda X_train, Y_train: QuadraticDiscriminantAnalysis().fit(X_train, Y_train)
}

In [None]:
# Leaderboard of the top-scoring classifiers: name -> record dict.
best_clfs = {}
# Maximum number of records kept in `best_clfs`.
best_clfs_num = 3

def add_best_clf(name, clf, key_param):
    """
    Offer a classifier record to the `best_clfs` leaderboard.

    The record is added while the leaderboard has free slots. Once it is full,
    the record replaces the lowest-scoring entry, and only if it scores
    strictly higher than that entry.

    Parameters:
        name: display name of the classifier, used as the leaderboard key.
        clf: dict describing the classifier; its score is stored under "kp".
        key_param: callable mapping a record dict to its comparable score.
    """
    kp = key_param(clf)
    print(f"model: {name}, score: {kp}")
    clf["kp"] = kp

    # A repeated name supersedes its earlier record instead of competing with it,
    # so re-running the training loop cannot shrink the leaderboard.
    best_clfs.pop(name, None)

    if len(best_clfs) < best_clfs_num:
        best_clfs[name] = clf
        return

    if not best_clfs:
        # best_clfs_num <= 0: nothing is ever kept.
        return

    # Evict the weakest entry, not merely the last one that scores below the newcomer.
    worst_name = min(best_clfs, key=lambda entry_name: key_param(best_clfs[entry_name]))
    if key_param(best_clfs[worst_name]) < kp:
        del best_clfs[worst_name]
        best_clfs[name] = clf

In [None]:
## Fit every candidate model, score it on the test split and offer it to the leaderboard.
## On large data this needs a lot of time and RAM, and may fail on a machine without adequate resources.

def score_key(clf_dict):
    """Return the score stored in a classifier record."""
    return clf_dict["score"]

for clf_name, clf in clfs.items():
    model = clf(X_train, Y_train)
    score = model.score(X_test, Y_test)
    result = {"model": model, "score": score}
    add_best_clf(clf_name, result, score_key)

In [None]:
import pickle

# Set to False to only print the leaderboard without writing anything to disk.
save_model = True
save_model_dir = os.path.join(data_dir, "best_models")

if save_model:
    os.makedirs(save_model_dir, exist_ok=True)

print("Best Classifiers are: ")
for clf_name, clf in best_clfs.items():
    print(f"classifier: {clf_name}, score: {clf['kp']}")

    if save_model:
        # `with` closes the file, so the pickle is flushed to disk before the
        # next model is written.
        with open(os.path.join(save_model_dir, clf_name), 'wb') as model_file:
            pickle.dump(clf["model"], model_file)

