In [None]:
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.2-py2.py3-none-any.whl (7.0 kB)
Collecting jedi>=0.16 (from ipython->ipython-autotime)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m


## Feature analysis by consecutive random forest

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Shared random seed, reused below for the train/test split and the random forest.
seed = 42

### In this notebook, feature importance is analysed by fitting sequential random forests: at each iteration the feature with the least importance is dropped. The analysis includes all columns in the dataset, including those that are not available at submit time

#### Taking data and pre-processing features that are not numerical

In [None]:
def show_columns_type(data):
    """Print the dtype of every column of ``data`` as ``name = dtype`` lines.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    None
        The information is written to stdout only.
    """
    # Report the dtype of the stored values. type(column) would describe the
    # column *label* (a str for every named column), not the column's data.
    for column, dtype in data.dtypes.items():
        print(column, "=", dtype)

In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the job table from the mounted Drive and keep at most the first 1,000,000 rows.
# Local alternative: data = pd.read_parquet("job_table.parquet")
data = pd.read_parquet("/content/drive/MyDrive/job_table.parquet")[:1000000]
data.head(10)

In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage).
data.info()

There are 'category', 'datetime' and 'object' features

In [None]:
# Print one "name = type" line per column of the loaded frame.
show_columns_type(data)

All elements inside columns are strings

Deleting rows whose job state is not useful for this task

In [None]:
# Keep only the rows whose job state is one of the four outcomes listed here.
values = ["OUT_OF_MEMORY", "COMPLETED", "FAILED", "TIMEOUT"]
data = data.loc[data["job_state"].isin(values)]

Only two classes are considered: completed jobs, and all other jobs, which are treated as failed.
The job exit state is then encoded numerically for the binary classification task

In [None]:
# Split the target column off from the feature frame.
labels = data["job_state"]
data = data.drop(columns=["job_state"])

print("Old labels: ", np.unique(labels))

# Fold the two other non-completed outcomes into the single "FAILED" class.
labels[labels.isin(["OUT_OF_MEMORY", "TIMEOUT"])] = "FAILED"

print("New labels: ", list(np.unique(labels)))

# Fit the encoder on the relabelled target; classes_ holds the distinct labels it found.
lab_enc = LabelEncoder().fit(labels)
unique_labels = lab_enc.classes_
num_classes = len(unique_labels)

# From here on `labels` is a float array of encoded classes, not a pandas Series.
labels = lab_enc.transform(labels).astype(float)

In [None]:
# Display the encoded label array.
labels

Converting all columns to numerical

In [None]:
# Replace every column with integer ordinal codes, one column at a time.
# The same encoder object is re-fitted on each iteration, so after the loop
# it only reflects the last column processed.
ord_enc = OrdinalEncoder(dtype=np.int64)
for col in tqdm(data.columns):
    # The encoder expects 2-D input, hence the reshape to a single-column matrix.
    val = data[col].values.reshape(-1, 1)
    data[col] = ord_enc.fit_transform(val)

In [None]:
# Re-check the frame summary after the encoding step.
data.info()

Final Data

In [None]:
# Preview the first five rows of the encoded frame.
data.head(5)

#### Random Forest Analysis

#### Train and test split

In [None]:
# Seeded 70% train / 30% test split of features and labels; no separate validation set is created here.
x_train, x_test, y_train, y_test = train_test_split(data, labels, train_size=0.7, random_state=seed)

#### RF consecutive

In [None]:
# Filled by the elimination loop below: maps a rank number to the column dropped at that step.
features_ranking = {}

In [None]:
def random_forest():
    """Fit a random forest on the current features and rank them by importance.

    Reads the notebook-level ``x_train``, ``x_test``, ``y_train``, ``y_test``
    and ``seed``. Prints F1, recall and precision on the test set (binary
    averaging), draws the confusion matrix and prints the ranking.

    Returns
    -------
    dict
        Feature name -> importance, ordered from the least to the most
        important feature.
    """
    rf = RandomForestClassifier(random_state=seed)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary")
    print("-------------------------------------------------------------------")
    print(f"F1 on test set: {f1}")
    print(f"Recall on test set: {recall}")
    print(f"Precision on test set: {precision}")
    cm = confusion_matrix(y_test, y_pred)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot()
    # Render the figure now so it appears next to this call's metrics; when the
    # function is called in a loop, figures would otherwise stay open and pile up
    # until the cell finishes.
    plt.show()
    importances = pd.Series(
        rf.feature_importances_, index=x_train.columns
    ).sort_values(ascending=True)
    # Pair labels with values directly. Looking values up with an integer key
    # (importances[i]) on a label-indexed Series relies on the deprecated
    # positional fallback and fails on newer pandas.
    local_ranking = dict(zip(importances.index, importances.to_numpy()))
    print("Local ranking: ", local_ranking)
    print("-------------------------------------------------------------------")
    return local_ranking

In [None]:
# Backward elimination: fit a forest, drop its least important feature, repeat
# until no feature is left. The number of rounds comes from the data instead of
# a hard-coded 99, so the loop neither runs out of columns nor stops early.
# Rank n_features is the first feature dropped (least important); rank 1 is the
# last one standing.
# NOTE: x_train / x_test are consumed by this loop; re-run the split cell before
# running it again.
n_features = x_train.shape[1]
for i in tqdm(range(n_features)):
    print("Iteration number: ", i)
    local_ranking = random_forest()
    # Pick the minimum explicitly rather than relying on the dict's ordering.
    column_to_drop = min(local_ranking, key=local_ranking.get)
    features_ranking[n_features - i] = column_to_drop
    # Rebind instead of drop(inplace=True): avoids mutating the frames returned
    # by train_test_split in place (a SettingWithCopyWarning source).
    x_train = x_train.drop(columns=column_to_drop)
    x_test = x_test.drop(columns=column_to_drop)