In [1]:
# Stretch the notebook container to 95% of the browser width so that wide
# DataFrame outputs are easier to read.
from IPython.display import HTML, display

display(
    HTML("<style>.container { width:95% !important; }</style>")
)

In [2]:
import os
import pandas as pd
import math

import sys
sys.path.append("../..")
from src.load_datasets import load_dataset, load_rankings, load_train_data
import src.evaluate_regression

# Load data

Load the raw data here, not the preprocessed data: the cross-validation indices cannot be generated from the preprocessed data.

In [3]:
# Variables describing the ranking task.
# `target` is the label column that gets split off from the features.
target = "rank"
new_index = "encoder"
# `factors` is handed to the custom CV index generator further down.
factors = [
    "dataset",
    "model",
    "tuning",
    "scoring",
]

In [7]:
# Read the raw training table (raw, so that CV indices can be derived from it).
df_train = load_dataset("../../data/raw/dataset_rank_train.csv")

# "cv_score" is not used as a feature here; remove it when the file contains it.
if "cv_score" in df_train.columns:
    df_train = df_train.drop(columns=["cv_score"])

# Separate the label column from the feature columns.
y_train = df_train[target]
X_train = df_train.drop(columns=[target])

print(X_train.shape)
X_train.head()

Loading data from '../../data/raw/dataset_rank_train.csv' ...
(36054, 5)


Unnamed: 0,dataset,model,tuning,scoring,encoder
0,1169,KNC,model,ACC,BUCV2RGLMME
1,1169,KNC,model,ACC,BUCV2TE
2,1169,KNC,model,ACC,CBE
3,1169,KNC,model,ACC,CE
4,1169,KNC,model,ACC,CV10RGLMME


In [8]:
# Build the train/test index pairs for a 5-fold CV from features + target.
# This has to run while X_train still holds the raw (un-encoded) columns,
# i.e. before the preprocessing cells below rebind X_train.
cv_indices = src.evaluate_regression.custom_cross_validated_indices(
    pd.concat([X_train, y_train], axis=1),
    factors,
    target,
    n_splits=5,
    shuffle=True,
    random_state=1444,
)

In [9]:
# Inspect the first fold: a list of two index Series (used as train / test rows).
print(cv_indices[0])

[0        16858
1        16859
2        16860
3        16861
4        16862
         ...  
28871     6434
28872     6435
28873     6436
28874     6437
28875     6438
Name: index, Length: 28876, dtype: int64, 0        6502
1        6503
2        6504
3        6505
4        6506
        ...  
7173    24563
7174    24564
7175    24565
7176    24566
7177    24567
Name: index, Length: 7178, dtype: int64]


# Preprocess data

In [10]:
import src.encoding
from src.feature_engineering import normalize_train_data, normalize_test_data
from src.meta_information import add_dataset_meta_information

In [11]:
# OHE encoding 
# One-hot encode the categorical factor columns; the second return value is
# kept in `ohe` (presumably the fitted encoder for later reuse -- TODO confirm).
X_train, ohe = src.encoding.ohe_encode_train_data(
    X_train=X_train,
    cols_to_encode=["model", "tuning", "scoring"],
    verbosity=2,
)

One Hot Encoding the features ['model', 'tuning', 'scoring'] of the train data ...


In [12]:
# Encoder encoding: Poincare Embeddings for feature "encoder"
# Encode the "encoder" column with Poincare embeddings of the encoder graph.
# According to the log output, the graph is embedded first and the embeddings
# are written to `path_to_embeddings` before the column is encoded.
# The second return value is not needed here and is discarded.
X_train, _ = src.encoding.poincare_encoding(
    path_to_graph="../../data/raw/graph.adjlist",
    path_to_embeddings="../../data/preprocessed/embeddings.csv",
    data=X_train,
    column_to_encode="encoder",
    encode_dim=50,
    explode_dim=True,
    epochs=5000,
    dim_reduction=None,
    verbosity=2,
)

(Poincare) Embedding the graph ...
Saving the embeddings to '../../data/preprocessed/embeddings.csv'...
Encoding the data feature 'encoder'...


In [13]:
# Attach the per-dataset meta information read from the aggregated meta table.
X_train = add_dataset_meta_information(
    df=X_train,
    path_to_meta_df="../../data/preprocessed/dataset_agg.csv",
    nan_threshold=0.4,
    replacing_strategy="median",
)

In [14]:
# Min-max normalize the training features; `scaler` is kept alongside
# (presumably for applying the same scaling to test data -- TODO confirm).
X_train, scaler = normalize_train_data(
    X_train=X_train,
    method="minmax",
    verbosity=2,
)

Normalizing train data using method 'minmax' ...


In [15]:
# Sanity check: dimensions of the feature matrix after all preprocessing steps.
print(X_train.shape)

(36054, 112)


# Multiclass classification

In [None]:
from sklearn.metrics import matthews_corrcoef

def manual_cv(indices=None, X=None, y=None, make_model=None, scorer=None):
    """Run a hand-rolled cross-validation and return the score of every fold.

    For each fold a fresh model is fitted on the training rows, used to predict
    the test rows, and scored. The first ten predictions, the first ten
    ground-truth labels and the score (rounded to 4 digits) are printed.

    Parameters
    ----------
    indices : iterable of (train, test) pairs
        Row labels of the training part and of the test part of each fold.
        Rows are selected with ``.loc``, i.e. by index *label*.
        NOTE(review): this only agrees with positional CV splitters while
        ``X`` and ``y`` carry a default ``RangeIndex`` -- confirm for the
        preprocessed frame.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target labels, indexed like ``X``.
    make_model : callable, optional
        Zero-argument factory returning a new, unfitted estimator that offers
        ``fit(X, y)`` and ``predict(X)``. Defaults to
        ``RandomForestClassifier(random_state=42, n_jobs=-1)``.
    scorer : callable, optional
        ``scorer(y_true, y_pred) -> number``. Defaults to
        ``sklearn.metrics.matthews_corrcoef``.

    Returns
    -------
    list
        The unrounded score of every fold, in fold order.

    Raises
    ------
    TypeError
        If ``indices``, ``X`` or ``y`` is not provided.
    """
    if indices is None or X is None or y is None:
        raise TypeError("manual_cv() requires 'indices', 'X' and 'y'")

    if make_model is None:
        # Imported locally so the function does not depend on a later cell
        # having imported the classifier into the notebook namespace.
        from sklearn.ensemble import RandomForestClassifier

        def make_model():
            return RandomForestClassifier(random_state=42, n_jobs=-1)

    if scorer is None:
        from sklearn.metrics import matthews_corrcoef

        scorer = matthews_corrcoef
        score_label = "MCC"
    else:
        # Label the printed score after the custom scorer instead of "MCC".
        score_label = getattr(scorer, "__name__", "score")

    fold_scores = []
    for fold in indices:
        train_rows, test_rows = fold[0], fold[1]

        # Fold-local names on purpose: do not shadow the notebook's X_train / y_train.
        X_fold_train, y_fold_train = X.loc[train_rows], y.loc[train_rows]
        X_fold_test, y_fold_test = X.loc[test_rows], y.loc[test_rows]

        # A new model per fold, so no state leaks from one fold into the next.
        model = make_model()
        model.fit(X_fold_train, y_fold_train)

        preds = model.predict(X_fold_test)
        score = scorer(y_fold_test, preds)
        fold_scores.append(score)

        print(f"PREDICTIONS : {preds[:10]}")
        print(f"Ground truth: {list(y_fold_test[:10])}")

        print(f"{score_label}: {round(score, 4)}")

    return fold_scores
    

In [None]:
%%time

# Use the labels as they are
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Random forest on the untouched rank labels, evaluated on the custom folds.
model = RandomForestClassifier(random_state=42)

# Only MCC and accuracy are requested; the plain "f1" scorer is left out.
cv_results = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=cv_indices,
    scoring=["matthews_corrcoef", "accuracy"],
    n_jobs=-1,
    return_train_score=True,
)

In [None]:
# Report mean and standard deviation of the test scores across the folds.
for scorer in ("matthews_corrcoef", "accuracy"):
    test_scores = cv_results[f"test_{scorer}"]
    mean_score = round(test_scores.mean(), 4)
    score_std = round(test_scores.std(), 4)
    print(f"CV Test {scorer}: {mean_score} +/- {score_std}")