# IMPORTS AND CONSTANTS

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler as Scaler
from functools import partial
from pycm import ConfusionMatrix

# Where the training data lives and which column identifies a row.
dataset_filepath = "gs://ecb-fsf-hackathon-base-data/webscrap_train.csv"
index_header = 'url'

# (predicted column, ground-truth column) pairs, listed from level 5 down to
# level 2.
predicted_target_pairs = [
    ('Predicted_COICOP_5_Category', 'Coicop5_Final'),
    ('Predicted_COICOP_4_Category', 'Coicop4_Final'),
    ('Predicted_COICOP_3_Category', 'Coicop3_Final'),
    ('Predicted_COICOP_2_Category', 'Coicop2_Final'),
]

# Columns that must be filled in for a row to be scored: every predicted
# column first, then every ground-truth column, in the pair order above.
non_empty_columns = (
    tuple(predicted for predicted, _ in predicted_target_pairs)
    + tuple(target for _, target in predicted_target_pairs)
)

# Contribution of each level's score to the final weighted sum, keyed by the
# ground-truth column name.
level_weights = {
    'Coicop2_Final': 0.05,
    'Coicop3_Final': 0.15,
    'Coicop4_Final': 0.4,
    'Coicop5_Final': 0.4,
}


# FUNCTION DEFINITIONS

In [None]:
def clean_empty_cells(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with blank-looking string cells replaced by NaN.

    A cell counts as blank when it is a string that, lower-cased and stripped
    of surrounding whitespace, equals ``''`` or ``'nan'``. Non-string cells
    are passed through unchanged.

    :param df: Frame to clean; it is not modified in place.
    :return: New frame of the same shape with blanks replaced by ``np.nan``.
    """
    def _blank_to_nan(cell):
        if isinstance(cell, str) and cell.lower().strip() in ('nan', ''):
            return np.nan
        return cell

    # ``pd.np`` no longer exists in pandas 2.x, and ``DataFrame.applymap`` was
    # renamed to ``DataFrame.map`` in pandas 2.1. Look the method up on the
    # class (not the instance) so a column that happens to be called "map"
    # cannot shadow it on older pandas versions.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    return elementwise(df, _blank_to_nan)

def drop_duplicate_rows(df: pd.DataFrame, index) -> pd.DataFrame:
    """Return *df* without rows that repeat an earlier row's *index* value(s).

    :param index: Column label (or labels) used to decide whether two rows
        are duplicates; the first occurrence of each value is kept.
    """
    deduplicated = df.drop_duplicates(subset=index, keep='first')
    return deduplicated

def drop_empty_rows(df: pd.DataFrame,
                  required_rows: tuple=()) -> pd.DataFrame:
    """Remove rows that are missing a value in any required column.

    :param df: Frame to filter; it is not modified in place.
    :param required_rows: Labels of the columns that must all be non-null for
        a row to be kept. When empty, the module-level ``non_empty_columns``
        is used, which is what this function always used before it honoured
        this argument.
    :return: New frame containing only the complete rows.
    :raises KeyError: If a requested column is not present in *df*.
    """
    if isinstance(required_rows, str):
        # A bare label would otherwise be split into single characters.
        columns = [required_rows]
    elif required_rows:
        columns = list(required_rows)
    else:
        columns = list(non_empty_columns)
    return df.dropna(subset=columns, how='any')

def calc_score_for_level(confusion_matrix: ConfusionMatrix,
                         balance: bool) -> float:
    """Collapse one level's confusion matrix into a single score.

    :param confusion_matrix: Confusion matrix for one classification level.
    :param balance: If False, the matrix's ``F1_Macro`` is returned as is.
        If True, each class's F1 is multiplied by
        ``POP / (n_classes * P)`` (0 when the class's ``P`` is 0), the
        resulting values are min-max rescaled across classes, and their
        mean is returned.
    :return: The score for this level.
    """
    class_total = len(confusion_matrix.classes)
    if not balance:
        return confusion_matrix.F1_Macro

    weighted = []
    for label in confusion_matrix.classes:
        total_population = confusion_matrix.POP[label]
        class_f1 = confusion_matrix.F1[label]
        support = confusion_matrix.P[label]
        if support == 0:
            # No positive samples for this class: it contributes nothing.
            weighted.append(0.)
        else:
            frequency_correction = total_population / (class_total * support)
            weighted.append(class_f1 * frequency_correction)

    # The scaler expects a 2-D (n_samples, n_features) array, so present the
    # per-class values as a single column.
    column = np.array(weighted)[:, np.newaxis]
    rescaled = Scaler().fit_transform(column)
    return rescaled.ravel().mean()

# READ THE DATASET

In [None]:
# Re-reading the data invalidates any earlier cleanup, so reset the flag that
# guards the cleanup step.
dataset_cleaned_up = False
# dtype=str asks pandas to load every column as text instead of inferring
# numeric types.
df = pd.read_csv(dataset_filepath, dtype=str)
# Preview of the first two rows (presumably shown as the notebook cell output).
df.head(2)

# CLEANUP DATASET

In [None]:
# Run the cleanup only once per load: the flag is set below after a successful
# pass, so executing this cell again is a no-op until the flag is reset.
if not dataset_cleaned_up:
    # Steps run in order, each taking a DataFrame and returning a new one:
    # normalise blank cells to NaN, drop rows repeating a `url`, then drop rows
    # missing any required column.
    cleanup_pipeline = [clean_empty_cells,
                        partial(drop_duplicate_rows, index=index_header),
                        partial(drop_empty_rows, required_rows=non_empty_columns)]
    for op in cleanup_pipeline:
        df = op(df)
    dataset_cleaned_up = True

# COLLECT PERFORMANCE STATS

In [None]:
# One confusion matrix per (predicted, ground-truth) column pair, keyed by the
# ground-truth column name.
confusion_matrices = {
    target_col: ConfusionMatrix(
        actual_vector=df[target_col].values,
        predict_vector=df[predicted_col].values,
    )
    for predicted_col, target_col in predicted_target_pairs
}

# CALCULATE SCORES

In [None]:
# Balanced score for every level, keyed like `confusion_matrices`.
score_levels = {
    level_name: calc_score_for_level(matrix, balance=True)
    for level_name, matrix in confusion_matrices.items()
}
# The final score is the sum of the per-level scores, each multiplied by its
# entry in `level_weights`.
final_score = sum(level_weights[level_name] * level_score
                  for level_name, level_score in score_levels.items())
print("The final score is {final_score}".format(final_score=final_score))