# IMPORTS AND CONSTANTS

In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler as Scaler
from functools import partial
from pycm import ConfusionMatrix

# Workbook to load, and the column used to detect duplicate rows.
dataset_filepath = "Food-Unique-Products-Classif-2019-07-09.xlsx"
index_header = 'url'

# (predicted column, actual column) for each classification level, 5 down to 2.
predicted_target_pairs = [
    ('Coicop{}_Suggested'.format(depth), 'Coicop{}_Final'.format(depth))
    for depth in (5, 4, 3, 2)
]

# Columns that must be filled in for a row to be kept:
# all the suggested columns first, then all the final columns.
non_empty_columns = (tuple(pair[0] for pair in predicted_target_pairs)
                     + tuple(pair[1] for pair in predicted_target_pairs))

# Weight of each level's score in the final weighted sum, keyed by the
# "actual" column name of that level.
level_weights = dict(
    Coicop2_Final=0.05,
    Coicop3_Final=0.1,
    Coicop4_Final=0.5,
    Coicop5_Final=0.35,
)


# FUNCTION DEFINITIONS

In [24]:
def clean_empty_cells(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame in which blank string cells are replaced by ``np.nan``.

    A cell counts as blank when it is a ``str`` that, after lower-casing and
    stripping surrounding whitespace, equals ``''`` or ``'nan'``. Every other
    value, including all non-string values, is passed through unchanged.

    :param df: frame to clean; it is not modified
    :return: frame of the same shape with blank cells set to ``np.nan``
    """
    def _blank_to_nan(cell):
        if isinstance(cell, str) and cell.lower().strip() in ('nan', ''):
            # ``pd.np`` no longer exists in current pandas; use numpy directly.
            return np.nan
        return cell

    # ``DataFrame.map`` supersedes the deprecated ``applymap``. Check the class
    # rather than the instance so a column that happens to be called "map"
    # cannot be mistaken for the method on older pandas versions.
    if hasattr(pd.DataFrame, 'map'):
        return df.map(_blank_to_nan)
    return df.applymap(_blank_to_nan)

def drop_duplicate_rows(df: pd.DataFrame, index) -> pd.DataFrame:
    """Return ``df`` without rows that repeat an earlier row's ``index`` value.

    :param index: column label (or sequence of labels) compared to decide
        whether two rows are duplicates; the first occurrence is kept
    """
    deduplicated = df.drop_duplicates(subset=index, keep='first')
    return deduplicated

def drop_empty_rows(df: pd.DataFrame,
                  required_rows: tuple=()) -> pd.DataFrame:
    """Drop every row that has a missing value in any required column.

    :param df: frame to filter; it is not modified
    :param required_rows: labels of the columns that must be non-null for a
        row to be kept (despite the parameter name these are column labels).
        A single string is treated as one label. When empty, the module-level
        ``non_empty_columns`` is used instead, which preserves the behaviour
        of calls that relied on the default.
    :return: a new frame containing only the complete rows
    """
    if isinstance(required_rows, str):
        required_columns = [required_rows]
    else:
        required_columns = list(required_rows)
    if not required_columns:
        # The argument used to be ignored in favour of this global; keep it
        # as the fallback so argument-less callers see no change.
        required_columns = list(non_empty_columns)
    return df.dropna(subset=required_columns, how='any')

def calc_score_for_level(confusion_matrix: ConfusionMatrix,
                         balance: bool) -> float:
    """Single-number F1-based score for one level of the classifier.

    :param confusion_matrix: confusion matrix of the level; its ``classes``,
        ``F1``, ``P``, ``POP`` and ``F1_Macro`` attributes are read
    :param balance: If False, the matrix's macro F1 is returned as is.
        If True, each class's F1 is multiplied by ``POP / (n_classes * P)``
        (classes with ``P == 0`` contribute 0), the resulting per-class values
        are min-max scaled across the classes, and their mean is returned.
    """
    if not balance:
        return confusion_matrix.F1_Macro

    labels = confusion_matrix.classes
    n_labels = len(labels)

    weighted_f1 = []
    for label in labels:
        support = confusion_matrix.P[label]
        if support == 0:
            # No actual samples for this class: it scores zero.
            weighted_f1.append(0.)
            continue
        balance_ratio = confusion_matrix.POP[label] / (n_labels * support)
        weighted_f1.append(confusion_matrix.F1[label] * balance_ratio)

    # The scaler expects a 2-D array, so present the scores as one column.
    as_column = np.array(weighted_f1).reshape(-1, 1)
    rescaled = Scaler().fit_transform(as_column)
    return rescaled.ravel().mean()

# READ THE DATASET

In [25]:
# A freshly loaded frame has not been cleaned yet, so reset the flag that
# guards the clean-up cell.
dataset_cleaned_up = False
# Read the workbook with every column typed as str.
df = pd.read_excel(io=dataset_filepath, dtype=str)
df.head(n=2)

Unnamed: 0,category,url,product_name,product_description,product_id_store,product_id,volume,qty,unit,price,...,Coicop5_Suggested,Coicop4_Suggested,Coicop3_Suggested,Coicop2_Suggested,Coicop5_Final,Coicop4_Final,Coicop3_Final,Coicop2_Final,Controversial_Classification,Sample_Indicator
0,"Wein, Spirituosen & Tabak Spirituosen & -misch...",https://shop.rewe.de/p/siderit-gingerlime-lond...,Siderit Gingerlime London Dry Gin 700ml,"Siderit Gingerlime ist ein Citric Gin, der in ...",p/siderit-gingerlime-london-dry-gin-700ml/SIAE...,cf166b3dc4aef0f0ea0226566017b8a3,"0,7 L (1 L = 68,77 €)",7,liter,48.14,...,2111,211,21,2,2111,211,21,2,,1
1,"Wein, Spirituosen & Tabak Wein Rotwein Frankreich",https://shop.rewe.de/p/ch-teau-haut-terre-fort...,Château Haut Terre Fort rouge Bordeaux trocken...,Weinfreunde.de empfiehlt: Château Haut-Terre-F...,p/ch-teau-haut-terre-fort-rouge-bordeaux-trock...,cb30ae128226bcd945cef02927fd558a,"0,75l (1 l = 8,67 €)",75,liter,6.5,...,2121,212,21,2,2121,212,21,2,,1


# CLEANUP DATASET

In [26]:
# Run the clean-up steps once per load; the flag stops a re-run of this cell
# from processing an already cleaned frame again.
if not dataset_cleaned_up:
    df = (
        df.pipe(clean_empty_cells)
          .pipe(drop_duplicate_rows, index=index_header)
          .pipe(drop_empty_rows, required_rows=non_empty_columns)
    )
    dataset_cleaned_up = True

# COLLECT PERFORMANCE STATS

In [27]:
# One confusion matrix per level, keyed by that level's "actual" column name.
confusion_matrices = {
    target_col: ConfusionMatrix(actual_vector=df[target_col].values,
                                predict_vector=df[suggested_col].values)
    for suggested_col, target_col in predicted_target_pairs
}

# CALCULATE SCORES

In [30]:
# Balanced score of every level, then their weighted sum.
score_levels = {
    target_col: calc_score_for_level(matrix, balance=True)
    for target_col, matrix in confusion_matrices.items()
}
final_score = sum(level_weights[target_col] * level_score
                  for target_col, level_score in score_levels.items())
print("The final score is {final_score}".format(final_score=final_score))

The final score is 0.25057409359275545
