# IMPORTS AND CONSTANTS

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler as Scaler
from functools import partial
from pycm import ConfusionMatrix

dataset_filepath = "parti.csv"
index_header = 'url'
non_empty_columns = ("Predicted_COICOP_5_Category", "Predicted_COICOP_4_Category", "Predicted_COICOP_3_Category", "Predicted_COICOP_2_Category",
                     "Coicop5_Final", "Coicop4_Final", "Coicop3_Final", "Coicop2_Final")

predicted_target_pairs = [('Predicted_COICOP_5_Category', 'Coicop5_Final'),
                          ('Predicted_COICOP_4_Category', 'Coicop4_Final'),
                          ('Predicted_COICOP_3_Category', 'Coicop3_Final'),
                          ('Predicted_COICOP_2_Category', 'Coicop2_Final'),
                         ]
level_weights = {'Coicop2_Final': 0.05,
                 'Coicop3_Final': 0.15,
                 'Coicop4_Final': 0.4,
                 'Coicop5_Final': 0.4}


# FUNCTION DEFINITIONS

In [9]:
def clean_empty_cells(df: pd.DataFrame) -> pd.DataFrame:
    """Replace all empty cells with pd.np.nan"""
    return df.applymap(lambda x: pd.np.nan if isinstance(x, str)
                       and x.lower().strip() in ['nan', ''] else x)

def drop_duplicate_rows(df: pd.DataFrame, index) -> pd.DataFrame:
    """Drop duplicate rows by index"""
    return df.drop_duplicates(subset=index)

def drop_empty_rows(df: pd.DataFrame,
                  required_rows: tuple=()) -> pd.DataFrame:
    """Remove row without the required data points""" 
    df = df.dropna(subset=non_empty_columns, how='any')
    return df

def calc_score_for_level(confusion_matrix: ConfusionMatrix,
                         balance: bool) -> float:
    """Unified metric for a single level multiclass classifier
    
    :param balance: If True, f1 score is normalized by class frequency
    """
    n_classes = confusion_matrix.classes.__len__()
    scores = []
    if not balance:
        score = confusion_matrix.F1_Macro
    else:
        ratios = []
        for cls in confusion_matrix.classes:
            population = confusion_matrix.POP[cls]
            score = confusion_matrix.F1[cls]
            if balance:
                cls_count = confusion_matrix.P[cls]
                if cls_count == 0:
                    score = 0
                    balance_ratio = 0
                    ratios.append(balance_ratio)
                else:
                    balance_ratio = population / (n_classes * cls_count)
                    ratios.append(balance_ratio)
#                     score *= balance_ratio
            scores.append(score)
        ratios = [i/sum(ratios) for i in ratios]
        print(ratios)
        scores = [i*j for i, j in zip(scores, ratios)]
#         scores = np.expand_dims(np.array(scores), 1)
#         scores = Scaler().fit_transform(scores)
#         score = scores.ravel().mean()
        score = sum(scores)
    return score

# READ THE DATASET

In [10]:
dataset_cleaned_up = False
df = pd.read_csv(dataset_filepath, dtype=str)
df.head(2)

Unnamed: 0.1,Unnamed: 0,category,url,product_name,product_description,product_id_store,product_id,volume,qty,unit,...,reduction,product,Coicop5_Final,Coicop4_Final,Coicop3_Final,Coicop2_Final,Predicted_COICOP_5_Category,Predicted_COICOP_4_Category,Predicted_COICOP_3_Category,Predicted_COICOP_2_Category
0,12785,Nahrungsmittel Cerealien & Müsli Frucht- & Müs...,https://shop.rewe.de/p/viba-fruchtschnitte-ban...,Viba Fruchtschnitte Banane Apfel 35g,,p/viba-fruchtschnitte-banane-apfel-35g/2369187,5f1e62d5db71d23762ef220207d002cd,"35g (100 g = 2,54 €)",35,gram,...,False,https://shop.rewe.de/p/viba-fruchtschnitte-ban...,1117,111,11,1,1117,111,11,1
1,990,"Wein, Spirituosen & Tabak Wein Rotwein Italien",https://shop.rewe.de/p/salice-salentino-riserv...,"Salice Salentino Riserva Fiamme Nere 0,75l","- Duft nach Dörrpflaume, Waldbeere und Gewürzk...",p/salice-salentino-riserva-fiamme-nere-0-75l/5...,dceed3ba54fd55a69c63104661073c73,"0,75l (1 l = 7,72 €)",75,liter,...,False,https://shop.rewe.de/p/salice-salentino-riserv...,2121,212,21,2,2121,212,21,2


# CLEANUP DATASET

In [11]:
if not dataset_cleaned_up:
    cleanup_pipeline = [clean_empty_cells,
                        partial(drop_duplicate_rows, index=index_header),
                        partial(drop_empty_rows, required_rows=non_empty_columns)]
    for op in cleanup_pipeline:
        df = op(df)
    dataset_cleaned_up = True

In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,category,url,product_name,product_description,product_id_store,product_id,volume,qty,unit,...,reduction,product,Coicop5_Final,Coicop4_Final,Coicop3_Final,Coicop2_Final,Predicted_COICOP_5_Category,Predicted_COICOP_4_Category,Predicted_COICOP_3_Category,Predicted_COICOP_2_Category
0,12785,Nahrungsmittel Cerealien & Müsli Frucht- & Müs...,https://shop.rewe.de/p/viba-fruchtschnitte-ban...,Viba Fruchtschnitte Banane Apfel 35g,,p/viba-fruchtschnitte-banane-apfel-35g/2369187,5f1e62d5db71d23762ef220207d002cd,"35g (100 g = 2,54 €)",35,gram,...,False,https://shop.rewe.de/p/viba-fruchtschnitte-ban...,1117,111,11,1,1117,111,11,1
1,990,"Wein, Spirituosen & Tabak Wein Rotwein Italien",https://shop.rewe.de/p/salice-salentino-riserv...,"Salice Salentino Riserva Fiamme Nere 0,75l","- Duft nach Dörrpflaume, Waldbeere und Gewürzk...",p/salice-salentino-riserva-fiamme-nere-0-75l/5...,dceed3ba54fd55a69c63104661073c73,"0,75l (1 l = 7,72 €)",75,liter,...,False,https://shop.rewe.de/p/salice-salentino-riserv...,2121,212,21,2,2121,212,21,2
2,7446,Obst & Gemüse Gemüse Hülsenfrüchte,https://shop.rewe.de/p/kaiserschoten-200g-scha...,Kaiserschoten 200g Schale,"Simbabwe, Kenia, Ägypten, Guatemala, Peru,Tans...",p/kaiserschoten-200g-schale/140419,faee20a1c6712c84d01505bf2b1fe0d4,"200g (100 g = 1,15 €)",200,gram,...,False,https://shop.rewe.de/p/kaiserschoten-200g-scha...,1171,117,11,1,1171,117,11,1
3,18631,Nahrungsmittel Brot & Backwaren Brotwaren Schn...,https://shop.rewe.de/p/rewe-bio-karotten-walnu...,REWE Bio Karotten Walnuss Brot 500g,,p/rewe-bio-karotten-walnuss-brot-500g/883838,931b38b6833c084c898fd9455d45e0e3,"500g (1 kg = 4,58 €)",500,gram,...,False,https://shop.rewe.de/p/rewe-bio-karotten-walnu...,1113,111,11,1,1113,111,11,1
4,1391,"Wein, Spirituosen & Tabak Tabak & Zigaretten Z...",https://shop.rewe.de/p/l-m-blue-label-xxl-28-s...,L&M Blue Label XXL 28 Stück,,p/l-m-blue-label-xxl-28-stueck/7266830,4de29bdc7f42991b6547c777d437c304,28 Stück,28,,...,False,https://shop.rewe.de/p/l-m-blue-label-xxl-28-s...,2201,220,22,2,2201,220,22,2


# COLLECT PERFORMANCE STATS

In [13]:
confusion_matrices = {actual: ConfusionMatrix(actual_vector=df[actual].values,
                                              predict_vector=df[predicted].values)
                      for predicted, actual in predicted_target_pairs}

# CALCULATE SCORES

In [14]:
score_levels = {level: calc_score_for_level(cm, balance=True) for  level, cm in confusion_matrices.items()}
final_score = sum([level_weights[level] * value for level, value in score_levels.items()])
print("The final score is {final_score}".format(final_score=final_score))

[0.16013344453711426, 0.8398665554628858]
[0.01630019378055986, 0.05433397926853287, 0.0900800182609887, 0.040271066987265534, 0.15559275881443502, 0.07283065306207596, 0.038033785487973, 0.030562863338549736, 0.02516941686704096, 0.03761583180129198, 0.030292395521394427, 0.08348879741262367, 0.032913852826130484, 0.15559275881443502, 0.13692162775670283]
[0.023886496484236974, 0.094023807239423, 0.11485542920264845, 0.7672342670736917]
[0.0038034194644536536, 0.006569542711329038, 0.008029441091624378, 0.0013139085422658077, 0.012044161637436568, 0.0013382401819373964, 0.0021254402889593947, 0.003284771355664519, 0.008029441091624378, 0.0072264969824619414, 0.07226496982461941, 0.010323567117802773, 0.07226496982461941, 0.002408832327487314, 0.014452993964923883, 0.07226496982461941, 0.014452993964923883, 0.009033121228077427, 0.003011040409359142, 0.010323567117802773, 0.024088323274873136, 0.014452993964923883, 0.0034411890392675912, 0.004516560614038713, 0.0027794219163315157, 0.0