### Load 500 examples of EURIPO trademark court decisions

* Columns: 'mark', 'earlier_mark', 'target_label'
* 250 examples of NLOC (target_label == 0), 250 examples of LOC (target_label == 1)

In [13]:
import pandas as pd

data_set = pd.read_csv('../500_tm_loc_decisions.csv', index_col=0)
data_set.head()
# data_set.query('mark == "ASTEX"')

Unnamed: 0,earlier_mark,mark,target_label
2373,MILEI,MILET,1
4478,RIBENA,RUBINO ROSSO,0
5274,REDIHALER,EFFIHALER,0
6153,MIESZKO CHERRISSIMO,CHERRISTO,0
5793,YONDELIS,YLOELIS,1


### Model builder class

* Instantiate with a data source, a features dict (key = name, value = transform function) and an instantiated classifer algorithm

In [14]:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.metrics import auc, roc_curve, roc_auc_score, f1_score, recall_score, precision_score, confusion_matrix

class ModelBuilder:
    
    def __init__(self, data_source, features, algorithm):
        self.data_source = data_source
        self.features = features
        self.algorithm = algorithm
        self.metrics = {}
        
    def build(self):
        # Build the model features ...
        for feature_key, feature_func in self.features.items():
            print("Generating %s ... " % feature_key)
            for _index, row in self.data_source.iterrows():
                self.data_source.at[_index, feature_key] = feature_func(row['mark'], row['earlier_mark'])
            
        X = self.data_source[self.features.keys()]
        y = self.data_source['target_label']
        
        train_dataset, test_dataset, train_labels, test_labels = train_test_split(
            X, y, test_size=0.25, random_state=0)
        
        pipeline = Pipeline([
            ('scaler', preprocessing.MinMaxScaler()),
            ('clf', self.algorithm)
        ])
        
        pipeline.fit(train_dataset, train_labels)
        
        y_score = pipeline.predict(test_dataset)

        cm = confusion_matrix(test_labels, y_score, labels=[1,0])
        
        cm_dict = {
            'TP': int(cm[0][0]),
            'FP': int(cm[0][1]),
            'FN': int(cm[1][0]),
            'TN': int(cm[1][1])
        }
        
        fpr, tpr, thresholds = roc_curve(test_labels, y_score)
        self.metrics = {
            'f1_score': round(f1_score(test_labels, y_score), 4),
            'precision_score': float(precision_score(test_labels, y_score)),
            'recall_score': float(recall_score(test_labels, y_score)),
#             'roc_auc': float(roc_auc_score(test_labels, y_score)),
#             'roc_fpr': fpr.tolist(),
#             'roc_tpr': tpr.tolist(),
            'confusion_matrix': cm_dict,
            'num_features': len(self.features)
        }
        
        print()
        
        print("False Positive errors (our model predicted NLOC but was actually LOC):")
        for _idx in test_dataset.index[(np.ravel(test_labels.values) == 1.0) & (y_score == 0.0)]:
            row = self.data_source.loc[_idx]
            print(" FP : %s vs %s" % (row['mark'], row['earlier_mark']))
        
        print()
        
        print("False Negative errors (our model predicted LOC but was actually NLOC):")
        for _idx in test_dataset.index[(np.ravel(test_labels.values) == 0.0) & (y_score == 1.0)]:
            row = self.data_source.loc[_idx]
            print(" FN : %s vs %s" % (row['mark'], row['earlier_mark']))
            
        print()
        print(self.metrics)

### Define the feature set

- TODO: add other feature suggestions ...

In [15]:
import jellyfish

feature_gen = {
#     'ALWAYS_0': lambda m1,m2: 0
    'EXACT_MATCH': lambda m1,m2: m1 == m2,
    'CASE_INSENSITIVE_MATCH': lambda m1,m2: m1.lower() == m2.lower(),
    'WORD_COUNT_DIFF': lambda m1,m2: abs(len(m1.split()) - len(m2.split())),
    'LEVENSHTEIN_DISTANCE': lambda m1,m2: jellyfish.levenshtein_distance(m1.lower(), m2.lower())
}

### Execute!

- Returns error metrics (f1_score, precision, recall, confusion matrix)
- Example of false positives + true negatives
- Use these as a feedback loop to come up with new features

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

ModelBuilder(data_set, 
             feature_gen, 
             #LogisticRegression(solver='lbfgs')
             #SVC(gamma='scale')
             RandomForestClassifier(random_state=0, n_estimators=50)
            ).build()

Generating EXACT_MATCH ... 
Generating CASE_INSENSITIVE_MATCH ... 
Generating WORD_COUNT_DIFF ... 
Generating LEVENSHTEIN_DISTANCE ... 

False Positive errors (our model predicted NLOC but was actually LOC):
 FP : IPRANASAL vs HIPRA
 FP : INDASEC DERMOSILK vs DERMASILK
 FP : NQL CONTENTANYWHERE vs NQL
 FP : OLYMPIC AIRLINES vs THE OLYMPICS
 FP : comfuture vs FUTURECOM
 FP : CRABTREE & EVELYN vs EVELYN
 FP : MOUNTAIN LIFE vs LIFE
 FP : SKYCADDIE vs SKY
 FP : SUPER SHARP TUBE vs SHARP
 FP : PASSION vs RED PASSION
 FP : BEE vs SAVEBEE
 FP : monbianco vs BIANCO.
 FP : TALKING HEAD vs HEAD

False Negative errors (our model predicted LOC but was actually NLOC):
 FN : RAIDER vs REBER
 FN : BEE ON vs FEMIBION
 FN : WHERE IMAGINATION BEGINS vs IMAGINARIUM
 FN : VivoMega vs OMEGA
 FN : tecsma vs SMA
 FN : SEVA vs SERA
 FN : LIPRIDIA vs VIPIDIA
 FN : Neo Classic vs NEOSS
 FN : IQ4HEALTH vs iHealth
 FN : DIAGOS vs DIA
 FN : PORTIC vs PORTICO
 FN : Binteract vs BINTERNET
 FN : easyswap vs EASYCAR
 