### Load 500 examples of EURIPO trademark court decisions

* Columns: 'mark', 'earlier_mark', 'target_label'
* 250 examples of NLOC (target_label == 0), 250 examples of LOC (target_label == 1)

In [14]:
import pandas as pd

data_set = pd.read_csv('../500_tm_loc_decisions.csv', index_col=0)
data_set.head()


Unnamed: 0,mark,earlier_mark,target_label
2373,LOOP,JOOP!,1
5793,ABACELL,AVICEL,1
294,SUVIQUE,ZUBIQUE,1
1272,EVELYN,EBELIN,1
1990,EBELIN,EVELYN,1


### Model builder class

* Instantiate with a data source, a features dict (key = name, value = transform function) and an instantiated classifer algorithm

In [15]:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.metrics import auc, roc_curve, roc_auc_score, f1_score, recall_score, precision_score, confusion_matrix

class ModelBuilder:
    
    def __init__(self, data_source, features, algorithm):
        self.data_source = data_source
        self.features = features
        self.algorithm = algorithm
        self.metrics = {}
        self.false_positives = []
        self.false_negatives = []        
        
    def predict(self, mark, earlier_mark):
        p_vals = []
        for feature_key, feature_func in self.features.items():
            p_vals.append(feature_func(mark, earlier_mark))
        
        frame = pd.DataFrame([p_vals], columns=self.features.keys())
        selected = frame.loc[:, ]
        prediction = self.pipeline.predict(selected)[0]
        predicted_probs = self.pipeline.predict_proba(selected)[0]
        
        print("Prediction: %d (%s) probability = NLOC=%.3f, LOC=%.3f" % (prediction, 
                                                                         'LOC' if prediction else 'NLOC', 
                                                                         predicted_probs[0],
                                                                         predicted_probs[1]))
    def build(self):
        # Build the model features ...
        for feature_key, feature_func in self.features.items():
            print("Generating %s ... " % feature_key)
            for _index, row in self.data_source.iterrows():
                self.data_source.at[_index, feature_key] = feature_func(row['mark'], row['earlier_mark'])
            
        X = self.data_source[self.features.keys()]
        y = self.data_source['target_label']
        
        train_dataset, test_dataset, train_labels, test_labels = train_test_split(
            X, y, test_size=0.25, random_state=0)
        
        self.pipeline = Pipeline([
            ('scaler', preprocessing.MinMaxScaler()),
            ('clf', self.algorithm)
        ])
        
        self.pipeline.fit(train_dataset, train_labels)
        
        y_score = self.pipeline.predict(test_dataset)

        cm = confusion_matrix(test_labels, y_score, labels=[1,0])
        
        cm_dict = {
            'TP': int(cm[0][0]),
            'FP': int(cm[0][1]),
            'FN': int(cm[1][0]),
            'TN': int(cm[1][1])
        }
        
        err_metrics_pretty = (
            'A        Predicted       Error Metrics\n' 
            'c       True False\n'
            't  True  %3d   %3d     F1 Score %6.3f\n' 
            'u                     Precision %6.3f\n'
            'a False  %3d   %3d       Recall %6.3f\n'
            'l                       ROC AUC %6.3f') % (int(cm[0][0]), 
                                                        int(cm[0][1]), 
                                                        f1_score(test_labels, y_score),
                                                        precision_score(test_labels, y_score),
                                                        int(cm[1][0]), 
                                                        int(cm[1][1]),
                                                        recall_score(test_labels, y_score),
                                                        roc_auc_score(test_labels, y_score))
 
        
        fpr, tpr, thresholds = roc_curve(test_labels, y_score)
                
        cnt = None
        for cnt, _idx in enumerate(test_dataset.index[(np.ravel(test_labels.values) == 1.0) & (y_score == 0.0)]):
            row = self.data_source.loc[_idx]
            self.false_positives.append(" FP : %s vs %s" % (row['mark'], row['earlier_mark']))
        
        cnt = None
        for cnt, _idx in enumerate(test_dataset.index[(np.ravel(test_labels.values) == 0.0) & (y_score == 1.0)]):
            row = self.data_source.loc[_idx]
            self.false_negatives.append(" FN : %s vs %s" % (row['mark'], row['earlier_mark']))
            
        print()
        print(err_metrics_pretty)
        return self

### Feature Class

If you want to add new features then define them here as methods and wire them up in the `feature_gen` map.

In [16]:
import jellyfish
from fuzzywuzzy import fuzz

class TrademarkFeatures:

    def __init__(self, mark, earlier_mark):
        self.mark = mark
        self.earlier_mark = earlier_mark

    def levenshtein_distance(self):
        return jellyfish.levenshtein_distance(self.mark.lower(), self.earlier_mark.lower())
    
    

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

### Define the feature set

- TODO: add other feature suggestions ...

In [18]:


feature_gen = {
    'ALWAYS_0': lambda m1,m2: 0
    #'CASE_INSENSITIVE_MATCH': lambda mark, earlier_mark: mark.lower() == earlier_mark.lower(),
    #'LEVENSHTEIN_DISTANCE': lambda mark, earlier_mark: TrademarkFeatures(mark, earlier_mark).levenshtein_distance() 
}

### Execute!

- Returns error metrics (f1_score, precision, recall, confusion matrix)
- Example of false positives + true negatives
- Use these as a feedback loop to come up with new features

In [19]:
print("Model A - Candidate Model")
print("---------------------")
model_A = ModelBuilder(data_set, 
             feature_gen, 
             LogisticRegression(solver='lbfgs'),
             #KNeighborsClassifier(),
            ).build()

Model A - Candidate Model
---------------------
Generating ALWAYS_0 ... 

A        Predicted       Error Metrics
c       True False
t  True   62     0     F1 Score  0.663
u                     Precision  0.496
a False   63     0       Recall  1.000
l                       ROC AUC  0.500


In [20]:
print("Model B")
print("---------------------")
model_B = ModelBuilder(data_set, 
             feature_gen, 
             LogisticRegression(solver='lbfgs'),
             #AdaBoostClassifier(),
            ).build()
print("\nFalse Positive errors (our model predicted NLOC but was actually LOC):")
print("\n".join(model_B.false_positives))
print("\nFalse Negative errors (our model predicted LOC but was actually NLOC):")
print("\n".join(model_B.false_negatives))

Model B
---------------------
Generating ALWAYS_0 ... 

A        Predicted       Error Metrics
c       True False
t  True   62     0     F1 Score  0.663
u                     Precision  0.496
a False   63     0       Recall  1.000
l                       ROC AUC  0.500

False Positive errors (our model predicted NLOC but was actually LOC):


False Negative errors (our model predicted LOC but was actually NLOC):
 FN : ROADFLIRT vs FlirtCast
 FN : shoo-be-doo vs SEA-DOO
 FN : RELEASE THE BEAST vs REHAB THE BEAST!
 FN : WINE PASSION vs WINES OF PASSION
 FN : ARCOX vs ARCAL
 FN : i.am vs IM+
 FN : RUBINO ROSSO vs RIBENA
 FN : Adero vs ATEGO
 FN : Xplus vs CYPLUS
 FN : IVINCI vs DA VINCI
 FN : SEVA vs SERA
 FN : STRESSEN vs STRESSGEN
 FN : LACTOFIDUS vs LACTOFIL ULTRALIFE
 FN : SUNLINE vs SUN MICROSYSTEMS FINANCE
 FN : Stonecare vs STONCOR
 FN : RELEASE THE BEAST vs UNLEASH THE NITRO BEAST!
 FN : SISVEL WE PROTECT IDEAS vs IDEAS TV
 FN : PELI PROGEAR vs PELÉ
 FN : BROOKS BROTHERS vs Croops


### Make a prediction

So you're feeling lucky huh? Let's see what happens when we try to predict the outcome of a few trademarks we haven't seen in the training set. 

In [21]:
model_A.predict("TOBFIN", "TAPFINN")

Prediction: 1 (LOC) probability = NLOC=0.499, LOC=0.501


In [22]:
model_A.predict("QUINTA DOM VICENTE", "HERO")

Prediction: 1 (LOC) probability = NLOC=0.499, LOC=0.501


In [23]:
model_A.predict("RED DRAGON", "GREEN DRAGON")

Prediction: 1 (LOC) probability = NLOC=0.499, LOC=0.501
