# Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.linear_model import LogisticRegression
from sklearn import datasets


In [2]:
cases = pd.read_csv('Data.nosync/Similar_TM.csv')
cases

Unnamed: 0,TM_A,TM_B,Decision
0,Simoniz,Permanize,1
1,Magnavoc,Multivox,1
2,Zirco,Cozirc,1
3,Platinum Puff,Platinum Plus,1
4,Maternity Yours,Your Maternity Shop,1
...,...,...,...
352,Lilton,Wilton,0
353,Nutricia,Nutritea,0
354,Glenreidh,An Reidhe,0
355,No Gunk No Junk,No Gunk Just Funk,0


In [3]:
import unidecode
from fuzzywuzzy import fuzz

from abydos.distance import (IterativeSubString, BISIM, DiscountedLevenshtein, Prefix, LCSstr, MLIPNS, Strcmp95,
MRA, Editex, SAPS, FlexMetric, JaroWinkler, HigueraMico, Sift4, Eudex, ALINE, PhoneticEditDistance)

from abydos.phonetic import PSHPSoundexFirst, Ainsworth
pshp_soundex_first = PSHPSoundexFirst()
pe = Ainsworth()

In [4]:
iss = IterativeSubString()
bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
jaro = JaroWinkler(mode='Jaro')
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
phonetic_edit = PhoneticEditDistance()

In [5]:
algos = [iss, bisim, dlev, prefix, lcs, mlipns, strcmp95, mra, editex, saps, flexmetric, jaro, higuera_mico, sift4, eudex,
         aline, phonetic_edit]

algo_names = ['iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix', 'lcsstr', 'mlipns', 'strcmp95', 'mra',
              'editex', 'saps', 'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
              'phoneticeditdistance']

In [6]:
from abydos.phones import *

In [7]:
def sum_ipa(name_a, name_b):
    feat1 = ipa_to_features(pe.encode(name_a))
    feat2 = ipa_to_features(pe.encode(name_b))
    score = sum(cmp_features(f1, f2) for f1, f2 in zip(feat1, feat2))/len(feat1)
    return score

In [8]:
import re

In [9]:
from tqdm import tqdm

def featurize(df):
    if len(df.columns)==3:
        df.columns=['a', 'b', 'target']
    elif len(df.columns)==2:
        df.columns=['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b' })
        
    df['TM_A'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['a']).lower()), axis=1)
    df['TM_B'] = df.apply(lambda row: re.sub(
        '[^a-zA-Z]+', '', unidecode.unidecode(row['b']).lower()), axis=1)
    

    df['partial'] = df.apply(lambda row: fuzz.partial_ratio(row.TM_A,row.TM_B), axis=1)
    df['tkn_sort'] = df.apply(lambda row: fuzz.token_sort_ratio(row.TM_A,row.TM_B), axis=1)
    df['tkn_set'] = df.apply(lambda row: fuzz.token_set_ratio(row.TM_A,row.TM_B), axis=1)
    
    df['sum_ipa'] = df.apply(lambda row: sum_ipa(row.TM_A,row.TM_B), axis=1)
    
    df['pshp_soundex_first'] = df.apply(
        lambda row: 1 if pshp_soundex_first.encode(row.TM_A)==pshp_soundex_first.encode(row.TM_B) else 0, axis=1)
    
    for i, algo in enumerate(algos):
            df[algo_names[i]] = df.progress_apply(lambda row: algo.sim(row.TM_A, row.TM_B), axis=1)

    return df

In [10]:

from itertools import combinations
import random
random.seed(30)

In [11]:
dftest=cases.loc[[2]]
dftest

Unnamed: 0,TM_A,TM_B,Decision
2,Zirco,Cozirc,1


In [12]:
tqdm.pandas()

df = featurize(cases)
df

  from pandas import Panel
100%|██████████| 357/357 [00:00<00:00, 14983.55it/s]
100%|██████████| 357/357 [00:00<00:00, 4844.95it/s]
100%|██████████| 357/357 [00:00<00:00, 3555.78it/s]
100%|██████████| 357/357 [00:00<00:00, 22298.50it/s]
100%|██████████| 357/357 [00:00<00:00, 12395.42it/s]
100%|██████████| 357/357 [00:00<00:00, 23828.24it/s]
100%|██████████| 357/357 [00:00<00:00, 13957.30it/s]
100%|██████████| 357/357 [00:00<00:00, 12443.42it/s]
100%|██████████| 357/357 [00:00<00:00, 2149.89it/s]
100%|██████████| 357/357 [00:00<00:00, 3237.89it/s]
100%|██████████| 357/357 [00:00<00:00, 2458.19it/s]
100%|██████████| 357/357 [00:00<00:00, 13097.57it/s]
100%|██████████| 357/357 [00:01<00:00, 248.76it/s]
100%|██████████| 357/357 [00:00<00:00, 16616.91it/s]
100%|██████████| 357/357 [00:00<00:00, 10659.92it/s]
100%|██████████| 357/357 [00:01<00:00, 195.11it/s]
100%|██████████| 357/357 [00:00<00:00, 1130.52it/s]


Unnamed: 0,a,b,target,TM_A,TM_B,partial,tkn_sort,tkn_set,sum_ipa,pshp_soundex_first,...,mra,editex,saps,flexmetric,jaro,higueramico,sift4,eudex,aline,phoneticeditdistance
0,Simoniz,Permanize,1,simoniz,permanize,57,50,50,0.658986,0,...,0.666667,0.555556,0.137931,0.566667,0.671958,0.430556,0.444444,0.654902,0.602128,0.727599
1,Magnavoc,Multivox,1,magnavoc,multivox,38,38,38,0.691532,0,...,0.500000,0.500000,0.304348,0.400000,0.583333,0.375000,0.375000,0.916667,0.545455,0.897177
2,Zirco,Cozirc,1,zirco,cozirc,89,73,73,0.806452,0,...,0.833333,0.500000,0.428571,0.600000,0.822222,0.547619,0.666667,0.786275,0.705882,0.634409
3,Platinum Puff,Platinum Plus,1,platinumpuff,platinumplus,83,83,83,0.895161,1,...,0.500000,0.833333,0.666667,0.833333,0.888889,0.769231,0.833333,1.000000,0.873529,0.913978
4,Maternity Yours,Your Maternity Shop,1,maternityyours,yourmaternityshop,74,65,65,0.743176,0,...,0.000000,0.588235,0.319149,0.538235,0.687675,0.490372,0.588235,0.463725,0.589655,0.646110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,Lilton,Wilton,0,lilton,wilton,83,83,83,0.943548,0,...,0.833333,0.833333,0.500000,0.833333,0.822222,0.833333,0.833333,0.866667,0.823529,0.943548
353,Nutricia,Nutritea,0,nutricia,nutritea,75,75,75,0.874552,1,...,0.833333,0.812500,0.565217,0.787500,0.833333,0.750000,0.750000,0.997549,0.937500,0.973790
354,Glenreidh,An Reidhe,0,glenreidh,anreidhe,80,71,71,0.498208,0,...,0.666667,0.666667,0.310345,0.666667,0.741402,0.588889,0.666667,0.784804,0.705882,0.655914
355,No Gunk No Junk,No Gunk Just Funk,0,nogunknojunk,nogunkjustfunk,67,77,77,0.822581,1,...,0.833333,0.678571,0.469388,0.700000,0.815873,0.637363,0.714286,0.995588,0.713415,0.808756


#### Export clean data to csv

In [None]:
# df.to_csv('Data.nosync/TM_features.csv', index=False)  

In [30]:
y = df.target
X = df.drop(columns = 'target')

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## TPOT AutoML pipeline

In [16]:
from tpot import TPOTClassifier

In [None]:
# pipeline_optimizer = TPOTClassifier(
#         scoring = 'f1', 
#         generations=100,
#         verbosity=2,
#         n_jobs=-1   # Utilizes all available CPU cores
#         ) 
# pipeline_optimizer.fit(X_train.drop(['a', 'b', 'TM_A', 'TM_B'],1), y_train)

In [None]:
# print(pipeline_optimizer.score(X_test.drop(['a', 'b', 'TM_A', 'TM_B'], 1), y_test))

Export TPOT pipeline

In [None]:
# pipeline_optimizer.export('tpot_exported_calssifier_pipeline.py')

Load TPOT pipeline

In [24]:
df_TPOT = df.drop(columns = ['a','b'])
df_TPOT

Unnamed: 0,target,TM_A,TM_B,partial,tkn_sort,tkn_set,sum_ipa,pshp_soundex_first,iterativesubstring,bisim,...,mra,editex,saps,flexmetric,jaro,higueramico,sift4,eudex,aline,phoneticeditdistance
0,1,simoniz,permanize,57,50,50,0.658986,0,0.485480,0.388889,...,0.666667,0.555556,0.137931,0.566667,0.671958,0.430556,0.444444,0.654902,0.602128,0.727599
1,1,magnavoc,multivox,38,38,38,0.691532,0,0.050000,0.437500,...,0.500000,0.500000,0.304348,0.400000,0.583333,0.375000,0.375000,0.916667,0.545455,0.897177
2,1,zirco,cozirc,89,73,73,0.806452,0,0.821263,0.583333,...,0.833333,0.500000,0.428571,0.600000,0.822222,0.547619,0.666667,0.786275,0.705882,0.634409
3,1,platinumpuff,platinumplus,83,83,83,0.895161,1,0.884677,0.833333,...,0.500000,0.833333,0.666667,0.833333,0.888889,0.769231,0.833333,1.000000,0.873529,0.913978
4,1,maternityyours,yourmaternityshop,74,65,65,0.743176,0,0.951613,0.588235,...,0.000000,0.588235,0.319149,0.538235,0.687675,0.490372,0.588235,0.463725,0.589655,0.646110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,0,lilton,wilton,83,83,83,0.943548,0,0.897436,0.750000,...,0.833333,0.833333,0.500000,0.833333,0.822222,0.833333,0.833333,0.866667,0.823529,0.943548
353,0,nutricia,nutritea,75,75,75,0.874552,1,0.804167,0.750000,...,0.833333,0.812500,0.565217,0.787500,0.833333,0.750000,0.750000,0.997549,0.937500,0.973790
354,0,glenreidh,anreidhe,80,71,71,0.498208,0,0.800858,0.611111,...,0.666667,0.666667,0.310345,0.666667,0.741402,0.588889,0.666667,0.784804,0.705882,0.655914
355,0,nogunknojunk,nogunkjustfunk,67,77,77,0.822581,1,0.852383,0.678571,...,0.833333,0.678571,0.469388,0.700000,0.815873,0.637363,0.714286,0.995588,0.713415,0.808756


In [33]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

In [25]:
# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = df_TPOT
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8945711361541637
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    GradientBoostingClassifier(learning_rate=0.01, max_depth=3, max_features=0.5, min_samples_leaf=10, min_samples_split=3, n_estimators=100, subsample=0.9500000000000001)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

ValueError: could not convert string to float: 'pip'

In [None]:
def base_model_1(X_train, X_test, y_train, y_test, export = False) :
    

### TPOT AutoML Regression

In [None]:
from tpot import TPOTRegressor

pipeline_optimizer = TPOTRegressor(
        scoring = 'f1', 
        generations=100,
        verbosity=2,
        n_jobs=-1   # Utilizes all available CPU cores
        ) 
pipeline_optimizer.fit(X_train.drop(['a', 'b', 'TM_A', 'TM_B'],1), y_train)

exported_pipeline = make_pipeline(