# Boosted Gradient:Levenshtein

In [72]:
%pylab inline

#For data
import pandas as pd

#For modelling
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import Levenshtein as lev

Populating the interactive namespace from numpy and matplotlib


## Importation of data

In [73]:
communes_dataframe = pd.read_csv("COMMUNES.csv", delimiter=";")
communes_dataframe

Unnamed: 0,insee,nom,altitude,code_postal,longitude,latitude,pop99,surface,departement,region,indicatif
0,1001,L'Abergement-Clémenciat,257.0,1400,0.085856,0.805508,728,15.70,1,RA,4
1,1002,L'Abergement-de-Varey,367.0,1640,0.094665,0.802987,168,9.14,1,RA,4
2,1004,Ambérieu-en-Bugey,247.0,1500,0.093356,0.802085,11436,24.51,1,RA,4
3,1005,Ambérieux-en-Dombes,293.0,1330,0.085594,0.802784,1408,16.06,1,RA,4
4,1006,Ambléon,400.0,1300,0.097777,0.798493,86,6.03,1,RA,4
5,1007,Ambronay,250.0,1500,0.093540,0.802992,2146,33.64,1,RA,4
6,1008,Ambutrix,270.0,1500,0.093239,0.801795,586,5.19,1,RA,4
7,1009,Andert-et-Condon,330.0,1300,0.098718,0.799283,275,6.95,1,RA,4
8,1010,Anglefort,250.0,1350,0.101384,0.801315,769,29.49,1,RA,4
9,1011,Apremont,900.0,1100,0.098849,0.806517,329,15.09,1,RA,4


## Preparation of data

### Creation of medians/region

In [74]:
ALL_COMMUNES=communes_dataframe.values
ALL_COMMUNES[where(ALL_COMMUNES[:,10]==1)]

array([[75056, 'Paris', 60.0, ..., 75, 'IF', 1],
       [77001, 'Achères-la-Forêt', 110.0, ..., 77, 'IF', 1],
       [77002, 'Amillis', 110.0, ..., 77, 'IF', 1],
       ...,
       [95680, 'Villiers-le-Bel', 104.0, ..., 95, 'IF', 1],
       [95682, 'Villiers-le-Sec', 130.0, ..., 95, 'IF', 1],
       [95690, 'Wy-dit-Joli-Village', 125.0, ..., 95, 'IF', 1]],
      dtype=object)

In [75]:
COMMUNES, TEST_COMMUNES = train_test_split(ALL_COMMUNES, test_size=0.099, random_state=10)

In [76]:
lev.median(['lille','mille','chilo'])

'cille'

In [77]:
#set(ALL_COMMUNES[:,9])

In [78]:
def median_per_region(INPUT=COMMUNES,SET=True,Improved=10):
    
    OUTPUT=[]
    regions=set(INPUT[:,9])
    
    for i in regions:
        
        print('current region ',i)
        tmp=INPUT[where(INPUT[:,9]==i)]
        tmp2=tmp[:,1]
        
        if SET: median=lev.setmedian(tmp2)
        else: median=lev.median(tmp2)
            
        print('The median is ',median)
        
        
        if Improved>0:
            improve_median=lev.median_improve(median,tmp2)
            for j in range(10):
                improve_median=lev.median_improve(improve_median,tmp2)
            print('The improved median is ',improve_median)
            median=improve_median
        
        OUTPUT.append(median)
        print('\n','--------------')
        
    return OUTPUT

print(COMMUNES[0:5,:],'\n','--------------')
median_per_region(COMMUNES[0:5,:])
        

[[72163 'Ligron' 95.0 '72270' 0.00012119565958728001 0.833801981737 358
  13.67 72 'PL' 2]
 [67456 'Schopperten' 220.0 '67260' 0.12305056754116 0.8542513970623 292
  4.42 67 'AL' 3]
 [27588 "Saint-Pierre-d'Autils" 20.0 '27950' 0.025084256050807996
  0.8572960559785301 1036 7.18 27 'HN' 2]
 [60264 'Frocourt' 87.0 '60000' 0.03638526362326 0.86188235227079 514
  6.49 60 'PI' 3]
 [77496 'Vieux-Champagne' 130.0 '77370' 0.054793636450322994
  0.8479003570737099 190 8.89 77 'IF' 1]] 
 --------------
current region  HN
The median is  Saint-Pierre-d'Autils
The improved median is  Saint-Pierre-d'Autils

 --------------
current region  PI
The median is  Frocourt
The improved median is  Frocourt

 --------------
current region  AL
The median is  Schopperten
The improved median is  Schopperten

 --------------
current region  IF
The median is  Vieux-Champagne
The improved median is  Vieux-Champagne

 --------------
current region  PL
The median is  Ligron
The improved median is  Ligron

 ----------

["Saint-Pierre-d'Autils",
 'Frocourt',
 'Schopperten',
 'Vieux-Champagne',
 'Ligron']

In [79]:
medians=median_per_region()

current region  PI
The median is  Maulers
The improved median is  aineure

 --------------
current region  BN
The median is  Banville
The improved median is  Saneil-e

 --------------
current region  BO
The median is  Saules
The improved median is  aines

 --------------
current region  PA
The median is  Sannes
The improved median is  Saines

 --------------
current region  CA
The median is  Caurel
The improved median is  aie-ure

 --------------
current region  RA
The median is  Baneins
The improved median is  San-ene

 --------------
current region  BR
The median is  Santec
The improved median is  Lan-en

 --------------
current region  AL
The median is  Biesheim
The improved median is  iesheie

 --------------
current region  PC
The median is  Saintes
The improved median is  Sainae

 --------------
current region  HN
The median is  Barville
The improved median is  aineille

 --------------
current region  LO
The median is  Barville
The improved median is  aenilre

 --------------
cu

## Calculation of the distances betwenn cities and medians

In [93]:
def lev_data(INPUT=COMMUNES,B=True,medians=medians):
    
    DATA=INPUT[:,1]
    OUTPUT=[]
    
    for s in DATA:
        
        tmp=[]
        
        for m in medians:
        
            if(B):tmp.append(lev.distance(s,m))
            
            else:tmp.append(lev.jaro(s,m))
            
        OUTPUT.append(array(tmp))
        
    return array(OUTPUT)

lev_data(COMMUNES[1:5,:])

array([[ 9,  9, 10,  9,  9,  8,  9,  9,  9, 10,  9,  8,  9,  9,  8,  8,
        10, 10,  8,  9,  8],
       [15, 16, 16, 15, 16, 15, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15,
        16, 16, 18, 16, 14],
       [ 6,  8,  8,  8,  6,  8,  8,  8,  8,  8,  7,  8,  8,  7,  8,  8,
         8,  8,  7,  8,  8],
       [12, 13, 13, 13, 11, 12, 13, 11, 12, 13, 13, 12, 12, 12, 13, 12,
        12, 12, 11, 11, 14]])

In [94]:
lev_data(COMMUNES[1:5,:],False)

array([[0.56709957, 0.54924242, 0.43030303, 0.50505051, 0.56709957,
        0.56709957, 0.41919192, 0.56709957, 0.50505051, 0.47727273,
        0.32251082, 0.50505051, 0.50505051, 0.56709957, 0.56709957,
        0.54924242, 0.41919192, 0.41919192, 0.50505051, 0.48917749,
        0.45598846],
       [0.58412698, 0.62301587, 0.58095238, 0.61904762, 0.58412698,
        0.65079365, 0.54761905, 0.41269841, 0.69047619, 0.62103175,
        0.51746032, 0.61904762, 0.69047619, 0.65873016, 0.65079365,
        0.68849206, 0.69047619, 0.54761905, 0.54761905, 0.65079365,
        0.71428571],
       [0.3452381 , 0.        , 0.        , 0.        , 0.3452381 ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.42261905, 0.        , 0.        , 0.42261905, 0.        ,
        0.        , 0.        , 0.        , 0.43055556, 0.        ,
        0.42261905],
       [0.54285714, 0.41388889, 0.51111111, 0.48888889, 0.52936508,
        0.30634921, 0.32222222, 0.54285714, 0.4888888

In [95]:
X_lev_train=lev_data(COMMUNES)
X_lev_test=lev_data(TEST_COMMUNES)

X_jaro_train=lev_data(COMMUNES,False)
X_jaro_test=lev_data(TEST_COMMUNES,False)

shape(X_jaro_test)

(3585, 21)

In [96]:
def response_initialization(INPUT):
    
    #print(INPUT)
    
    i=1
    OUTPUT=INPUT
    TMP=[]
    
    for e in INPUT:
        
        if TMP.count(e)<= 0: TMP.append(e)
            
    #print(TMP)
    
    for e in TMP:
            
            OUTPUT[where(OUTPUT==e)]=i
            i+=1
        
    return OUTPUT   

In [97]:
Y_train=response_initialization(COMMUNES[:,9])
Y_test=response_initialization(TEST_COMMUNES[:,9])
shape(Y_test)

(3585,)

In [98]:
dtrain_lev=xgb.DMatrix(X_lev_train,Y_train)
dtest_lev=xgb.DMatrix(X_lev_test,Y_test)

dtrain_jaro=xgb.DMatrix(X_jaro_train,Y_train)
dtest_jaro=xgb.DMatrix(X_jaro_test,Y_test)

  "memory consumption")


## Model creation

In [99]:
params = {'booster':'gbtree','objective': 'multi:softprob','num_class':22,'eval_metric':'mlogloss','max_depth':3,'nthread':10}

In [100]:
evallist = [(dtest_lev, 'eval'), (dtrain_lev, 'train')]
num_round = 500

xg_tree=xgb.train(params, dtrain_lev, num_round,evallist,early_stopping_rounds=10)

[0]	eval-mlogloss:3.09034	train-mlogloss:3.04148
Multiple eval metrics have been passed: 'train-mlogloss' will be used for early stopping.

Will train until train-mlogloss hasn't improved in 10 rounds.
[1]	eval-mlogloss:3.09285	train-mlogloss:3.00502
[2]	eval-mlogloss:3.09771	train-mlogloss:2.97857
[3]	eval-mlogloss:3.10522	train-mlogloss:2.95817
[4]	eval-mlogloss:3.11278	train-mlogloss:2.94100
[5]	eval-mlogloss:3.12047	train-mlogloss:2.92720
[6]	eval-mlogloss:3.12747	train-mlogloss:2.91594
[7]	eval-mlogloss:3.13452	train-mlogloss:2.90536
[8]	eval-mlogloss:3.14114	train-mlogloss:2.89687
[9]	eval-mlogloss:3.14777	train-mlogloss:2.88995
[10]	eval-mlogloss:3.15388	train-mlogloss:2.88293
[11]	eval-mlogloss:3.15917	train-mlogloss:2.87661
[12]	eval-mlogloss:3.16439	train-mlogloss:2.87119
[13]	eval-mlogloss:3.16860	train-mlogloss:2.86599
[14]	eval-mlogloss:3.17244	train-mlogloss:2.86170
[15]	eval-mlogloss:3.17694	train-mlogloss:2.85724
[16]	eval-mlogloss:3.18000	train-mlogloss:2.85287
[17]	ev

[160]	eval-mlogloss:3.34728	train-mlogloss:2.61182
[161]	eval-mlogloss:3.34783	train-mlogloss:2.61065
[162]	eval-mlogloss:3.34828	train-mlogloss:2.60939
[163]	eval-mlogloss:3.34835	train-mlogloss:2.60831
[164]	eval-mlogloss:3.34925	train-mlogloss:2.60706
[165]	eval-mlogloss:3.35033	train-mlogloss:2.60578
[166]	eval-mlogloss:3.35091	train-mlogloss:2.60439
[167]	eval-mlogloss:3.35169	train-mlogloss:2.60328
[168]	eval-mlogloss:3.35291	train-mlogloss:2.60202
[169]	eval-mlogloss:3.35427	train-mlogloss:2.60088
[170]	eval-mlogloss:3.35512	train-mlogloss:2.59979
[171]	eval-mlogloss:3.35571	train-mlogloss:2.59854
[172]	eval-mlogloss:3.35620	train-mlogloss:2.59715
[173]	eval-mlogloss:3.35742	train-mlogloss:2.59606
[174]	eval-mlogloss:3.35786	train-mlogloss:2.59492
[175]	eval-mlogloss:3.35863	train-mlogloss:2.59389
[176]	eval-mlogloss:3.35938	train-mlogloss:2.59282
[177]	eval-mlogloss:3.35999	train-mlogloss:2.59171
[178]	eval-mlogloss:3.36093	train-mlogloss:2.59069
[179]	eval-mlogloss:3.36127	tra

[321]	eval-mlogloss:3.45407	train-mlogloss:2.45200
[322]	eval-mlogloss:3.45460	train-mlogloss:2.45124
[323]	eval-mlogloss:3.45560	train-mlogloss:2.45053
[324]	eval-mlogloss:3.45648	train-mlogloss:2.44966
[325]	eval-mlogloss:3.45692	train-mlogloss:2.44878
[326]	eval-mlogloss:3.45746	train-mlogloss:2.44791
[327]	eval-mlogloss:3.45796	train-mlogloss:2.44700
[328]	eval-mlogloss:3.45843	train-mlogloss:2.44612
[329]	eval-mlogloss:3.45853	train-mlogloss:2.44529
[330]	eval-mlogloss:3.45886	train-mlogloss:2.44440
[331]	eval-mlogloss:3.45948	train-mlogloss:2.44349
[332]	eval-mlogloss:3.46004	train-mlogloss:2.44249
[333]	eval-mlogloss:3.46045	train-mlogloss:2.44150
[334]	eval-mlogloss:3.46063	train-mlogloss:2.44070
[335]	eval-mlogloss:3.46137	train-mlogloss:2.43978
[336]	eval-mlogloss:3.46178	train-mlogloss:2.43914
[337]	eval-mlogloss:3.46215	train-mlogloss:2.43826
[338]	eval-mlogloss:3.46294	train-mlogloss:2.43749
[339]	eval-mlogloss:3.46340	train-mlogloss:2.43681
[340]	eval-mlogloss:3.46353	tra

[482]	eval-mlogloss:3.53993	train-mlogloss:2.33412
[483]	eval-mlogloss:3.54074	train-mlogloss:2.33336
[484]	eval-mlogloss:3.54142	train-mlogloss:2.33271
[485]	eval-mlogloss:3.54195	train-mlogloss:2.33224
[486]	eval-mlogloss:3.54221	train-mlogloss:2.33154
[487]	eval-mlogloss:3.54275	train-mlogloss:2.33100
[488]	eval-mlogloss:3.54293	train-mlogloss:2.33044
[489]	eval-mlogloss:3.54328	train-mlogloss:2.32970
[490]	eval-mlogloss:3.54380	train-mlogloss:2.32905
[491]	eval-mlogloss:3.54420	train-mlogloss:2.32832
[492]	eval-mlogloss:3.54484	train-mlogloss:2.32765
[493]	eval-mlogloss:3.54536	train-mlogloss:2.32694
[494]	eval-mlogloss:3.54555	train-mlogloss:2.32637
[495]	eval-mlogloss:3.54556	train-mlogloss:2.32560
[496]	eval-mlogloss:3.54609	train-mlogloss:2.32483
[497]	eval-mlogloss:3.54665	train-mlogloss:2.32418
[498]	eval-mlogloss:3.54713	train-mlogloss:2.32342
[499]	eval-mlogloss:3.54740	train-mlogloss:2.32271


In [106]:
xg_tree.attributes()

{'best_iteration': '499',
 'best_msg': '[499]\teval-mlogloss:3.54740\ttrain-mlogloss:2.32271',
 'best_score': '2.322708'}