In [1]:
import recordlinkage
import pandas as pd 

In [2]:
# Load the song table using pandas' default encoding.
# NOTE(review): the `encoding='windows-1251'` argument was commented out here, yet the
# "Data Matching" section reads this same file with that encoding — confirm which is correct.
country_df = pd.read_csv('./top50countryDos.csv')
print(country_df.shape)  # (rows, columns)
country_df.head()

(476, 17)


Unnamed: 0,clave,titulo,artista,genero,anio,agregado,bpm,energia,baile,deciBeles,vivo,val,duracion,acustica,palabra,pop,pais
0,758,"10,000 Hours (with Justin Bieber)",Dan + Shay,contemporary country,2019,31/12/1969,90.0,63.0,65.0,-5.0,11.0,43.0,168,15.0,3.0,93,indonesia
1,777,100 Degrees,Rich Brian,indonesian hip hop,2019,31/12/1969,81.0,65.0,76.0,-5.0,52.0,66.0,166,12.0,7.0,77,indonesia
2,533,105 F Remix,KEVVO,perreo,2019,31/12/1969,100.0,75.0,90.0,-7.0,29.0,74.0,464,37.0,16.0,83,chile
3,191,2000 Miles - 2007 Remaster,Pretenders,album rock,1984,31/12/1969,66.0,71.0,22.0,-9.0,10.0,45.0,220,0.0,4.0,78,australia
4,314,3 Batidas - Ao Vivo,Guilherme & Benuto,sertanejo pop,2019,31/12/1969,118.0,71.0,55.0,-5.0,76.0,53.0,157,61.0,13.0,82,brazil


In [3]:
country_df.columns

Index(['clave', 'titulo', 'artista', 'genero', 'anio', 'agregado', 'bpm',
       'energia', 'baile', 'deciBeles', 'vivo', 'val', 'duracion', 'acustica',
       'palabra', 'pop', 'pais'],
      dtype='object')

In [4]:
from recordlinkage.preprocessing import phonetic

In [5]:
def compare(df, column, num=5):
    """Print the first ``num`` values of a column next to their Soundex codes.

    Args:
        df: DataFrame holding the column to inspect.
        column: Name of the column whose values are encoded.
        num: Maximum number of leading rows to show (default 5). If the
            frame has fewer rows, only the available rows are printed.

    Returns:
        None. A two-column table (original value, Soundex code) is written
        to standard output.
    """
    # Positional slice, so the helper works whatever labels the index carries.
    series = df[column].iloc[:num]
    sdx = phonetic(series, method='soundex')
    print("{:<20} {:<20}".format('Normal', 'Soundex'))
    print('----------------------------')
    # Walk the values pairwise instead of a hard-coded range(5): this honours
    # `num` and avoids label lookups that fail on short or re-indexed frames.
    for value, code in zip(series, sdx):
        print("{:<20} {:<20}".format(value, code))

In [6]:
compare(country_df, 'artista')

Normal               Soundex             
----------------------------
Dan + Shay           D520                
Rich Brian           R216                
KEVVO                K100                
Pretenders           P635                
Guilherme & Benuto   G465                


In [7]:
compare(country_df, 'genero')

Normal               Soundex             
----------------------------
contemporary country C535                
indonesian hip hop   I535                
perreo               P600                
album rock           A415                
sertanejo pop        S635                


In [8]:
compare(country_df, 'pais')

Normal               Soundex             
----------------------------
indonesia            I535                
indonesia            I535                
chile                C400                
australia            A236                
brazil               B624                


In [9]:
country_df[country_df.pais.isna()]

Unnamed: 0,clave,titulo,artista,genero,anio,agregado,bpm,energia,baile,deciBeles,vivo,val,duracion,acustica,palabra,pop,pais


## Indexing

In [10]:
import recordlinkage as rl 

In [11]:
# Reload the same CSV as the first load cell, again with pandas' default encoding
# (the `encoding='windows-1251'` argument was left commented out here).
country_df = pd.read_csv('./top50countryDos.csv')

In [29]:
# Build candidate record pairs by blocking on the song title (`titulo`).
block_indexer = rl.Index()
block_indexer.block('titulo')
# Only one frame is passed, so the pairs are formed among country_df's own rows.
cl = block_indexer.index(country_df)

In [30]:
print(len(cl))  # 29 candidate record pairs to compare

29


In [31]:
print(cl)

MultiIndex([(447, 379),
            (448, 380),
            (449, 381),
            (450, 382),
            (451, 383),
            (452, 384),
            (453, 385),
            (454, 386),
            (455, 387),
            (456, 388),
            (457, 389),
            (458, 390),
            (459, 391),
            (460, 392),
            (461, 393),
            (462, 394),
            (463, 395),
            (464, 396),
            (465, 397),
            (466, 398),
            (467, 399),
            (468, 400),
            (469, 401),
            (470, 402),
            (471, 403),
            (472, 404),
            (473, 405),
            (474, 406),
            (475, 407)],
           )


In [32]:
# Register one comparison per field, then score every candidate pair in `cl`.
compare_cl = rl.Compare()
# (comparison method, column compared on both sides, output feature label)
field_comparisons = (
    (compare_cl.exact, 'titulo', 'song_title'),
    (compare_cl.string, 'artista', 'artist'),
    (compare_cl.string, 'genero', 'genre'),
    (compare_cl.numeric, 'anio', 'year'),
    (compare_cl.exact, 'agregado', 'date_added'),
    (compare_cl.string, 'pais', 'country'),
)
for add_comparison, column, label in field_comparisons:
    add_comparison(column, column, label=label)
# Both sides of each pair come from country_df.
features = compare_cl.compute(cl, country_df)

#compare_cl.numeric('bpm', 'bpm', label='bpm')
#compare_cl.numeric('energia', 'energia', label='energy')
#compare_cl.numeric('baile', 'baile', label='danceabilty')
#compare_cl.numeric('deciBeles', 'deciBeles', label='dB')
#compare_cl.numeric('vivo', 'vivo', label='live')
#compare_cl.numeric('val', 'val', label='val')
#compare_cl.numeric('duracion', 'duracion', label='length')
#compare_cl.numeric('acustica', 'acustica', label='acoustics')
#compare_cl.numeric('palabra', 'palabra', label='word')
#compare_cl.numeric('pop', 'pop', label='pop')

In [33]:
features.head()

Unnamed: 0,Unnamed: 1,song_title,artist,genre,year,date_added,country
447,379,1,1.0,1.0,1.0,1,1.0
448,380,1,1.0,1.0,1.0,1,1.0
449,381,1,1.0,1.0,1.0,1,1.0
450,382,1,1.0,1.0,1.0,1,1.0
451,383,1,1.0,1.0,1.0,1,1.0


In [34]:
features.describe()

Unnamed: 0,song_title,artist,genre,year,date_added,country
count,29.0,29.0,29.0,29.0,29.0,29.0
mean,1.0,1.0,1.0,1.0,1.0,1.0
std,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,1.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
# Keep the candidate pairs whose comparison scores add up to more than 5.
matches = features.loc[features.sum(axis=1).gt(5)]
print(f'Registros correspondienes: {len(matches)} \nNo correspondientes: {len(features) - len(matches)}')

Registros correspondienes: 29 
No correspondientes: 0


In [36]:
# Count the pairs whose comparison scores are all exactly 1.
# The previous `matches[matches == True]` only blanked non-matching cells to NaN
# without dropping rows, so its length was always len(matches).
int((matches == 1).all(axis=1).sum())

29

In [37]:
# Load the top50country.csv file, decoded as windows-1251, and preview it.
# NOTE(review): in the preview shown below, `titulo` holds numeric-looking values
# (0.4, 0.958333333, 22.0) — presumably titles altered upstream of this file; confirm.
df = pd.read_csv('top50country.csv', encoding='windows-1251')
df.head()

Unnamed: 0,clave,titulo,artista,genero,anio,agregado,bpm,energia,baile,deciBeles,vivo,val,duracion,acustica,palabra,pop,pais
0,671,0.4,Aya Nakamura,basshall,2019,31/12/1969,92.0,69.0,84.0,-7.0,13.0,68.0,175,40.0,21.0,78,france
1,125,0.958333333,Maluma,latin,2019,31/12/1969,96.0,71.0,78.0,-5.0,9.0,68.0,176,22.0,28.0,88,argentina
2,273,0.958333333,Maluma,latin,2019,31/12/1969,96.0,71.0,78.0,-5.0,9.0,68.0,176,22.0,28.0,88,bolivia
3,472,0.958333333,Maluma,latin,2019,31/12/1969,96.0,71.0,78.0,-5.0,9.0,68.0,176,22.0,28.0,88,colombia
4,136,22.0,TINI,latin pop,2019,31/12/1969,176.0,64.0,70.0,-5.0,9.0,81.0,159,1.0,9.0,76,argentina


## Data Matching

In [38]:
import recordlinkage as rl
# Load the two song tables that are linked against each other below,
# both decoded as windows-1251.
df_one = pd.read_csv('top50countryDos.csv', encoding='windows-1251')
df_two = pd.read_csv('top50countryTres.csv', encoding='windows-1251')

In [39]:
print(df_one.shape)
print(df_two.shape)

(476, 17)
(515, 17)


In [40]:
# Block on the song title so only records with the same `titulo` are paired.
# Uses the rl.Index().block(...) API, as the "Indexing" section does, in place
# of the legacy rl.BlockIndex class.
indexer = rl.Index()
indexer.block('titulo')
# Two frames are passed: each pair is (row label in df_one, row label in df_two).
candidate_links = indexer.index(df_one, df_two)

In [41]:
print(len(candidate_links))
print(candidate_links)

512
MultiIndex([(  0,   0),
            (  1,   1),
            (  2,   2),
            (  3,   3),
            (  4,   4),
            (  5,   5),
            (  6,   6),
            (  7,   7),
            ( 10,  10),
            ( 11,  11),
            ...
            (437, 505),
            (438, 506),
            (439, 507),
            (440, 508),
            (441, 509),
            (442, 510),
            (443, 511),
            (444, 512),
            (445, 513),
            (446, 514)],
           length=512)


In [42]:
# Register one comparison per field, then score every candidate pair.
compare_cl = rl.Compare()
# (comparison method, column compared on both sides, output feature label)
field_comparisons = (
    (compare_cl.exact, 'titulo', 'song_title'),
    (compare_cl.string, 'artista', 'artist'),
    (compare_cl.string, 'genero', 'genre'),
    (compare_cl.numeric, 'anio', 'year'),
    (compare_cl.exact, 'agregado', 'date_added'),
    (compare_cl.string, 'pais', 'country'),
)
for add_comparison, column, label in field_comparisons:
    add_comparison(column, column, label=label)
# Left-hand values come from df_one, right-hand values from df_two.
features = compare_cl.compute(candidate_links, df_one, df_two)

In [43]:
features.head()

Unnamed: 0,Unnamed: 1,song_title,artist,genre,year,date_added,country
0,0,1,1.0,1.0,1.0,1,1.0
1,1,1,1.0,1.0,1.0,1,1.0
2,2,1,1.0,1.0,1.0,1,1.0
3,3,1,1.0,1.0,1.0,1,1.0
4,4,1,1.0,1.0,1.0,1,1.0


In [44]:
features.describe()

Unnamed: 0,song_title,artist,genre,year,date_added,country
count,512.0,512.0,512.0,512.0,512.0,512.0
mean,1.0,0.979244,0.984375,1.0,1.0,1.0
std,0.0,0.082853,0.124141,0.0,0.0,0.0
min,1.0,0.0,0.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,1.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [45]:
# Select the pairs whose summed comparison scores exceed 5. The mask is a
# plain NumPy array, so it is applied by position rather than by index label.
matches = features.loc[features.sum(axis=1).to_numpy() > 5]
print(len(matches))
matches.head()

503


Unnamed: 0,Unnamed: 1,song_title,artist,genre,year,date_added,country
0,0,1,1.0,1.0,1.0,1,1.0
1,1,1,1.0,1.0,1.0,1,1.0
2,2,1,1.0,1.0,1.0,1,1.0
3,3,1,1.0,1.0,1.0,1,1.0
4,4,1,1.0,1.0,1.0,1,1.0
