#### Install dependencies and import packages

In [1]:
from fuzzywuzzy import fuzz
import pandas as pd
import pickle
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

#### Connect to sqlite and read data sources

In [2]:
con = sqlite3.connect("db.db")

In [3]:
sr = pd.read_sql_query("SELECT * from soundrecording", con).set_index("id")
sr

Unnamed: 0_level_0,sr_id,title,artists,isrcs,contributors
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,spotify_apidsr__2NbYAPqE6FTyQte9kW4vgr,Astronomia - Tequila Edit,Bing Lee,ITZB42033435,Edizioni Lungoviaggio|Victor Pool|Ruben Christ...
2,crawler_believe__26052217,Astronomia (feat. Tish),"Marco Marzi, Marco Skarica, David White",ITF341800025,Игумнов
3,crawler_believe__34028360,Astronomia (Coffin Dance) [Dance Edit],Josh Nor,FR2X42061192,Victor Pool|Ruben den Boer|Антон Игумнов
4,crawler_believe__34168410,Astronomia (Coffin Dance) [Tequila Edit],Josh Nor,FR96X2013991,Victor Pool|Ruben den Boer|Антон Игумнов
5,crawler_fuga__7427128907609_1_6_ITZB42136782,Astronomia (Purple Mix),Josh Nor,ITZB42136782,Ruben Christopher Den Boer|Anton Igumnov|Victo...
...,...,...,...,...,...
55545,spotify_apidsr__3x99UdcqjXhQcqdgadKeXA,Belladoona,Masahiko Sato,GBRNP1400106,M. Sato
55546,youtube_dsr__A219026358613851,BELLADOONA,MASAHIKO SATO,GBRNP1400106,SATOU MASAHIKO
55547,spotify__3x99UdcqjXhQcqdgadKeXA,Belladoona,Masahiko Sato,GBRNP1400106,M. Sato
55548,spotify_apidsr__6ozP4Td6SyhEMhpqjocwQO,Belladoona,Masahiko Sato,GBRNP1400106,


In [4]:
groundtruth = pd.read_csv("groundtruth.csv")
groundtruth

Unnamed: 0,q_source_id,m_source_id,tag
0,spotify_apidsr__2NbYAPqE6FTyQte9kW4vgr,crawler_believe__26052217,invalid
1,crawler_believe__34028360,crawler_believe__34168410,valid
2,crawler_fuga__7427128907609_1_6_ITZB42136782,crawler_believe__42573832,valid
3,crawler_believe__34168476,spotify_apidsr__3kOHtCewbmdWgMVgJ8rpkC,invalid
4,spotify_apidsr__28JA0VuEMS8i3N6fpRXr2M,spotify_apidsr__1d6j1PD3Z8NqbCgCYKDbCy,invalid
...,...,...,...
28366,apple__1354975784,youtube_dsr__A461439239803827,valid
28367,apple__1052537885,crawler_pias__5060099505690_1_2_GBRNP1400106,valid
28368,crawler_247__5060099505690_GBRNP1400106,spotify_apidsr__3x99UdcqjXhQcqdgadKeXA,valid
28369,youtube_dsr__A219026358613851,spotify__3x99UdcqjXhQcqdgadKeXA,valid


#### Prepare dataset to train model

In [5]:
dataset = (groundtruth
           .set_index("q_source_id")
           .join(sr.rename(columns={"sr_id": "q_source_id"}).set_index("q_source_id"))
           .rename(columns={"title": "q_title", "artists": "q_artists", "isrcs": "q_isrcs", "contributors": "q_contributors"})
           .reset_index()
           .set_index("m_source_id")
           .join(sr.rename(columns={"sr_id": "m_source_id"}).set_index("m_source_id"))
           .rename(columns={"title": "m_title", "artists": "m_artists", "isrcs": "m_isrcs", "contributors": "m_contributors"})
           .reset_index()
          )

In [6]:
dataset

Unnamed: 0,m_source_id,q_source_id,tag,q_title,q_artists,q_isrcs,q_contributors,m_title,m_artists,m_isrcs,m_contributors
0,amazon_dsr__AB00QLM7UVI,crawler_247__3614591466960_FR10S1541945,invalid,Centuries,Jagerbombs,FR10S1541945,,Centuries,Fall Out Boy,USUM71412644,Jake Sinclair|Michael J. Fonesca|Justin Drew T...
1,amazon_dsr__AB07CNFFC62,spotify_apidsr__73F87Sqh6jQWucOOvz1WFx,valid,Genius,"Sia, Diplo, Labrinth, LSD",USQX91800798,Thomas Wesley Pentz|Timothy Mckenzie|Philip Me...,Genius,"LSD [feat. Sia, Diplo & Labrinth], LSD [feat ...",USQX91800836,"Thomas Pentz|Thomas Pentz, Jr.|Philip Mecksepe..."
2,amazon_dsr__AB07JBMWR4S,youtube_dsr__A988499812004440,invalid,FALLING,R3YAN & BENLON,TCAFK2148987,KIM CANDILORA|RYAN VOJTESAK|MARTIN KOTTMEIER|D...,Falling,TREVOR DANIEL,USUYG1221109,TREVOR DANIEL
3,amazon_dsr__AB07LDDZ1XG,apple__1541884162,valid,Fire on Fire,Sam Smith,GBUM71807533,Steve Mac|Sam Smith,"Fire On Fire (From ""Watership Down"")",Sam Smith,GBUM71807533,Steve Mac|Sam Smith
4,amazon_dsr__AB07ZKWPQBX,crawler_rfa_ndr__ndrhfdb1-11806275,valid,Forever,Lewis Capaldi,,Lewis Capaldi|Sean Douglas|Joe Janiak,Forever (Amazon Original),Lewis Capaldi,DEUM71906191,Lewis Capaldi
...,...,...,...,...,...,...,...,...,...,...,...
28366,youtube_dsr__xklJ8z_4e4w,crawler_emidigitalcontent__00610018948203_USSY...,invalid,Dream Weaver (Made Popular By Gary Wright) [Vo...,Party Tyme Karaoke,USSYS1102469,Gary Wright,DREAM WEAVER (VIOLIN AND GRAND PIANO),STEVEN HALPERN,US2V99710284,
28367,youtube_dsr__xmAw29prKKE,crawler_gdeasia__597175,valid,Angelina,Lou Bega,DEC730100131,,ANGELINA,LOU BEGA,DEC730100131,WEBENAU VON|KLEIST|PLETSCHACHER|[CA] KLEIST|LU...
28368,youtube_dsr__y6ohky8p2HM,spotify_apidsr__4Ly4FpM4zZELvAWrVjFez4,invalid,Fanfare Overture (Live),"Jonne Valtonen, Tokyo Philharmonic Orchestra, ...",SEWDL6139115,Jonne Valtonen,TWENTIETH CENTURY FOX FANFARE,ALFRED NEWMAN,USAR10400309,
28369,youtube_dsr__zPoEDu8dlxI,crawler_believe__33249194,valid,Tip-Toe Through the Tulips With Me,Ambrose and His Orchestra,DEKB71942669,Joe Burke,TIP-TOE THROUGH THE TULIPS WITH ME,AMBROSE AND HIS ORCHESTRA,NLB150900028,


##### Compute features

In [7]:
def conicidence(word1, word2):
    if word1 == word2:
        return 1
    return 0

In [8]:
dataset["title_sim"] = dataset.apply(lambda x: fuzz.ratio(x.q_title, x.m_title), axis=1)

In [9]:
dataset["artists_sim"] = dataset.apply(lambda x: fuzz.ratio(x.q_artists, x.m_artists), axis=1)

In [10]:
dataset["contributors_sim"] = dataset.apply(lambda x: fuzz.ratio(x.q_contributors, x.m_contributors), axis=1)

In [11]:
dataset["isrcs_coincidence"] = dataset.apply(lambda x: conicidence(x.q_isrcs, x.m_isrcs), axis=1)

##### Add independent variable

In [12]:
def y_tag(tag):
    if tag == "valid":
        return 1
    if tag == "invalid":
        return 0

In [13]:
dataset["y"] = dataset.apply(lambda x: y_tag(x.tag), axis=1)

In [14]:
ds = dataset[["m_source_id", "q_source_id", "title_sim", "artists_sim", "contributors_sim", "isrcs_coincidence", "y"]].set_index(["m_source_id", "q_source_id"])

In [15]:
ds

Unnamed: 0_level_0,Unnamed: 1_level_0,title_sim,artists_sim,contributors_sim,isrcs_coincidence,y
m_source_id,q_source_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
amazon_dsr__AB00QLM7UVI,crawler_247__3614591466960_FR10S1541945,100,18,0,0,0
amazon_dsr__AB07CNFFC62,spotify_apidsr__73F87Sqh6jQWucOOvz1WFx,100,52,41,0,1
amazon_dsr__AB07JBMWR4S,youtube_dsr__A988499812004440,14,37,24,0,0
amazon_dsr__AB07LDDZ1XG,apple__1541884162,46,100,100,1,1
amazon_dsr__AB07ZKWPQBX,crawler_rfa_ndr__ndrhfdb1-11806275,44,100,52,0,1
...,...,...,...,...,...,...
youtube_dsr__xklJ8z_4e4w,crawler_emidigitalcontent__00610018948203_USSYS1102469_R29,21,12,0,0,0
youtube_dsr__xmAw29prKKE,crawler_gdeasia__597175,12,38,0,1,1
youtube_dsr__y6ohky8p2HM,spotify_apidsr__4Ly4FpM4zZELvAWrVjFez4,12,5,0,0,0
youtube_dsr__zPoEDu8dlxI,crawler_believe__33249194,35,24,0,0,1


##### Fit model

In [16]:
X = ds[["title_sim", "artists_sim", "contributors_sim", "isrcs_coincidence"]]
y = ds[["y"]]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [18]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

  clf.fit(X_train, y_train)


In [19]:
filename = "model.pickle"
pickle.dump(clf, open(filename, "wb"))

##### Test model

In [20]:
y_pred = clf.predict(X_test)

In [21]:
y_pred[0]

1

In [22]:
accuracy_score(y_test, y_pred)

0.9418989640072626

In [23]:
X_test["y"] = y_test
X_test["y_pred"] = y_pred

In [24]:
X_test

Unnamed: 0_level_0,Unnamed: 1_level_0,title_sim,artists_sim,contributors_sim,isrcs_coincidence,y,y_pred
m_source_id,q_source_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
spotify_apidsr__43a0zndj9PKOZY7bJ6Z7MR,youtube_dsr__A814771753369564,43,25,12,1,1,1
spotify_apidsr__2UNke6TriCgvLccPmYUiRQ,crawler_empire_ftp__888915925447_1_2_USUYG1257926,100,91,100,0,1,1
apple__1524808074,youtube_dsr__A945231354534201,14,9,0,0,0,0
spotify_apidsr__4T3G9kwx4oGgWbEI6GqvPI,crawler_fuga__7330065154114_1_1_SE5IB2206689,100,24,60,0,0,0
crawler_new247__US4K30600024,crawler_warnerdigitalcontent__190295323141_1_13_ITV951900009,100,19,0,0,0,0
...,...,...,...,...,...,...,...
spotify_apidsr__7N1SthvfHhGhV9C9pkPGIa,crawler_warnerdigitalcontent__5054197201295_1_20_USAT22103652,100,0,44,0,0,0
spotify_apidsr__5ZzbaEl28x5epxXB7GJIA9,crawler_eolasia__386193,54,18,100,0,0,0
crawler_believe__43789748,crawler_believe__30367209,67,40,16,0,0,0
youtube_dsr__A217649949174811,crawler_toolostcommercial_ftp__0195918523694_1_1_QZPLR2034079,100,27,0,1,1,1
