## Data Integration

This notebook includes steps for integrating of two tables 'tracks_sample.csv' and 'songs_sample.csv' based on the matching pairs. These two table has two different schemas. Thus, schema of the final table E is the union of these two table's schema.

### Loading libraries and reading data

In [2]:
import pandas as pd
import os
import re

songs = pd.read_csv('dataset/songs_sample.csv')
tracks = pd.read_csv('dataset/tracks_sample.csv')
matchIDPairs = pd.read_csv('dataset/labeled_data_stage4.csv')

# filtering the matched tuples from both dataset 
matchedTracks = tracks[tracks['id'].isin(list(matchIDPairs['ltable_id']))]
matchedSongs = songs[songs['id'].isin(list(matchIDPairs['rtable_id']))]

#assert(len(matchedTracks)==len(matchedSongs))
len(matchedTracks), len(matchedSongs)

(13316, 14323)

In [3]:
len(set(matchIDPairs['ltable_id'])), len(set(matchIDPairs['rtable_id'])), len(matchIDPairs)

(13316, 14323, 32000)

In [4]:
matchedTracks.head()

Unnamed: 0,id,movie_title,year,episode,song_title,artists
3,655369,the dark chronicles,2011.0,,in the pines,the stanley brothers
7,262157,the porter wagoner show,1961.0,the osborne brothers (#1.517),strawberry song,barbara lea
10,262158,the porter wagoner show,1961.0,the osborne brothers (#1.517),the carroll county accident,porter wagoner
11,524309,lou,2010.0,,free,olivia waithe+eliseus joseph jr+julian griffit...
12,393239,class of 1984,1982.0,,aint got no sense,teenage head


In [5]:
matchedSongs.head()

Unnamed: 0,id,song_title,artists,year
4,543601,we will rock you,andrew spencer,0
16,418130,robot lover,jack oblivian,0
35,822820,in that number,the perrys,0
39,473233,so far away,john lord fonda,2004
46,766296,child of the ghetto (album version),g. dep,0


In [6]:
matchIDPairs.head()

Unnamed: 0,_id,ltable_id,rtable_id,ltable_song_title,ltable_year,ltable_artists,rtable_song_title,rtable_year,rtable_artists,gold_labels
0,7992,172624,543601,you will find me,2003.0,andrew ripp,we will rock you,0,andrew spencer,0
1,16183,634097,418130,sweet lover,1989.0,jack green+ian orkin,robot lover,0,jack oblivian,0
2,30794,392501,822820,in time,2015.0,find the others,in that number,0,the perrys,0
3,37332,611536,473233,over the hills and far away,1995.0,john tams,so far away,2004,john lord fonda,0
4,37484,723458,473233,carried away,2006.0,john dickson,so far away,2004,john lord fonda,0


### Merging two tables 

In [8]:
import math

#Schema of the merged table
E = pd.DataFrame(columns = ['movie_title','year','episode','song_title','artists'])

for index, row in matchIDPairs.iterrows(): 
    left_entry = matchedTracks[matchedTracks['id']==row['ltable_id']]
    right_entry = matchedSongs[matchedSongs['id']==row['rtable_id']]
    
    assert(len(left_entry)==1)
    assert(len(right_entry)==1)
    
    track_id = int(left_entry['id'].item())
    song_id = int(right_entry['id'].item())
    
    if(math.isnan(left_entry['year'].item())):
        left = 0
    else:
        left = int(left_entry['year'].item())
    
    if(math.isnan(right_entry['year'].item())):
        right = 0
    else:
        right = int(right_entry['year'].item())
    
    if left != right:
        if left != 0 and right != 0:
            year = [left,right]
        elif left != 0:
            year = left
        else:
            year = right
    if left == right:
        year = right
    
    #for song title, larger length value is chosen if two value doesn't have exact string match
    left = str(left_entry['song_title'].item())
    right = str(right_entry['song_title'].item())
    
    if len(left) >= len(right):
        song_title = left
    else:
        song_title = right
    
    #for artist, larger length value is chosen if two value doesn't have exact string match
    left = str(left_entry['artists'].item())
    right = str(right_entry['artists'].item())
    
    if len(left) >= len(right):
        artists = left
    else:
        artists = right
    
    #since movie and episode are unique attributes in the left table, keeping the value as it is
    movie_title = str(left_entry['movie_title'].item())
    episode = str(left_entry['episode'].item())
    
    if episode == 'NaN':
        episode = ''
    
    #creating an entry for table E with all values
    entry = pd.Series([track_id, song_id, movie_title, year, episode, song_title, artists], index=['track_id','song_id','movie_title','year','episode','song_title','artists'])
    
    #appending the merged value to table E
    E = E.append(entry, ignore_index=True)

In [9]:
E.head()

Unnamed: 0,movie_title,year,episode,song_title,artists,song_id,track_id
0,the pledge,"[2001, 1996]",,poor twisted me,james hetfield+lars ulrich+metallica+arrangeme...,511255.0,678831.0
1,william s. burroughs: commissioner of sewers,"[1991, 1990]",,batman br�t fische,fm einheit,150981.0,724999.0
2,the warriors,"[2005, 1979]",,love is a fire,genya ravan+johnny vastano+vini poncia,328251.0,690267.0
3,t in the park 2010,2010,muse/calvin harris (#1.3),map of the problematique [live from wembley st...,matthew bellamy+muse,227686.0,231063.0
4,dolly parton: live & well,"[2004, 2002]",,dagger through the heart,dolly parton,531984.0,418267.0


In [10]:
#Writing the table E to file
E.to_csv('merged_data.csv',sep=',',index=False)