In [None]:
# Reference: https://medium.com/tim-black/fuzzy-string-matching-at-scale-41ae6ac452c2

In [2]:
# Load libraries
import re
import time
import operator

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
import pandas as pd

import sparse_dot_topn.sparse_dot_topn as ct

# A class for matching one list of strings to another
class StringMatch():
    '''
    Fuzzy-match every string in a source list against a target list.

    Strings are turned into TF-IDF weighted n-gram vectors and the best
    scoring target candidates for each source string are kept in a sparse
    matrix. Typical use:

        matcher = StringMatch(source_names, target_names)
        matcher.tokenize()
        matches = matcher.match()
    '''

    def __init__(self, source_names, target_names):
        '''
        :param source_names: Iterable of strings to find matches for
        :param target_names: Iterable of strings to search for candidates in
        '''
        # Copy into plain lists: the class concatenates the two collections
        # and indexes them by matrix row/column position, which only works
        # positionally for lists (a pandas Series would add element-wise and
        # index by label instead).
        self.source_names = list(source_names)
        self.target_names = list(target_names)
        self.ct_vect      = None  # CountVectorizer used to learn the vocabulary
        self.tfidf_vect   = None  # TfidfVectorizer restricted to that vocabulary
        self.vocab        = None  # vocabulary_ mapping learned in tokenize()
        self.sprse_mtx    = None  # sparse score matrix produced by match()


    def tokenize(self, analyzer='char_wb', n=3):
        '''
        Builds the vectorizers used by match(). Must be called before match().

        :param str analyzer: Analyzer name forwarded to the scikit-learn
                             vectorizers ('char_wb', 'word').
        :param int n: The gram length; used as ngram_range=(n, n). With the
                      defaults this gives character trigrams.
        '''
        # Create initial count vectorizer & fit it on both lists to get vocab
        self.ct_vect = CountVectorizer(analyzer=analyzer, ngram_range=(n, n))
        combined     = self.source_names + self.target_names
        self.vocab   = self.ct_vect.fit(combined).vocabulary_

        # Create tf-idf vectorizer that shares this vocabulary, so source and
        # target vectors have the same columns
        self.tfidf_vect = TfidfVectorizer(vocabulary=self.vocab, analyzer=analyzer, ngram_range=(n, n))


    def match(self, ntop=1, lower_bound=0, output_fmt='df'):
        '''
        Main match function. Default settings return only the top candidate for every source string.

        :param int ntop: The number of top-n candidates that should be returned
        :param float lower_bound: The lower-bound threshold for keeping a candidate, between 0-1.
                                   Default set to 0, so consider all candidates
        :param str output_fmt: The output format. Either dataframe ('df') or dict ('dict')
        :return: A DataFrame (see _make_matchdf) or a dict (see _make_matchdict)
        :raises ValueError: If output_fmt is not 'df' or 'dict'
        :raises RuntimeError: If tokenize() has not been called yet
        '''
        # Validate before doing the (potentially expensive) matrix product
        if output_fmt not in ('df', 'dict'):
            raise ValueError(
                "output_fmt must be 'df' or 'dict', got {!r}".format(output_fmt))

        self._awesome_cossim_top(ntop, lower_bound)

        if output_fmt == 'df':
            return self._make_matchdf()
        return self._make_matchdict()


    def _awesome_cossim_top(self, ntop, lower_bound):
        '''
        Computes the top-n similarity scores and stores them in self.sprse_mtx.

        Adapted from
        https://gist.github.com/ymwdalex/5c363ddc1af447a9ff0b58ba14828fd6#file-awesome_sparse_dot_top-py

        :param int ntop: Maximum number of candidates kept per source string
        :param float lower_bound: Threshold passed through to sparse_dot_topn
        :raises RuntimeError: If tokenize() has not been called yet
        '''
        if self.tfidf_vect is None:
            raise RuntimeError('tokenize() must be called before match()')

        # To CSR Matrix, if needed
        # NOTE(review): fit_transform is called separately on each list, so the
        # IDF weights are presumably re-learned per list rather than shared.
        # Confirm this is intended before changing it; it would alter scores.
        A = self.tfidf_vect.fit_transform(self.source_names).tocsr()
        B = self.tfidf_vect.fit_transform(self.target_names).transpose().tocsr()
        M, _ = A.shape
        _, N = B.shape

        idx_dtype = np.int32

        # At most ntop entries are stored for each of the M source rows
        nnz_max = M * ntop

        # Output buffers, filled in place by sparse_dot_topn
        indptr = np.zeros(M+1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)

        ct.sparse_dot_topn(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)

        self.sprse_mtx = csr_matrix((data,indices,indptr), shape=(M,N))


    def _make_matchdf(self):
        '''
        Build dataframe for result return.

        :return: DataFrame with one row per stored (source, candidate) pair and
                 columns 'Row Idx', 'Title', 'Candidate Idx', 'Candidate Title', 'Score'
        '''
        # CSR matrix -> COO matrix, which exposes parallel row/col/data arrays
        cx = self.sprse_mtx.tocoo()

        # COO matrix to list of tuples
        match_list = [
            (row, self.source_names[row], col, self.target_names[col], val)
            for row, col, val in zip(cx.row, cx.col, cx.data)
        ]

        # List of tuples to dataframe
        colnames = ['Row Idx', 'Title', 'Candidate Idx', 'Candidate Title', 'Score']
        return pd.DataFrame(match_list, columns=colnames)


    def _make_matchdict(self):
        '''
        Build dictionary for result return.

        :return: dict mapping each source row index to a list of
                 (candidate index, score) tuples
        '''
        # CSR matrix -> COO matrix
        cx = self.sprse_mtx.tocoo()

        # dict value should be a list of (col, val) tuples
        match_dict = {}
        for row, col, val in zip(cx.row, cx.col, cx.data):
            match_dict.setdefault(row, []).append((col, val))

        return match_dict

In [4]:
# Read the Billboard 'artist - title' pairs and extract them as a plain list
billboard = pd.read_csv('~/Desktop/portfolio/billboard/data/billboardpairs.csv')
billboard_new = pd.DataFrame(billboard)
pairbillboard = billboard_new['pair'].tolist()

# Show the list as the cell output
pairbillboard




['Justin Bieber - Love Yourself',
 'Justin Bieber - Sorry',
 'Drake feat. WizKid and Kyla - One Dance',
 'Rihanna feat. Drake - Work',
 'twenty one pilots - Stressed Out',
 'Desiigner - Panda',
 'Adele - Hello',
 'Chainsmokers feat. Daya - Don’t Let Me Down',
 'Justin Timberlake - Can’t Stop The Feeling!',
 'Chainsmokers feat. Halsey - Closer',
 'Sia feat. Sean Paul - Cheap Thrills',
 'Lukas Graham - 7 Years',
 'Rihanna - Needed Me',
 'Flo Rida - My House',
 'Mike Posner - I Took A Pill In Ibiza',
 'Fifth Harmony feat. Ty Dolla $ign - Work From Home',
 'Calvin Harris feat. Rihanna - This Is What You Came For',
 'DNCE - Cake By The Ocean',
 'G-Eazy x Bebe Rexha - Me, Myself and I',
 'twenty one pilots - Ride',
 'twenty one pilots - Heathens',
 'Zayn - Pillowtalk',
 'Shawn Mendes - Stitches',
 'Drake - Hotline Bling',
 'Major Lazer feat. Justin Bieber and MO - Cold Water',
 'Adele - Send My Love (To Your New Lover)',
 'Chainsmokers feat. Rozes - Roses',
 'Shawn Mendes - Treat You Better'

In [8]:
# Read the Genius 'artist - title' pairs and extract them as a plain list
genius = pd.read_csv('~/Desktop/portfolio/billboard/data/geniuspairs.csv')
genius_new = pd.DataFrame(genius)
pairgenius = genius_new['pair'].tolist()

# Show the list as the cell output
pairgenius


['Justin Bieber - Love Yourself',
 'XXXTENTACION - \u200blove yourself (interlude)',
 'Futuristic & Devvon Terrell - Love Yourself (Remix)',
 'BTS - Love Yourself: 轉 Tear Notes',
 'Yo Preston - Love Yourself vs F*CK Yourself',
 'BTS - Love Yourself: 結 Answer Notes',
 'Ne-Yo - Let Me Love You (Until You Learn to Love Yourself)',
 'Mary J. Blige - Love Yourself',
 'Emicii - Love Yourself',
 'BTS - Love Yourself: 承 Her Notes',
 'Justin Bieber - Sorry',
 'Demi Lovato - Sorry Not Sorry',
 'Beyoncé - Sorry',
 'Bryson Tiller - Sorry Not Sorry',
 'Joyner Lucas - I’m Sorry',
 'Halsey - Sorry',
 'Rick Ross - Sorry',
 'Kikuo-p - ごめんね ごめんね (I’m Sorry, I’m Sorry)',
 'T.I. - Sorry',
 'Akon - Sorry, Blame It On Me',
 'Drake - One Dance',
 'Justin Bieber - One Dance (Remix)',
 'Dance Gavin Dance - Frozen One',
 'Drake - One Dance (Dub)',
 'Hi-Rez - One Dance (Drake Remix)',
 'Devvon Terrell - One Dance (Remix)',
 'KIDZ BOP Kids - One Dance',
 'Alex Aiono - One Dance / Hasta El Amanecer (Mashup)',
 'Co

In [9]:
# Read the Spotify 'artist - title' pairs and extract them as a plain list
spotify = pd.read_csv('~/Desktop/portfolio/billboard/data/spotifypairs.csv')
spotify_new = pd.DataFrame(spotify)
pairspotify = spotify_new['pair'].tolist()

# Show the list as the cell output
pairspotify

['Justin Bieber - Love Yourself',
 'XXXTENTACION - love yourself (interlude)',
 'Anthony Gallway - Love Yourself',
 'Phora - Love Yourself',
 'Sufjan Stevens - Love Yourself',
 'Twinkle Twinkle Little Rock Star - Love Yourself',
 'Larry Eagler - Love Yourself',
 'Grayscale - Love Yourself',
 'Megan Davies - Love Yourself, Out of the Woods, Roses - Acoustic Mashup',
 'Rome Fortune - Love Yourself',
 'Toro y Moi - Love Yourself',
 'ピョートル(Vo. J R Price) - Love Yourself',
 'Conor Maynard - Love Yourself',
 'Dylan Scott - Love Yourself',
 'The Theorist - Love Yourself (Piano Arrangement)',
 'Yo Preston - Love Yourself vs. F*CK Yourself - Love Yourself Response',
 'Kelly Kiara - Love Yourself vs. F*CK Yourself - Love Yourself Response',
 'Mary J. Blige - Love Yourself',
 'Kanye West - Love Yourself',
 'Robert Mendoza - Love Yourself',
 'Nio Soul - Love Yourself',
 'William Singe - Love Yourself',
 'Tanner Townsend - Love Yourself',
 'Justin Bieber - Love Yourself',
 'BTS - Euphoria',
 'BTS -

In [10]:
# Match each Billboard pair to its single best-scoring Spotify candidate,
# using the class defaults spelled out explicitly (character trigrams, top 1).
billboard_spotify_match = StringMatch(source_names=pairbillboard, target_names=pairspotify)
billboard_spotify_match.tokenize(analyzer='char_wb', n=3)
billboard_spotify_df = billboard_spotify_match.match(ntop=1, lower_bound=0, output_fmt='df')

In [57]:
#billboard_spotify_df.to_csv ('~/Desktop/portfolio/projects/Billboard/billboard_spotify_df.csv', index = False, header=True)

In [54]:
# Match each Billboard pair to its single best-scoring Genius candidate,
# using the class defaults spelled out explicitly (character trigrams, top 1).
billboard_genius_match = StringMatch(source_names=pairbillboard, target_names=pairgenius)
billboard_genius_match.tokenize(analyzer='char_wb', n=3)
billboard_genius_df = billboard_genius_match.match(ntop=1, lower_bound=0, output_fmt='df')

In [56]:
#billboard_genius_df.to_csv ('~/Desktop/portfolio/projects/Billboard/billboard_genius_df.csv', index = False, header=True)