## Same class and a working example

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# import nltk
# from nltk.corpus import stopwords
import glob
# stop = stopwords.words('english')
from collections import defaultdict

class String_Matcher():
    """
    Finds similar strings in a DataFrame column using fuzzywuzzy (Levenshtein-based) scores.
    Quick and dirty analysis shows that order of words doesn't make much difference, as well as, a removal of punctuation and stop-words.
    However, light cleaning is possible via clean_column()

    How to use:
    sm = String_Matcher(df)
    l = sm.create_matched_list('item_title', 'ipod') will create a list with the top 10 matches for the query 'ipod' from the column df['item_title']
    matched_df = sm.create_matched_df('item_title') will create a dataframe with two columns. The first column are item titles, the second column are the top 10
    matches selected from the other unique values of the same column.
    sm.save_to_csv(matched_df, '/some/folder', 'result.csv') will write that dataframe to /some/folder/result.csv
    """

    def __init__(self, df):
        """
        df: pd.DataFrame. Stored by reference (not copied), so clean_column() adds its column to the caller's frame.
        """
        self.df = df

    def clean_column(self, col_name, stop_words = None):
        """
        (Optional: doesn't seem to change the matching quality much)
        Takes column 'col_name' and creates a new column 'col_name_clean'
        where all characters are in the lower case. If stop_words is given,
        those words are additionally removed (tokens are split on whitespace
        and re-joined with single spaces).

        col_name: string
        stop_words (optional): iterable of strings, e.g. nltk's stopwords.words('english'). Default is None (lower-casing only)

        Returns self.df (the same object, with the new column added).
        """
        cleaned = self.df[col_name].str.lower()
        if stop_words:
            # The titles are lower-cased above, so compare against lower-cased stop words.
            blocked = {str(word).lower() for word in stop_words}

            def drop_stop_words(text):
                # Missing titles come through .str.lower() as NaN; leave them untouched.
                if not isinstance(text, str):
                    return text
                return ' '.join(token for token in text.split() if token not in blocked)

            cleaned = cleaned.map(drop_stop_words)
        self.df[col_name + '_clean'] = cleaned
        return self.df


    def create_matched_list(self, col_name, str_to_match, topN = 10):
        """
        Returns topN matches as a list of strings from a column df[col_name] for a provided string str_to_match
        One caveat: it looks across all the files (because they're concatenated into one dataframe) that might or might not
        be a good idea depending on the context (for files containing info about different categories, it's probably bad).
        But building a logic for separate files was taking time.

        col_name: string
        str_to_match: string
        topN (optional): integer. Default is 10
        """
        # Missing values cannot be scored as strings, so they are left out of the candidates.
        choices = list(self.df[col_name].dropna().unique())
        matches = process.extract(str_to_match, choices, limit = topN)
        # process.extract yields (choice, score) tuples; only the matched string is returned.
        return [match[0] for match in matches]


    def create_matched_df(self, col_name, topN = 10):
        """
        Returns pd.DataFrame with two columns: 'item_id' and 'matches.'
        NOTE: despite its name, 'item_id' holds the value taken from df[col_name] (e.g. the item title);
        the column name is kept for backward compatibility.
        For every unique value there are up to topN matches. IMPORTANT: it considers only unique values,
        both as queries and as candidates, so a value is never matched against itself or its own duplicates.
        If a dataframe consists of 10 titles but two of them are identical, only the 9 unique titles are used.
        One caveat: it looks across all the files (because they're concatenated into one dataframe) that might or might not
        be a good idea depending on the context (for files containing info about different categories, it's probably bad).
        But building a logic with separate files was taking time.
        Scoring uses fuzz.token_sort_ratio.

        col_name: string
        topN (optional): integer. Default is 10
        """
        titles = list(self.df[col_name].dropna().unique())
        rows = []
        for position, title in enumerate(titles):
            # Candidates are every OTHER unique value; 'position' indexes 'titles', so exactly this title is excluded.
            candidates = titles[:position] + titles[position + 1:]
            if not candidates:
                continue
            matches = process.extract(title, candidates, limit = topN, scorer = fuzz.token_sort_ratio)
            for match in matches:
                rows.append((title, match[0]))
        # One row per (value, match) pair, in order of first appearance of the value.
        return pd.DataFrame(rows, columns = ["item_id", "matches"])


    def save_to_csv(self, df, filepath, filename):
        """
        Saves the provided dataset to the folder defined in 'filepath' with the name defined in 'filename'

        df: pd.DataFrame
        filepath: string
        filename: string
        """
        df.to_csv(filepath + "/" + filename)


In [2]:
# Load your dataset.
# NOTE: this is a machine-specific absolute path; point it at your own copy of the CSV.
df = pd.read_csv('/home/mkareev/small_files/mp3_small.csv')

In [3]:
# Create a matcher around the dataframe (the frame is stored by reference, not copied)
sm = String_Matcher(df)

In [4]:
# sm.df is the dataframe that was passed to the constructor.
# The "item_title_clean" column in the output comes from the clean_column('item_title') call
# shown in a later cell, i.e. that cell was run before this output was captured.
sm.df

Unnamed: 0.1,Unnamed: 0,item_id,site,category_id,item_title,item_title_clean
0,1211,221354177429,0,73839,"Apple iPod nano 5th Generation Silver (8 GB) ""...","apple ipod nano 5th generation silver (8 gb) ""..."
1,15224,321205299017,0,73839,Micro SD Card Slot Volume Control Clip Mini MP...,micro sd card slot volume control clip mini mp...
2,22684,370991324691,0,73839,Sony FMP-X1 4K (2 TB) Digital Media Player,sony fmp-x1 4k (2 tb) digital media player
3,3342,301080217877,0,73839,"1.8""HIP Street HS-T29A-4GBMX Crossfade (4 GB) ...","1.8""hip street hs-t29a-4gbmx crossfade (4 gb) ..."
4,12409,350975306482,0,73839,PyleHome 4GB Waterproof High Speed USB MP3 And...,pylehome 4gb waterproof high speed usb mp3 and...
...,...,...,...,...,...,...
745,7589,321298712585,0,73839,Apple iPod Touch 2nd Generation 8GB Black Ful...,apple ipod touch 2nd generation 8gb black ful...
746,408,291039799845,0,73839,1x 8G 8GB 3.0MP Camera 2.8in. LCD Touch Screen...,1x 8g 8gb 3.0mp camera 2.8in. lcd touch screen...
747,19172,250362115685,0,73839,Sunglass Black Headset Sun glass 1GB & Mp3 Pla...,sunglass black headset sun glass 1gb & mp3 pla...
748,344,301075462019,0,73839,Apple iPod nano 7th Generation Yellow (16 GB) ...,apple ipod nano 7th generation yellow (16 gb) ...


In [None]:
# Optional: add a lower-cased "item_title_clean" column to sm.df
sm.clean_column('item_title')

In [5]:
# Pass a query string and get back (up to) the 15 most similar unique titles
sm.create_matched_list('item_title', 'Waterproof', 15)

['PyleHome 4GB Waterproof High Speed USB MP3 And WMA Player With Head',
 'Waterproof Underwater Swim Sports 2GB Music MP3 Player',
 '```` 8GB Waterproof MP3 Player `````',
 'PYLE-HOME PBTW20BL SURF SOUND BLUE WATERPROOF BLUETOOTH SPEAKER WITH AUX INPUT',
 'IPX8 Waterproof Underwater 4GB FM MP3 Player LCD Display Aluminum OLED Screen ',
 '8GB Waterproof MP3 Player for Swimming Sport  FM Radio  Underwater Sports MP3',
 'Waterproof Sony NWZ-W273BLK Black (4 GB) Digital Media Player (New w/Packaging)',
 'NEW Pyle PSWB4BL 4GB Waterproof Neckband Headphone MP3 Player for Swimming Blue',
 '4GB Clip Waterproof Mp3 Player IPX8 with FM Radio Swimming/Diving Sports Blue  ',
 'New USB  4GB Waterproof Columnar Swimming Diving MP3 Player Water Resistant Rose',
 '4GB Black Waterproof MP3 Music Player Water Sports Swimming Diving FM Radio',
 'Pink Swimming Diving Water Waterproof MP3 Player FM Radio Earphone 4GB New',
 '4GB USB Waterproof MP3 Player w/ FM Radio for Swimming Diving Spa Water Sports',
 

In [7]:
# We can find the top 15 most similar items for every unique item:
sm.create_matched_df('item_title', 15)

Unnamed: 0,item_id,matches
0,"Apple iPod nano 5th Generation Silver (8 GB) ""...",BRAND NEW SEALED APPLE RED SPECIAL EDITION 7TH...
1,"Apple iPod nano 5th Generation Silver (8 GB) ""...",Apple iPod nano 6th Generation Silver (8 GB)
2,"Apple iPod nano 5th Generation Silver (8 GB) ""...",Apple iPod nano 6th Generation Silver (16 GB) ...
3,"Apple iPod nano 5th Generation Silver (8 GB) ""...",Apple iPod nano 4th Generation chromatic Silve...
4,"Apple iPod nano 5th Generation Silver (8 GB) ""...",NEW Apple iPod nano 6th Generation Blue (8 GB)
...,...,...
10060,Transcend MP3 Player T.sonic MP870 8GB White T...,8GB Slim Mp3 Mp4 Player 8 Colors 1.8 LCD Scree...
10061,Transcend MP3 Player T.sonic MP870 8GB White T...,SanDisk Sansa Fuze FM White 8 GB Digital Medi...
10062,Transcend MP3 Player T.sonic MP870 8GB White T...,Black Tape Shaped Mp3 Player EM#01
10063,Transcend MP3 Player T.sonic MP870 8GB White T...,8GB Waterproof MP3 Player for Swimming Sport ...


As you can see, the dataset has 750 rows, but the result above has only 10065 rows instead of 11250 (750 times 15). This means that some titles were exactly the same (for example, one title shared by 80 items). Because only unique titles are considered, a repeated title is processed once and its 15 matches are returned once; the other 79 duplicate rows are discarded (11250 - 10065 == 79 * 15).

Finally, we can save the resulting dataset:

In [9]:
# Keep the matches (top 15 per unique title) in a variable so they can be saved
result_df = sm.create_matched_df('item_title', topN = 15)

In [10]:
# Display the (title, match) pairs stored in result_df
result_df

Unnamed: 0,item_id,matches
0,"Apple iPod nano 5th Generation Silver (8 GB) ""...",BRAND NEW SEALED APPLE RED SPECIAL EDITION 7TH...
1,"Apple iPod nano 5th Generation Silver (8 GB) ""...",Apple iPod nano 6th Generation Silver (8 GB)
2,"Apple iPod nano 5th Generation Silver (8 GB) ""...",Apple iPod nano 6th Generation Silver (16 GB) ...
3,"Apple iPod nano 5th Generation Silver (8 GB) ""...",Apple iPod nano 4th Generation chromatic Silve...
4,"Apple iPod nano 5th Generation Silver (8 GB) ""...",NEW Apple iPod nano 6th Generation Blue (8 GB)
...,...,...
10060,Transcend MP3 Player T.sonic MP870 8GB White T...,8GB Slim Mp3 Mp4 Player 8 Colors 1.8 LCD Scree...
10061,Transcend MP3 Player T.sonic MP870 8GB White T...,SanDisk Sansa Fuze FM White 8 GB Digital Medi...
10062,Transcend MP3 Player T.sonic MP870 8GB White T...,Black Tape Shaped Mp3 Player EM#01
10063,Transcend MP3 Player T.sonic MP870 8GB White T...,8GB Waterproof MP3 Player for Swimming Sport ...


In [9]:
# Save the matches as a CSV file: arguments are the dataframe, the target folder and the file name
sm.save_to_csv(result_df, '/home/mkareev/small_files', '12_18_mp3_result.csv')