### Importing modules

In [1]:
import pandas as pd
from fuzzywuzzy import fuzz

import warnings
# NOTE(review): with no category given this silences every warning for the
# whole notebook, including pandas warnings raised by later cells.
warnings.filterwarnings('ignore') # suppress warnings



### Module to cluster duplicates together

In [2]:
# Demo frame: 'company1' appears three times on purpose, so that its ids
# ('1', '3', '5') can be clustered into a single row further down.
data = pd.DataFrame(
    {
        'name': ['company1', 'company2', 'company1', 'company3', 'company1'],
        'id': ['1', '2', '3', '4', '5'],
    },
    columns=['name', 'id'],
)

def cluster_duplicates(df, company_name_col, customer_id_col):
    """Collapse rows that share a company name into one row of joined ids.

    Args:
        df: DataFrame containing at least ``company_name_col`` and
            ``customer_id_col``. It is left unmodified.
        company_name_col: Name of the column used as the grouping key.
        customer_id_col: Name of the column holding the ids. The values
            must be strings because they are combined with ``','.join``.

    Returns:
        A new DataFrame with columns ``[company_name_col, customer_id_col]``,
        one row per distinct company name, sorted by name and re-indexed
        from 0. Each id cell holds the comma-separated ids of that company.
    """
    # Work on a narrow copy: writing the joined ids straight into ``df``
    # would alter the caller's frame and make a second call on the same
    # frame join the already-joined strings again.
    clustered = df[[company_name_col, customer_id_col]].copy()

    # Broadcast the joined id string of each group onto all of its rows.
    clustered[customer_id_col] = (
        clustered
        .groupby([company_name_col])[customer_id_col]
        .transform(lambda ids: ','.join(ids))
    )

    # Every row of a group now carries the same id string, so keeping one
    # row per name is enough.
    return (
        clustered
        .sort_values(company_name_col)
        .drop_duplicates(subset=[company_name_col])
        .reset_index(drop=True)
    )

# Cluster the demo frame: one row per company name with its ids joined.
data_dup_cluster = cluster_duplicates(
    data,
    company_name_col='name',
    customer_id_col='id',
)

In [3]:
# Preview the first rows of the clustered frame.
data_dup_cluster.head()

Unnamed: 0,name,id
0,company1,135
1,company2,2
2,company3,4


### Module to manually replace a name and cluster back to the parent company

In [4]:
def merge_manual_replace_id(df_name, parent_name, merge_names, company_name_col, customer_id_col):
    """Rename selected companies to a parent name, then re-cluster the ids.

    Args:
        df_name: DataFrame containing ``company_name_col`` and
            ``customer_id_col``. It is left unmodified.
        parent_name: Name that every entry of ``merge_names`` is replaced by.
        merge_names: Iterable of company names to fold into ``parent_name``.
        company_name_col: Name of the company-name column.
        customer_id_col: Name of the id column.

    Returns:
        The result of ``cluster_duplicates`` applied to the renamed data.
    """
    # Map every name to be merged onto the parent name.
    name_replace_dict = dict.fromkeys(merge_names, parent_name)

    # Rename on a copy so the frame passed in by the caller keeps its
    # original names.
    renamed = df_name.copy()
    renamed[company_name_col] = renamed[company_name_col].replace(name_replace_dict)

    return cluster_duplicates(renamed, company_name_col, customer_id_col)

# Fold 'company2' into 'company1' and re-cluster the ids.
manual_merge_cluster = merge_manual_replace_id(
    data_dup_cluster,
    parent_name='company1',
    merge_names=['company2'],
    company_name_col='name',
    customer_id_col='id',
)

In [5]:
# Preview the first rows after the manual merge.
manual_merge_cluster.head()

Unnamed: 0,name,id
0,company1,1352
1,company3,4


### Module to attempt stage 2 clustering, where names with more than 2 words are checked for a perfect match against names with exactly 2 words and clustered together

In [2]:
# Sample names with differing word counts for the stage 2 matching demo.
data = dict(
    id=['1', '2', '3', '4', '5'],
    name=['company1 is yes', 'company2 is very great', 'company1 is', 'company3', 'company1'],
)

df = pd.DataFrame(data, columns=['id', 'name'])

In [3]:
def f(x):
    """Return the number of whitespace-separated words in ``x['name']``.

    ``x`` is anything indexable by ``'name'`` that yields a string, such as
    a DataFrame row passed in by ``DataFrame.apply(..., axis=1)``.
    """
    words = x['name'].split()
    return len(words)

# Word count of each name. The vectorised ``.str`` accessor replaces a
# row-by-row ``apply`` and yields NaN instead of raising when a name is
# missing.
df['length'] = df['name'].str.split().str.len()

In [4]:
# Rows whose 'length' value is greater than two.
df_gt2 = df.loc[df['length'].gt(2)]

In [5]:
# Rows whose 'length' value is exactly two.
df_2 = df[df['length'] == 2]

# Give both frames a constant key column so that merging on it pairs every
# row of ``df_2`` with every row of ``df_gt2`` (a cross join). ``assign``
# returns new frames, so nothing is written into a filtered slice of
# ``df`` -- doing that triggers pandas' SettingWithCopyWarning.
df_gt2 = df_gt2.assign(**{'0': 0})
df_2 = df_2.assign(**{'0': 0})

# NOTE: ``df`` is rebound to the pair table here. Columns present in both
# inputs get pandas' default suffixes: ``_x`` for the ``df_2`` side and
# ``_y`` for the ``df_gt2`` side.
df = pd.merge(df_2, df_gt2, on = '0')

def f(x):
    """Return ``fuzz.partial_ratio`` of ``x['name_x']`` and ``x['name_y']``.

    ``x`` is anything indexable by ``'name_x'`` and ``'name_y'``, such as a
    row of the merged pair table passed in by ``DataFrame.apply(..., axis=1)``.

    NOTE(review): this rebinds the name ``f``, which an earlier cell also
    defines with a different meaning; after this cell runs, the earlier
    definition is no longer reachable.
    """
    left_name = x['name_x']
    right_name = x['name_y']
    return fuzz.partial_ratio(left_name, right_name)

# Score every pair by calling ``f`` once per row.
df['score'] = df.apply(f, axis='columns')