# **Category Cleanup**

## Importing packages

In [1]:
import pandas as pd
import numpy as np
import re
from difflib import SequenceMatcher, get_close_matches

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Product list: read the spreadsheet, keep only its first 15 columns, discard
# the three columns this notebook does not use, then keep a single row (the
# first one) for every 'Product Name'.
iprocure_prods_df = pd.read_excel('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v2/product_list.xlsx')
cols = iprocure_prods_df.columns[:15]
iprocure_prods_df = (
    iprocure_prods_df[cols]
    .drop(columns=['Unit Cost', 'Unnamed: 10', 'Active'])
    .drop_duplicates(subset=['Product Name'], keep='first')
    .reset_index(drop=True)
)
# Not executed - kept for reference: a combined "name; manufacturer" key.
# iprocure_prods_df['combined_product_name'] = iprocure_prods_df[['Product Name', 'Manufacturer']].apply(lambda x: '; '.join(x.fillna('').astype(str)) if x.notna().any() else '', axis = 1)

# Render every column when a frame is displayed.
pd.set_option('display.max_columns', None)
iprocure_prods_df.head()

Unnamed: 0,Category,Product Name,Type,Weight,Carton Size,Distributor,Product Code,Product Id,Distributor Type,Manufacturer,Industry,Sub category
0,Minerals and Supplements,Aviboost Aqua Block,1kg,1.0,1,Vital Animal Health,ANP001A,2672,1,Vital Animal Health,Agriculture,
1,Minerals and Supplements,Aviboost CL-X Blue,1lt,1.0,1,Vital Animal Health,ANP061A,2674,1,Vital Animal Health,Agriculture,
2,Minerals and Supplements,Aviboost Nutri Block,1kg,1.0,1,Vital Animal Health,ANP062A,2676,1,Vital Animal Health,Agriculture,
3,Minerals and Supplements,Aviboost Spectrum,1lt,1.0,1,Vital Animal Health,ANP063A,2677,1,Vital Animal Health,Agriculture,
4,Minerals and Supplements,Aviboost Poultry Tonic,1lt,1.0,1,Vital Animal Health,ANP003A,2678,1,Vital Animal Health,Agriculture,


In [3]:
# size of dataset: (rows, columns) of the product list after de-duplication
iprocure_prods_df.shape

(4771, 12)

In [4]:
# Category data: read the CSV, keep the first row for each
# 'correct_product_match' value, and rename that column to 'product_name'.
category_df = (
    pd.read_csv('/home/natasha/Documents/Iprocure/Clustering-for-Product-Matching/data/data_v2/category_data.csv')
    .drop_duplicates(subset=['correct_product_match'], keep='first')
    .reset_index(drop=True)
    .rename(columns={'correct_product_match': 'product_name'})
)
category_df.head()

Unnamed: 0,product_name,category_name,sub_category
0,magazine east african,locall,
1,absolute,local,
2,pole double c63,MCBs,
3,starter j,unga,
4,mola feeds,malezi,


In [5]:
# size of dataset: (rows, columns) of the de-duplicated category data
category_df.shape

(22596, 3)

In [6]:
# number of missing values in each column of the category data
category_df.isna().sum()

product_name         0
category_name        3
sub_category     22409
dtype: int64

In [7]:
# Lower-case the category and product-name text of both frames.
# NOTE(review): str(x) also turns a missing value into the literal text 'nan',
# so none of these columns contains NaN afterwards.
def _as_lower_text(value):
    """Return ``str(value)`` converted to lower case."""
    return str(value).lower()

product_text_cols = ['Category', 'Product Name']
category_text_cols = ['category_name', 'product_name']
iprocure_prods_df[product_text_cols] = iprocure_prods_df[product_text_cols].apply(lambda col: col.map(_as_lower_text))
category_df[category_text_cols] = category_df[category_text_cols].apply(lambda col: col.map(_as_lower_text))

# Give the product list the same key column name as category_df.
iprocure_prods_df = iprocure_prods_df.rename(columns={'Product Name': 'product_name'})
iprocure_prods_df.head()

Unnamed: 0,Category,product_name,Type,Weight,Carton Size,Distributor,Product Code,Product Id,Distributor Type,Manufacturer,Industry,Sub category
0,minerals and supplements,aviboost aqua block,1kg,1.0,1,Vital Animal Health,ANP001A,2672,1,Vital Animal Health,Agriculture,
1,minerals and supplements,aviboost cl-x blue,1lt,1.0,1,Vital Animal Health,ANP061A,2674,1,Vital Animal Health,Agriculture,
2,minerals and supplements,aviboost nutri block,1kg,1.0,1,Vital Animal Health,ANP062A,2676,1,Vital Animal Health,Agriculture,
3,minerals and supplements,aviboost spectrum,1lt,1.0,1,Vital Animal Health,ANP063A,2677,1,Vital Animal Health,Agriculture,
4,minerals and supplements,aviboost poultry tonic,1lt,1.0,1,Vital Animal Health,ANP003A,2678,1,Vital Animal Health,Agriculture,


In [8]:
# Lower-casing can make previously distinct names identical; keep the first
# row for each product_name and report the resulting (rows, columns).
category_df = category_df.drop_duplicates(subset=['product_name'], keep='first')
category_df.shape

(22596, 3)

In [11]:
# Left-join the product list's own Category / Sub category onto every product
# name, then keep the first row per product_name in case the join added rows.
product_categories = iprocure_prods_df[['product_name', 'Category', 'Sub category']]
category_df = (
    category_df
    .merge(product_categories, on='product_name', how='left')
    .drop_duplicates(subset=['product_name'], keep='first')
)
category_df.head()

Unnamed: 0,product_name,category_name,sub_category,Category,Sub category
0,magazine east african,locall,,,
1,absolute,local,,agrochemicals,Fungicide
2,pole double c63,mcbs,,,
3,starter j,unga,,,
4,mola feeds,malezi,,feeds,


In [12]:
# (rows, columns) after joining the product list's category columns
category_df.shape

(22596, 5)

In [13]:
# missing values per column; NaN in 'Category' / 'Sub category' marks products
# for which the left join found no row in the product list
category_df.isna().sum()

product_name         0
category_name        0
sub_category     22409
Category         19228
Sub category     21439
dtype: int64

### Cleaning category column

In [14]:
# Prefer the product list's Category; where the join found none, fall back to
# the category_name recorded in the category data.
category_df['Category'] = category_df['Category'].fillna(category_df['category_name'])
category_df.head()

Unnamed: 0,product_name,category_name,sub_category,Category,Sub category
0,magazine east african,locall,,locall,
1,absolute,local,,agrochemicals,Fungicide
2,pole double c63,mcbs,,mcbs,
3,starter j,unga,,unga,
4,mola feeds,malezi,,feeds,


In [15]:
# missing values per column after filling 'Category' from 'category_name'
category_df.isna().sum()

product_name         0
category_name        0
sub_category     22409
Category             0
Sub category     21439
dtype: int64

In [16]:
# cleaning category column
# Reference vocabulary: every distinct Category value of the product list.
categories = iprocure_prods_df['Category'].unique().tolist()

# One row (the first seen) for each category value outside that vocabulary.
is_known_category = category_df['Category'].isin(categories)
wrong_categories_df = category_df.loc[~is_known_category].drop_duplicates(subset='Category', keep='first')

# cleanup function
def compare(i, candidates=None, cutoff=0.1):
    """Find the closest known category name for a single category value.

    Parameters
    ----------
    i : object
        Value to look up. Anything that is not a ``str`` is reported as
        unmatched.
    candidates : list of str, optional
        Names to match against. When omitted, the module-level ``categories``
        list is used, which is what the original implementation always did.
    cutoff : float, optional
        Minimum similarity handed to ``difflib.get_close_matches``
        (default 0.1, the value previously hard-coded).

    Returns
    -------
    pd.Series
        Indexed by ``'category'``, ``'match'`` and ``'score'``:

        * ``category`` - ``[i]`` for a string input, otherwise ``None``;
        * ``match``    - one-element list holding the best candidate, or
          ``[None]`` when nothing reached ``cutoff``;
        * ``score``    - one-element list holding
          ``SequenceMatcher(None, i, match).ratio()`` rounded to 2 decimals,
          or ``[None]`` when there is no match.
    """
    # Defaults describe the "nothing found" outcome.
    category, match, score = None, [None], [None]

    if isinstance(i, str):
        if candidates is None:
            # Resolved lazily so an explicit list never needs the global.
            candidates = categories
        category = [i]
        closest = get_close_matches(i, candidates, n=1, cutoff=cutoff)
        if closest:
            best = closest[0]
            match = [best]
            score = [round(SequenceMatcher(None, i, best).ratio(), 2)]

    return pd.Series([category, match, score], index=['category', 'match', 'score'])

def _first_or_blank(cell):
    """Return the first element of a non-empty cell, or '' for a falsy cell."""
    return cell[0] if cell else ''

# Score every unrecognised category with compare(), then unwrap the
# one-element lists it returns into plain values.
cleaned_categories_df = pd.DataFrame()
cleaned_categories_df[['category', 'match', 'score']] = wrong_categories_df['Category'].apply(compare)
cleaned_categories_df = cleaned_categories_df.apply(lambda col: col.map(_first_or_blank))
cleaned_categories_df.head()

Unnamed: 0,category,match,score
0,locall,immunologicals,0.5
2,mcbs,immunologicals,0.33
3,unga,gear,0.5
5,tabs,transmission,0.38
6,tabz,sanitation,0.29


In [17]:
# replacing incorrect category names
# Keep only the matches whose similarity score is at least 0.7, and name the
# key column 'Category' so it can be joined back onto category_df.
is_confident = cleaned_categories_df['score'] >= 0.7
category_matches_df = (
    cleaned_categories_df
    .loc[is_confident]
    .rename(columns={'category': 'Category'})
)
category_matches_df.head()

Unnamed: 0,Category,match,score
46,agrochemical,agrochemicals,0.96
70,feed,feeds,0.89
146,whisky,whiskey,0.92
180,minerals and supplements1,minerals and supplements,0.98
187,agrochemicalsm,agrochemicals,0.96


In [18]:
# Join the accepted corrections onto each row; a category without a
# correction keeps its current value. The result becomes the new
# 'category_name', replacing both earlier category columns.
accepted_matches = category_matches_df[['Category', 'match']]
category_df = category_df.merge(accepted_matches, how='left', on='Category')
category_df['match'] = category_df['match'].fillna(category_df['Category'])
category_df = (
    category_df
    .drop(columns=['category_name', 'Category'])
    .rename(columns={'match': 'category_name'})
)
category_df.head()

Unnamed: 0,product_name,sub_category,Sub category,category_name
0,magazine east african,,,locall
1,absolute,,Fungicide,agrochemicals
2,pole double c63,,,mcbs
3,starter j,,,unga
4,mola feeds,,,feeds


In [19]:
# missing values per column once the category clean-up is done
category_df.isna().sum()

product_name         0
sub_category     22409
Sub category     21439
category_name        0
dtype: int64

### Cleaning sub-category column

In [20]:
# Prefer the product list's 'Sub category'; where it is missing, fall back to
# the 'sub_category' that came with the category data.
category_df['Sub category'] = category_df['Sub category'].fillna(category_df['sub_category'])
category_df.head()

Unnamed: 0,product_name,sub_category,Sub category,category_name
0,magazine east african,,,locall
1,absolute,,Fungicide,agrochemicals
2,pole double c63,,,mcbs
3,starter j,,,unga
4,mola feeds,,,feeds


In [21]:
# missing values per column after filling 'Sub category' from 'sub_category'
category_df.isna().sum()

product_name         0
sub_category     22409
Sub category     21418
category_name        0
dtype: int64

In [22]:
# distinct sub-category labels, listed to spot spelling variants of one label
category_df['Sub category'].unique()

array([nan, 'Fungicide', 'Herbicide', 'Vegetable Seeds', 'Insecticide',
       'Foliar Fertilizer', 'Cereal Seeds', 'Hygiene',
       'Foliar Fertilizers', 'Powder', 'Block'], dtype=object)

In [23]:
# Collapse the plural spelling into the singular label used elsewhere.
sub_category_fixes = {'Foliar Fertilizers': 'Foliar Fertilizer'}
category_df['Sub category'] = category_df['Sub category'].replace(sub_category_fixes)

In [24]:
# (rows, columns) after the sub-category clean-up
category_df.shape

(22596, 4)

In [25]:
# The merged 'Sub category' column now holds the combined values: drop the
# original 'sub_category' column and give the merged one that name.
category_df = (
    category_df
    .drop(columns=['sub_category'])
    .rename(columns={'Sub category': 'sub_category'})
)
category_df.head()

Unnamed: 0,product_name,sub_category,category_name
0,magazine east african,,locall
1,absolute,Fungicide,agrochemicals
2,pole double c63,,mcbs
3,starter j,,unga
4,mola feeds,,feeds


In [26]:
# Rows whose product_name already appeared in an earlier row.
# Not executed - the de-duplication this check would motivate:
# category_df = category_df.drop_duplicates(subset=['product_name'], keep='last').reset_index(drop=True)
repeated_name = category_df['product_name'].duplicated()
category_df.loc[repeated_name]

Unnamed: 0,product_name,sub_category,category_name


In [27]:
# Missing values per column. This expression appears in two consecutive cells
# with the same execution count; the trailing markers label their recorded
# outputs as "Old" and "New".
category_df.isna().sum() # ---> Old

product_name         0
sub_category     21418
category_name        0
dtype: int64

In [27]:
# NOTE(review): the output recorded under this cell lists an 'Industry' column,
# but no cell visible above adds such a column to category_df - the output
# appears to come from a different run and should be refreshed by re-running.
category_df.isna().sum() # ---> New

product_name         0
sub_category     13140
Industry         10665
category_name        0
dtype: int64

In [29]:
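# Disabled: export of the agrochemical products that still have no sub-category.
# NOTE: if this is re-enabled, give the output file a .csv name (or switch to
# DataFrame.to_excel) - to_csv writes CSV text even when the name ends in .xlsx.
# agrochem_df = category_df[category_df['category_name'] == 'agrochemicals']
# agrochem_df.drop_duplicates(subset=['product_name'], keep='first', inplace=True)

# x = agrochem_df[agrochem_df['sub_category'].isna()]
# x.to_csv('Agrochemicals_without_subcategory.xlsx', index=False)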

In [28]:
# display the cleaned frame (the rendered output is truncated by pandas)
category_df

Unnamed: 0,product_name,sub_category,category_name
0,magazine east african,,locall
1,absolute,Fungicide,agrochemicals
2,pole double c63,,mcbs
3,starter j,,unga
4,mola feeds,,feeds
...,...,...,...
22591,beckojat,,minerals and supplements
22592,mineral blocks 5 kg,,minerals and supplements
22593,mineral dry blocks,,minerals and supplements
22594,multivitamin aroms,,minerals and supplements


In [29]:
# Product -> sub-category lookup table.
sub_df = pd.read_csv('/home/natasha/Documents/Iprocure/Sales-Data-Cleanup/data/subcategories.csv')
# Normalise the join key with the vectorised .str accessor: unlike calling
# x.lower().strip() on every value, it leaves a missing name as NaN instead of
# raising AttributeError.
sub_df['product_name'] = sub_df['product_name'].str.lower().str.strip()
# Names that differed only by case or surrounding whitespace are now equal.
# Keep the first row per name so that a left join on product_name cannot
# multiply rows on the other side.
sub_df = sub_df.drop_duplicates(subset=['product_name'], keep='first').reset_index(drop=True)
sub_df.head()

Unnamed: 0,product_name,sub_category
0,lancer 130 sc,Insecticide
1,nature guard 525 wdg,Fungicide
2,vanguisher 26% wdg,Fungicide
3,elglysate 480sl,Herbicide
4,alwin gold 500mls,Fungicide


In [30]:
# Left-join the lookup table on product_name. Both frames have a
# 'sub_category' column, so pandas suffixes them: _x (existing), _y (lookup).
category_df = pd.merge(category_df, sub_df, on='product_name', how='left')
category_df.isna().sum()

product_name          0
sub_category_x    21418
category_name         0
sub_category_y    22578
dtype: int64

In [31]:
# display the frame with both sub-category columns (_x existing, _y lookup)
category_df

Unnamed: 0,product_name,sub_category_x,category_name,sub_category_y
0,magazine east african,,locall,
1,absolute,Fungicide,agrochemicals,
2,pole double c63,,mcbs,
3,starter j,,unga,
4,mola feeds,,feeds,
...,...,...,...,...
22591,beckojat,,minerals and supplements,
22592,mineral blocks 5 kg,,minerals and supplements,
22593,mineral dry blocks,,minerals and supplements,
22594,multivitamin aroms,,minerals and supplements,


In [32]:
# Keep the existing sub-category where there is one; otherwise take the value
# supplied by the lookup table.
category_df['sub_category_x'] = category_df['sub_category_x'].fillna(category_df['sub_category_y'])
category_df.isna().sum()

product_name          0
sub_category_x    21400
category_name         0
sub_category_y    22578
dtype: int64

In [33]:
# The lookup column has been folded into sub_category_x: drop it and restore
# the plain 'sub_category' name.
category_df = (
    category_df
    .drop(columns=['sub_category_y'])
    .rename(columns={'sub_category_x': 'sub_category'})
)
category_df

Unnamed: 0,product_name,sub_category,category_name
0,magazine east african,,locall
1,absolute,Fungicide,agrochemicals
2,pole double c63,,mcbs
3,starter j,,unga
4,mola feeds,,feeds
...,...,...,...
22591,beckojat,,minerals and supplements
22592,mineral blocks 5 kg,,minerals and supplements
22593,mineral dry blocks,,minerals and supplements
22594,multivitamin aroms,,minerals and supplements


In [34]:
# Write the cleaned frame to 'clean_categories.csv' (path relative to the
# working directory), without the index column.
category_df.to_csv('clean_categories.csv', index=False)

In [40]:
# Rows whose category_name is missing.
# NOTE(review): the category text was passed through str(x).lower() earlier,
# which turns a missing value into the text 'nan'; such rows are not reported
# by isna(). The recorded output below also shows an 'Industry' column that no
# visible cell creates, so it appears to come from a different run.
category_df[category_df['category_name'].isna()]

Unnamed: 0,product_name,sub_category,Industry,category_name
