In [1]:
# importing the necessary libraries

import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Load the brands dataset: a gzip-compressed file with one JSON document per line

brands_df = pd.read_json(
    'brands.json.gz',
    compression='gzip',
    lines=True,
)

In [3]:
# Preview the first rows to see which columns still hold nested dictionaries
brands_df.head()

Unnamed: 0,_id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,{'$oid': '601ac115be37ce2ead437551'},511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,0.0,
1,{'$oid': '601c5460be37ce2ead43755f'},511111519928,Beverages,BEVERAGES,"{'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...",Starbucks,0.0,STARBUCKS
2,{'$oid': '601ac142be37ce2ead43755d'},511111819905,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176
3,{'$oid': '601ac142be37ce2ead43755a'},511111519874,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146051,0.0,TEST BRANDCODE @1612366146051
4,{'$oid': '601ac142be37ce2ead43755e'},511111319917,Candy & Sweets,CANDY_AND_SWEETS,"{'$id': {'$oid': '5332fa12e4b03c9a25efd1e7'}, ...",test brand @1612366146827,0.0,TEST BRANDCODE @1612366146827


In [4]:
# Column dtypes and non-null counts, to see which fields are sparsely populated
brands_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   _id           1167 non-null   object 
 1   barcode       1167 non-null   int64  
 2   category      1012 non-null   object 
 3   categoryCode  517 non-null    object 
 4   cpg           1167 non-null   object 
 5   name          1167 non-null   object 
 6   topBrand      555 non-null    float64
 7   brandCode     933 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 73.1+ KB


### In the brands dataset, some columns (such as `_id` and `cpg`) hold their values as dictionaries. The first cleaning step is therefore to flatten these columns so that the data becomes a plain pandas DataFrame we can operate on.

In [5]:
# Unwrap the {'$oid': ...} dictionary so `_id` holds the plain id string.
# Not re-runnable: once the values are strings, indexing them with '$oid' raises.
brands_df['_id'] = brands_df['_id'].map(lambda oid_doc: oid_doc['$oid'])

In [6]:
# Look at the raw `cpg` values before flattening them
brands_df['cpg']

0       {'$id': {'$oid': '601ac114be37ce2ead437550'}, ...
1       {'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...
2       {'$id': {'$oid': '601ac142be37ce2ead437559'}, ...
3       {'$id': {'$oid': '601ac142be37ce2ead437559'}, ...
4       {'$id': {'$oid': '5332fa12e4b03c9a25efd1e7'}, ...
                              ...                        
1162    {'$ref': 'Cogs', '$id': {'$oid': '5f77274dbe37...
1163    {'$ref': 'Cogs', '$id': {'$oid': '53e10d6368ab...
1164    {'$ref': 'Cogs', '$id': {'$oid': '5332fa12e4b0...
1165    {'$ref': 'Cogs', '$id': {'$oid': '5332f5f6e4b0...
1166    {'$id': {'$oid': '6026d757be37ce6369301467'}, ...
Name: cpg, Length: 1167, dtype: object

### We see that each value in the `cpg` column is a dictionary with two key-value pairs: the keys become additional column names and the values become the entries of those columns.

In [7]:
# Inspect one complete `cpg` entry (row label 2) to see every key it carries
brands_df.loc[2, 'cpg']

{'$id': {'$oid': '601ac142be37ce2ead437559'}, '$ref': 'Cogs'}

In [8]:
# Confirm that `cpg` has no missing entries before normalizing it
brands_df['cpg'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1167 entries, 0 to 1166
Series name: cpg
Non-Null Count  Dtype 
--------------  ----- 
1167 non-null   object
dtypes: object(1)
memory usage: 9.2+ KB


### Now, normalizing the `cpg` column into separate columns.

In [9]:
# Expand each `cpg` dictionary into flat columns and tag them with a `cpg.` prefix
# so their origin stays obvious once they sit next to the other brand columns.
brands_cpg_ref = pd.json_normalize(brands_df['cpg']).add_prefix('cpg.')

### Merging it back to the original dataset.

In [10]:
# Attach the flattened cpg columns to the brand rows by aligning on the row index
brands_df = brands_df.merge(
    brands_cpg_ref,
    how='outer',
    left_index=True,
    right_index=True,
)


In [11]:
# The nested `cpg` column is redundant now that its content lives in the `cpg.*` columns.
# Rebind the result instead of mutating in place so the step reads as a plain transformation.
brands_df = brands_df.drop(columns='cpg')

In [12]:
# Check the result: `cpg` is gone and the `cpg.$ref` / `cpg.$id.$oid` columns are present
brands_df.head()

Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg.$ref,cpg.$id.$oid
0,601ac115be37ce2ead437551,511111019862,Baking,BAKING,test brand @1612366101024,0.0,,Cogs,601ac114be37ce2ead437550
1,601c5460be37ce2ead43755f,511111519928,Beverages,BEVERAGES,Starbucks,0.0,STARBUCKS,Cogs,5332f5fbe4b03c9a25efd0ba
2,601ac142be37ce2ead43755d,511111819905,Baking,BAKING,test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176,Cogs,601ac142be37ce2ead437559
3,601ac142be37ce2ead43755a,511111519874,Baking,BAKING,test brand @1612366146051,0.0,TEST BRANDCODE @1612366146051,Cogs,601ac142be37ce2ead437559
4,601ac142be37ce2ead43755e,511111319917,Candy & Sweets,CANDY_AND_SWEETS,test brand @1612366146827,0.0,TEST BRANDCODE @1612366146827,Cogs,5332fa12e4b03c9a25efd1e7


In [13]:
# List rows that are exact copies of an earlier row across every column.
# NOTE(review): `_id` is part of the comparison, so records that share a barcode,
# name or brandCode but carry different ids are not reported here.
brands_df.loc[brands_df.duplicated()]

Unnamed: 0,_id,barcode,category,categoryCode,name,topBrand,brandCode,cpg.$ref,cpg.$id.$oid


In [14]:
# Re-check dtypes and non-null counts after flattening `cpg`
brands_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   _id           1167 non-null   object 
 1   barcode       1167 non-null   int64  
 2   category      1012 non-null   object 
 3   categoryCode  517 non-null    object 
 4   name          1167 non-null   object 
 5   topBrand      555 non-null    float64
 6   brandCode     933 non-null    object 
 7   cpg.$ref      1167 non-null   object 
 8   cpg.$id.$oid  1167 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 82.2+ KB


In [15]:
# Number of brands per category (rows with a missing category are not counted)
brands_df['category'].value_counts()

Baking                         369
Beer Wine Spirits               90
Snacks                          75
Candy & Sweets                  71
Beverages                       63
Magazines                       44
Health & Wellness               44
Breakfast & Cereal              40
Grocery                         39
Dairy                           33
Condiments & Sauces             27
Frozen                          24
Personal Care                   20
Baby                            18
Canned Goods & Soups            12
Beauty                           9
Cleaning & Home Improvement      6
Deli                             6
Beauty & Personal Care           6
Household                        5
Bread & Bakery                   5
Dairy & Refrigerated             5
Outdoor                          1
Name: category, dtype: int64

In [16]:
# Same breakdown for `categoryCode`, to compare its coverage with `category`
brands_df['categoryCode'].value_counts()

BAKING                           359
CANDY_AND_SWEETS                  71
BEER_WINE_SPIRITS                 31
HEALTHY_AND_WELLNESS              14
GROCERY                           11
BABY                               7
CLEANING_AND_HOME_IMPROVEMENT      6
BREAD_AND_BAKERY                   5
DAIRY_AND_REFRIGERATED             5
PERSONAL_CARE                      4
BEVERAGES                          1
OUTDOOR                            1
MAGAZINES                          1
FROZEN                             1
Name: categoryCode, dtype: int64

In [17]:
# Drop `categoryCode`: it is filled for far fewer rows than `category`
# (517 vs 1012 non-null) — presumably why only `category` is kept.
# Rebind the result instead of mutating in place so the step reads as a plain transformation.
brands_df = brands_df.drop(columns='categoryCode')

In [18]:
# Confirm that `categoryCode` no longer appears among the columns
brands_df.head()

Unnamed: 0,_id,barcode,category,name,topBrand,brandCode,cpg.$ref,cpg.$id.$oid
0,601ac115be37ce2ead437551,511111019862,Baking,test brand @1612366101024,0.0,,Cogs,601ac114be37ce2ead437550
1,601c5460be37ce2ead43755f,511111519928,Beverages,Starbucks,0.0,STARBUCKS,Cogs,5332f5fbe4b03c9a25efd0ba
2,601ac142be37ce2ead43755d,511111819905,Baking,test brand @1612366146176,0.0,TEST BRANDCODE @1612366146176,Cogs,601ac142be37ce2ead437559
3,601ac142be37ce2ead43755a,511111519874,Baking,test brand @1612366146051,0.0,TEST BRANDCODE @1612366146051,Cogs,601ac142be37ce2ead437559
4,601ac142be37ce2ead43755e,511111319917,Candy & Sweets,test brand @1612366146827,0.0,TEST BRANDCODE @1612366146827,Cogs,5332fa12e4b03c9a25efd1e7


In [19]:
# Final schema check: remaining columns, dtypes and non-null counts
brands_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   _id           1167 non-null   object 
 1   barcode       1167 non-null   int64  
 2   category      1012 non-null   object 
 3   name          1167 non-null   object 
 4   topBrand      555 non-null    float64
 5   brandCode     933 non-null    object 
 6   cpg.$ref      1167 non-null   object 
 7   cpg.$id.$oid  1167 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 73.1+ KB


In [20]:
# Distribution of the `topBrand` flag; missing values are left out of the count,
# so the totals below cover only the rows where the flag is populated.
brands_df['topBrand'].value_counts()

0.0    524
1.0     31
Name: topBrand, dtype: int64

In [22]:
# Spot-check a single brand by its brandCode
brands_df.loc[brands_df['brandCode'] == "CHESTER'S"]

Unnamed: 0,_id,barcode,category,name,topBrand,brandCode,cpg.$ref,cpg.$id.$oid
109,585a9645e4b03e62d1ce0e79,511111801757,Snacks,Chester's,1.0,CHESTER'S,Cogs,5332f5fbe4b03c9a25efd0ba


In [23]:
# Show rows where `topBrand` is missing (at most the first 50)
brands_df.loc[brands_df['topBrand'].isna()].head(50)

Unnamed: 0,_id,barcode,category,name,topBrand,brandCode,cpg.$ref,cpg.$id.$oid
7,5cdad0f5166eb33eb7ce0faa,511111104810,Condiments & Sauces,J.L. Kraft,,J.L. KRAFT,Cogs,559c2234e4b06aca36af13c6
9,5c408e8bcd244a1fdb47aee7,511111504788,Baking,test,,TEST,Cogs,59ba6f1ce4b092b29c167346
10,5f4bf556be37ce0b4491554d,511111516354,Baking,test brand @1598813526777,,TEST BRANDCODE @1598813526777,Cogs,5f4bf556be37ce0b44915549
11,57c08106e4b0718ff5fcb02c,511111102540,,MorningStar,,,Cpgs,5332f5f2e4b03c9a25efd0aa
13,5d6413156d5f3b23d1bc790a,511111205012,Magazines,Entertainment Weekly,,511111205012,Cogs,5d5d4fd16d5f3b23d1bc7905
17,5f358338be37ce443bf9d55a,511111515319,Baking,test brand @1597342520277,,TEST BRANDCODE @1597342520277,Cogs,5f358338be37ce443bf9d557
20,5c4699f387ff3577e203ea29,511111305125,Baby,Chris Image Test,,CHRISIMAGE,Cogs,55b62995e4b0d8e685c14213
21,5da6071ea60b87376833e34d,511111005650,Health & Wellness,Alka-Seltzer®,,ALKA SELTZER,Cogs,5d9b4f591dda2c6225a284aa
23,5332f5fee4b03c9a25efd0bd,511111303947,,Bottled Starbucks,,,Cpgs,53e10d6368abd3c7065097cc
24,5332fa7ce4b03c9a25efd22e,511111802914,,Full Throttle,,,Cpgs,5332f5ebe4b03c9a25efd0a8


In [24]:
# Count occurrences of each brand name; names used by more than one record sort to the top
brands_df['name'].value_counts()

Huggies                      2
V8 Hydrate                   2
Pull-Ups                     2
Dippin Dots® Cereal          2
Diabetic Living Magazine     2
                            ..
Claritin® KIDS               1
Athenos                      1
test brand @1599159969028    1
test brand @1597350074404    1
test brand @1613158231643    1
Name: name, Length: 1156, dtype: int64

In [25]:
# Persist the cleaned brands table. The row index is written as well (pandas' default),
# so the CSV starts with an unnamed index column.
brands_df.to_csv('brands_final.csv', index=True)