# EDA Part -1 :

This notebook contains basic exploration around the `Mercari Dataset`. 

In [1]:
# import essentials
import pandas as pd
import numpy as np

## Load the dataset:

In [2]:
# URL where data is dumped

gcp_url = 'https://storage.googleapis.com/price-alchemy/Mercari%20price%20suggestion%20challenge/train.tsv'

df = pd.read_csv(gcp_url, sep='\t', header=0)

What does our dataset look like?

In [3]:
df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [4]:
df.shape

(1482535, 8)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   train_id           1482535 non-null  int64  
 1   name               1482535 non-null  object 
 2   item_condition_id  1482535 non-null  int64  
 3   category_name      1476208 non-null  object 
 4   brand_name         849853 non-null   object 
 5   price              1482535 non-null  float64
 6   shipping           1482535 non-null  int64  
 7   item_description   1482531 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 90.5+ MB


In [6]:
# Randomly selecting 10% of categories and brands to hold out with a fixed random state
import random

# Set the random seed
random.seed(42)

# Select categories
unique_categories = df['category_name'].unique()
categories_to_hold_out = random.sample(list(unique_categories), int(0.1 * len(unique_categories)))

# Select brands
unique_brands = df['brand_name'].unique()
brands_to_hold_out = random.sample(list(unique_brands), int(0.1 * len(unique_brands)))

In [7]:
len(categories_to_hold_out)

128

In [8]:
len(brands_to_hold_out)

481

In [9]:
# Filter records to be held out
df_holdout = df[(df['category_name'].isin(categories_to_hold_out)) | (df['brand_name'].isin(brands_to_hold_out))]

df_holdout

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,0,Size small but straps slightly shortened to fi...
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,1,You get three pairs of Sophie cheer shorts siz...
14,14,HOLD for Dogs2016 Minnetonka boots,3,Women/Shoes/Boots,UGG Australia,43.0,0,Authentic. Suede fringe boots. Great condition...
18,18,"Too Faced Limited ""Merry Macaroons""",1,Beauty/Makeup/Makeup Palettes,Too Faced,25.0,1,This AUTHENTIC pallete by Too Faced is brand n...
24,24,Black and Red Baseball Tee,2,Women/Tops & Blouses/T-Shirts,FOREVER 21,10.0,0,lanascloset ~~~ description: never worn! ✨ i d...
...,...,...,...,...,...,...,...,...
1482487,1482487,Blazer bundle,2,Women/Suits & Blazers/Blazer,FOREVER 21,31.0,0,"White blazer- forever 21 size medium, fits sma..."
1482505,1482505,NorthFace rain jacket!,3,Women/Coats & Jackets/Raincoat,The North Face,34.0,0,great condition! a few signs of wear on the in...
1482506,1482506,KitchenAid 4.5 quart mixer,1,Home/Home Appliances/Kitchen Appliances,KitchenAid,165.0,0,Brand new sealed in box kitchen aid kitchenaid...
1482512,1482512,Lululemon,2,Women/Athletic Apparel/Shirts & Tops,Lululemon,34.0,0,NWOT - Blue - size 8


In [10]:
# Remove held-out records from the original DataFrame
df_train = df[~df.index.isin(df_holdout.index)]

df_train

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity
...,...,...,...,...,...,...,...,...
1482529,1482529,Men's UA [rm],2,Men/Athletic Apparel/Shirts & Tops,Under Armour,34.0,0,[rm] for the set both in perfect condition no ...
1482530,1482530,Free People Inspired Dress,2,Women/Dresses/Mid-Calf,Free People,20.0,1,"Lace, says size small but fits medium perfectl..."
1482532,1482532,21 day fix containers and eating plan,2,Sports & Outdoors/Exercise/Fitness accessories,,12.0,0,"Used once or twice, still in great shape."
1482533,1482533,World markets lanterns,3,Home/Home Décor/Home Décor Accents,,45.0,1,There is 2 of each one that you see! So 2 red ...


In [11]:
# Randomly sample 500K records from df_train
holdout_data = df_train.sample(n=500000, random_state=42)

holdout_data

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
58022,58022,Vs temptation perfume new,1,Beauty/Fragrance/Women,Victoria's Secret,14.0,0,Vs temptation perfume 8.4oz new
1472596,1472596,Kylie Cosmetics Liquid Lipstick Koko K,2,Beauty/Makeup/Lips,Kylie Cosmetics,16.0,0,Not used No box No lip Liner Price Firm
173387,173387,TC LULAROE LEGGINGS,2,Women/Jeans/Leggings,LuLaRoe,12.0,0,Brand new never worn! Please check out my othe...
612079,612079,Volcom Shorts,2,Women/Athletic Apparel/Shorts,Volcom,10.0,0,"Size 00 never worn, bought at pacsun"
149163,149163,Enfamil Formula Coupons,1,Kids/Feeding/Bottle-Feeding,Enfamil,7.0,1,[rm] value for formula 1- expires February 28 ...
...,...,...,...,...,...,...,...,...
1109123,1109123,"Size 6 ""Cassie"" heels ON HOLD",3,Women/Shoes/Pumps,Charlotte Russe,23.0,0,Worn once to prom & have been collecting dust ...
498583,498583,For one NES Classic Edition FREE SHIPPIN,1,Vintage & Collectibles/Electronics/Video Game,Nintendo,180.0,1,For one NES Classic Edition In Hand Ready to s...
1097977,1097977,Ring sz 5,1,Women/Jewelry/Rings,,14.0,0,Stainless steel ring new with box sz 5
558119,558119,"NYX Ombre Lip Duo, Hearts & Spades",1,Beauty/Makeup/Lips,NYX,6.0,1,"Brand new, never used. Bundle up for free ship..."


In [12]:
# Remove sampled records from df_train
df_train = df_train[~df_train.index.isin(holdout_data.index)]
df_train

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.0,0,Girls Size small Plus green. Three shorts total.
10,10,Smashbox primer,2,Beauty/Makeup/Face,Smashbox,8.0,1,0.25 oz Full size is 1oz for [rm] in Sephora
11,11,New vs pi k body mists,1,Beauty/Fragrance/Women,Victoria's Secret,34.0,0,(5) new vs pink body mists (2.5 oz each) Fresh...
...,...,...,...,...,...,...,...,...
1482527,1482527,Blk/white ribbed mock neck bodysuit M,1,Women/Tops & Blouses/Blouse,,10.0,1,Brand new black and white ribbed mock neck bod...
1482528,1482528,Victoria's Secret Tankini Sz. Large,2,Women/Athletic Apparel/Sports Bras,Victoria's Secret,18.0,1,Purple and Paisley Victoria's Secret Tankini S...
1482529,1482529,Men's UA [rm],2,Men/Athletic Apparel/Shirts & Tops,Under Armour,34.0,0,[rm] for the set both in perfect condition no ...
1482530,1482530,Free People Inspired Dress,2,Women/Dresses/Mid-Calf,Free People,20.0,1,"Lace, says size small but fits medium perfectl..."


In [13]:
# Add sampled records to df_holdout
df_holdout = pd.concat([df_holdout, holdout_data], ignore_index=True)
df_holdout

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,0,Size small but straps slightly shortened to fi...
1,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,1,You get three pairs of Sophie cheer shorts siz...
2,14,HOLD for Dogs2016 Minnetonka boots,3,Women/Shoes/Boots,UGG Australia,43.0,0,Authentic. Suede fringe boots. Great condition...
3,18,"Too Faced Limited ""Merry Macaroons""",1,Beauty/Makeup/Makeup Palettes,Too Faced,25.0,1,This AUTHENTIC pallete by Too Faced is brand n...
4,24,Black and Red Baseball Tee,2,Women/Tops & Blouses/T-Shirts,FOREVER 21,10.0,0,lanascloset ~~~ description: never worn! ✨ i d...
...,...,...,...,...,...,...,...,...
710124,1109123,"Size 6 ""Cassie"" heels ON HOLD",3,Women/Shoes/Pumps,Charlotte Russe,23.0,0,Worn once to prom & have been collecting dust ...
710125,498583,For one NES Classic Edition FREE SHIPPIN,1,Vintage & Collectibles/Electronics/Video Game,Nintendo,180.0,1,For one NES Classic Edition In Hand Ready to s...
710126,1097977,Ring sz 5,1,Women/Jewelry/Rings,,14.0,0,Stainless steel ring new with box sz 5
710127,558119,"NYX Ombre Lip Duo, Hearts & Spades",1,Beauty/Makeup/Lips,NYX,6.0,1,"Brand new, never used. Bundle up for free ship..."


**Final training dataset**

In [14]:
# Shuffle the rows
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

# Generate random timestamps for created_at between 2022-01-01 and 2024-03-05
start_date = pd.to_datetime('2022-01-01')
end_date = pd.to_datetime('2024-03-05')

df_train['created_at'] = np.random.choice(pd.date_range(start=start_date, end=end_date, freq='T'), len(df_train))
df_train['last_updated_at'] = df_train['created_at']

# Sort the DataFrame based on created_at timestamp in ascending order
df_train = df_train.sort_values(by='created_at').reset_index(drop=True)

df_train

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,created_at,last_updated_at
0,152940,Bundle for Danielle Liu,1,Other/Office supplies/Shipping Supplies,,10.0,1,Brand new (5) Mini Rose Polymailers Lightweigh...,2022-01-01 00:00:00,2022-01-01 00:00:00
1,427984,Reserved for Mairiah. Cross Strap,1,Women/Athletic Apparel/Sports Bras,,8.0,1,Thanks for your purchase!!!!!,2022-01-01 00:01:00,2022-01-01 00:01:00
2,612245,Light Blue Maternity t-shirt,3,Women/Maternity/Tops & Blouses,Liz Lange,7.0,0,Light blue Liz Lange Maternity t-shirt. Size l...,2022-01-01 00:02:00,2022-01-01 00:02:00
3,812337,Bundle,1,Beauty/Bath & Body/Bath,,36.0,1,Disney princess and frog body glitter spray in...,2022-01-01 00:03:00,2022-01-01 00:03:00
4,153659,Dvd bundle set,3,Electronics/Media/DVD,,10.0,1,Dvd rob zombie 3 disc collector's set Dogma Su...,2022-01-01 00:04:00,2022-01-01 00:04:00
...,...,...,...,...,...,...,...,...,...,...
772401,1293239,Bath & Body Works NIB Gift Set FREESHIP!,1,Beauty/Bath & Body/Sets,Bath & Body Works,18.0,1,"NIB, never used or opened bottles. Comes with ...",2024-03-04 23:52:00,2024-03-04 23:52:00
772402,256914,Grinder,1,Home/Kitchen & Dining/Kitchen Utensils & Gadgets,,10.0,1,Awesome limited edition Gold 2.25 inch wide Fi...,2024-03-04 23:54:00,2024-03-04 23:54:00
772403,1298466,American Girl Costume,3,Kids/Toys/Dolls & Accessories,,12.0,1,Great for Halloween! In VGUC! Not AG brand. No...,2024-03-04 23:54:00,2024-03-04 23:54:00
772404,1065142,Adidas Track Jacket Size Small,3,Women/Athletic Apparel/Jackets,Adidas,24.0,0,Adidas track jacket worn 1-2 times 100% polyes...,2024-03-04 23:55:00,2024-03-04 23:55:00


In [16]:
#!pip install google-cloud-storage

In [27]:
from google.cloud import storage

path_to_private_key = '/home/bishal/mlops/project/Price_Alchemy/ringed-reserve-416823-b54bbb8174c1.json'
client = storage.Client.from_service_account_json(json_credentials_path=path_to_private_key)

In [32]:
#df_train.to_csv("df_train.csv")

# The bucket on GCS in which to write the CSV file
bucket = client.bucket('price_alchemy')
# The name assigned to the CSV file on GCS
blob = bucket.blob('Data/df_train.csv')
blob.upload_from_string(df_train.to_csv(), 'text/csv')

**Final holdout dataset**

In [33]:
# Shuffle the rows
df_holdout = df_holdout.sample(frac=1, random_state=42).reset_index(drop=True)
df_holdout.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,223519,Old Navy Exercise/Yoga Mat,4,Sports & Outdoors/Exercise/Yoga & Pilates,,11.0,0,Old Navy Exercise/Yoga Mat One end has water mark
1,647935,Rainbow Brite Hallmark Itty Bittys New,1,Kids/Toys/Stuffed Animals & Plush,,18.0,0,Brand new with tags Rainbow Brite and Sprite
2,586793,Bundle reserved,3,Women/Tops & Blouses/T-Shirts,FOREVER 21,16.0,0,No description yet
3,194841,Introducing The Original Club!,2,Other/Automotive/Tools & Equipment,,26.0,0,The Original Club. Made of SOLID STEEL! Time t...
4,269355,Plants vs Zombies Garden Warfare PS3,3,Electronics/Video Games & Consoles/Games,Sony,9.0,0,PS3 Game. Good condition. Blu-ray Disc. Case a...


In [34]:
blob = bucket.blob('Data/df_holdout.csv')
blob.upload_from_string(df_holdout.to_csv(), 'text/csv')