# EDA Part -1 :

This notebook contains basic exploration around the `Mercari Dataset`. 

In [1]:
# import essentials
import pandas as pd
import numpy as np

## Load the dataset:

In [2]:
# URL where data is dumped

gcp_url = 'https://storage.googleapis.com/price-alchemy/Mercari%20price%20suggestion%20challenge/train.tsv'

df = pd.read_csv(gcp_url, sep='\t', header=0)

What does our dataset look like?

In [3]:
df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [4]:
df.shape

(1482535, 8)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   train_id           1482535 non-null  int64  
 1   name               1482535 non-null  object 
 2   item_condition_id  1482535 non-null  int64  
 3   category_name      1476208 non-null  object 
 4   brand_name         849853 non-null   object 
 5   price              1482535 non-null  float64
 6   shipping           1482535 non-null  int64  
 7   item_description   1482531 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 90.5+ MB


In [6]:
# Randomly selecting 10% of categories and brands to hold out with a fixed random state
import random

# Set the random seed
random.seed(42)

# Select categories
unique_categories = df['category_name'].unique()
categories_to_hold_out = random.sample(list(unique_categories), int(0.1 * len(unique_categories)))

# Select brands
unique_brands = df['brand_name'].unique()
brands_to_hold_out = random.sample(list(unique_brands), int(0.1 * len(unique_brands)))

In [7]:
len(categories_to_hold_out)

128

In [8]:
len(brands_to_hold_out)

481

In [9]:
# Filter records to be held out
df_holdout = df[(df['category_name'].isin(categories_to_hold_out)) | (df['brand_name'].isin(brands_to_hold_out))]

df_holdout

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,0,Size small but straps slightly shortened to fi...
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,1,You get three pairs of Sophie cheer shorts siz...
14,14,HOLD for Dogs2016 Minnetonka boots,3,Women/Shoes/Boots,UGG Australia,43.0,0,Authentic. Suede fringe boots. Great condition...
18,18,"Too Faced Limited ""Merry Macaroons""",1,Beauty/Makeup/Makeup Palettes,Too Faced,25.0,1,This AUTHENTIC pallete by Too Faced is brand n...
24,24,Black and Red Baseball Tee,2,Women/Tops & Blouses/T-Shirts,FOREVER 21,10.0,0,lanascloset ~~~ description: never worn! ✨ i d...
...,...,...,...,...,...,...,...,...
1482487,1482487,Blazer bundle,2,Women/Suits & Blazers/Blazer,FOREVER 21,31.0,0,"White blazer- forever 21 size medium, fits sma..."
1482505,1482505,NorthFace rain jacket!,3,Women/Coats & Jackets/Raincoat,The North Face,34.0,0,great condition! a few signs of wear on the in...
1482506,1482506,KitchenAid 4.5 quart mixer,1,Home/Home Appliances/Kitchen Appliances,KitchenAid,165.0,0,Brand new sealed in box kitchen aid kitchenaid...
1482512,1482512,Lululemon,2,Women/Athletic Apparel/Shirts & Tops,Lululemon,34.0,0,NWOT - Blue - size 8


In [10]:
# Remove held-out records from the original DataFrame
df_train = df[~df.index.isin(df_holdout.index)]

df_train

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity
...,...,...,...,...,...,...,...,...
1482529,1482529,Men's UA [rm],2,Men/Athletic Apparel/Shirts & Tops,Under Armour,34.0,0,[rm] for the set both in perfect condition no ...
1482530,1482530,Free People Inspired Dress,2,Women/Dresses/Mid-Calf,Free People,20.0,1,"Lace, says size small but fits medium perfectl..."
1482532,1482532,21 day fix containers and eating plan,2,Sports & Outdoors/Exercise/Fitness accessories,,12.0,0,"Used once or twice, still in great shape."
1482533,1482533,World markets lanterns,3,Home/Home Décor/Home Décor Accents,,45.0,1,There is 2 of each one that you see! So 2 red ...


In [11]:
# Randomly sample 500K records from df_train
holdout_data = df_train.sample(n=300000, random_state=42)

holdout_data

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
58022,58022,Vs temptation perfume new,1,Beauty/Fragrance/Women,Victoria's Secret,14.0,0,Vs temptation perfume 8.4oz new
1472596,1472596,Kylie Cosmetics Liquid Lipstick Koko K,2,Beauty/Makeup/Lips,Kylie Cosmetics,16.0,0,Not used No box No lip Liner Price Firm
173387,173387,TC LULAROE LEGGINGS,2,Women/Jeans/Leggings,LuLaRoe,12.0,0,Brand new never worn! Please check out my othe...
612079,612079,Volcom Shorts,2,Women/Athletic Apparel/Shorts,Volcom,10.0,0,"Size 00 never worn, bought at pacsun"
149163,149163,Enfamil Formula Coupons,1,Kids/Feeding/Bottle-Feeding,Enfamil,7.0,1,[rm] value for formula 1- expires February 28 ...
...,...,...,...,...,...,...,...,...
956296,956296,Vintage Folding Fan Necklace & Ring,3,Women/Jewelry/Necklaces,,9.0,0,Vintage Chinese Folding fan necklace plus free...
200239,200239,Toddler socks,3,Kids/Boys 2T-5T/Other,,10.0,0,These are toddler socks in a ok condition
496175,496175,Tan Shorts,2,Women/Athletic Apparel/Shorts,Merona,13.0,0,"Beige, tan, khaki shorts. Comfy. Worn once."
97648,97648,Unlocked iPhone 6,3,Electronics/Cell Phones & Accessories/Cell Pho...,Apple,250.0,1,White unlocked iPhone 6. Great condition. No c...


In [12]:
# Remove sampled records from df_train
df_train = df_train[~df_train.index.isin(holdout_data.index)]
df_train

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.0,0,Girls Size small Plus green. Three shorts total.
...,...,...,...,...,...,...,...,...
1482527,1482527,Blk/white ribbed mock neck bodysuit M,1,Women/Tops & Blouses/Blouse,,10.0,1,Brand new black and white ribbed mock neck bod...
1482528,1482528,Victoria's Secret Tankini Sz. Large,2,Women/Athletic Apparel/Sports Bras,Victoria's Secret,18.0,1,Purple and Paisley Victoria's Secret Tankini S...
1482529,1482529,Men's UA [rm],2,Men/Athletic Apparel/Shirts & Tops,Under Armour,34.0,0,[rm] for the set both in perfect condition no ...
1482530,1482530,Free People Inspired Dress,2,Women/Dresses/Mid-Calf,Free People,20.0,1,"Lace, says size small but fits medium perfectl..."


In [13]:
# Add sampled records to df_holdout
df_holdout = pd.concat([df_holdout, holdout_data], ignore_index=True)
df_holdout

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,0,Size small but straps slightly shortened to fi...
1,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,1,You get three pairs of Sophie cheer shorts siz...
2,14,HOLD for Dogs2016 Minnetonka boots,3,Women/Shoes/Boots,UGG Australia,43.0,0,Authentic. Suede fringe boots. Great condition...
3,18,"Too Faced Limited ""Merry Macaroons""",1,Beauty/Makeup/Makeup Palettes,Too Faced,25.0,1,This AUTHENTIC pallete by Too Faced is brand n...
4,24,Black and Red Baseball Tee,2,Women/Tops & Blouses/T-Shirts,FOREVER 21,10.0,0,lanascloset ~~~ description: never worn! ✨ i d...
...,...,...,...,...,...,...,...,...
510124,956296,Vintage Folding Fan Necklace & Ring,3,Women/Jewelry/Necklaces,,9.0,0,Vintage Chinese Folding fan necklace plus free...
510125,200239,Toddler socks,3,Kids/Boys 2T-5T/Other,,10.0,0,These are toddler socks in a ok condition
510126,496175,Tan Shorts,2,Women/Athletic Apparel/Shorts,Merona,13.0,0,"Beige, tan, khaki shorts. Comfy. Worn once."
510127,97648,Unlocked iPhone 6,3,Electronics/Cell Phones & Accessories/Cell Pho...,Apple,250.0,1,White unlocked iPhone 6. Great condition. No c...


**Final training dataset**

In [14]:
# Shuffle the rows
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

# Generate random timestamps for created_at between 2022-01-01 and 2024-03-05
start_date = pd.to_datetime('2022-01-01')
end_date = pd.to_datetime('2024-03-05')

df_train['created_at'] = np.random.choice(pd.date_range(start=start_date, end=end_date, freq='T'), len(df_train))
df_train['last_updated_at'] = df_train['created_at']

# Sort the DataFrame based on created_at timestamp in ascending order
df_train = df_train.sort_values(by='created_at').reset_index(drop=True)

df_train

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,created_at,last_updated_at
0,793697,Plaid Vest,2,Women/Coats & Jackets/Vest,Old Navy,11.0,1,Green and blue. Very thick and soft! Perfect f...,2022-01-01 00:00:00,2022-01-01 00:00:00
1,402094,Women's Sperrys,3,Women/Shoes/Loafers & Slip-Ons,Sperrys,21.0,0,EUC,2022-01-01 00:01:00,2022-01-01 00:01:00
2,522439,Grey sweater dress,1,Women/Dresses/Other,Fashion Nova,20.0,1,This is a heather grey sweater dress from fash...,2022-01-01 00:01:00,2022-01-01 00:01:00
3,214455,Tory Burch 'Perry' Leather Wallet,3,Women/Women's Accessories/Wallets,Tory Burch,91.0,0,Tory Burch 'Perry' Leather Zip Continental Wal...,2022-01-01 00:03:00,2022-01-01 00:03:00
4,902755,Fujifilm Rainbow Instax Film,1,Electronics/Cameras & Photography/Film Photogr...,Fuji,14.0,0,No description yet,2022-01-01 00:05:00,2022-01-01 00:05:00
...,...,...,...,...,...,...,...,...,...,...
972401,700766,LF Floral Hooded Flannel,3,Women/Sweaters/Hooded,LF,154.0,0,LF Floral Hooded Flannel. One Size. EUC - blue...,2024-03-04 23:53:00,2024-03-04 23:53:00
972402,1277823,Rae Dunn FAMILY & FEAST Mugs,1,Home/Kitchen & Dining/Coffee & Tea Accessories,,24.0,0,This is the exact item you will receive. I fin...,2024-03-04 23:57:00,2024-03-04 23:57:00
972403,1446057,‼️LAST CHANCE TO GET‼️,3,"Women/Dresses/Above Knee, Mini",Brandy Melville,24.0,0,Cream and grey floral dress with low cut back,2024-03-04 23:57:00,2024-03-04 23:57:00
972404,222392,⭐️OPI - Race Red⭐️,1,Beauty/Makeup/Nails,,7.0,1,❤️Brand New • Full Size • Authentic❤️ ✨In this...,2024-03-04 23:58:00,2024-03-04 23:58:00


In [16]:
#!pip install google-cloud-storage

In [15]:
from google.cloud import storage

path_to_private_key = '/home/bishal/mlops/project/Price_Alchemy/ringed-reserve-416823-b54bbb8174c1.json'
client = storage.Client.from_service_account_json(json_credentials_path=path_to_private_key)

In [16]:
#df_train.to_csv("df_train.csv")

# The bucket on GCS in which to write the CSV file
bucket = client.bucket('price_alchemy')
# The name assigned to the CSV file on GCS
blob = bucket.blob('Data/data.csv')
blob.upload_from_string(df_train.to_csv(index=False), 'text/csv')

**Final holdout dataset**

In [17]:
# Shuffle the rows
df_holdout = df_holdout.sample(frac=1, random_state=42).reset_index(drop=True)
df_holdout.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,530625,Kate Spade Sunglasses,2,Women/Women's Accessories/Sunglasses,Kate Spade,40.0,0,Kate Spade Cat Eye Sunglasses. Color black wit...
1,876172,Faux piercings ⚡️,1,Handmade/Jewelry/Jewelry,,4.0,1,Faux Nose Rings!! •Now you can look stylish wi...
2,919501,Women's Columbia Jacket,2,Women/Athletic Apparel/Jackets,Columbia,30.0,0,"Women's black, medium Columbia Jacket. Excelle..."
3,820303,Doctor Strange #166,3,Vintage & Collectibles/Book/Comics,,10.0,1,No description yet
4,186019,iPhone 5s waterproof case,3,"Electronics/Cell Phones & Accessories/Cases, C...",,10.0,1,Blue iPhone 5s/5c/5 case good condition. Teste...


In [52]:
df_train.shape

(972406, 10)

In [63]:
import csv
import math

df_train_dump = df_train.copy()

batch_size = 200000
total_rows = len(df_train_dump)

# The bucket on GCS in which to write the CSV files
bucket = client.bucket('price_alchemy')

# Calculate the number of batches
num_batches = math.ceil(total_rows / batch_size)
num_batches

5

In [64]:
# Add a new column 'index_column' starting from 1
df_train_dump.insert(0, 'id', range(1, len(df_train_dump) + 1))
df_train_dump.head()

Unnamed: 0,id,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,created_at,last_updated_at
0,1,793697,Plaid Vest,2,Women/Coats & Jackets/Vest,Old Navy,11.0,1,Green and blue. Very thick and soft! Perfect f...,2022-01-01 00:00:00,2022-01-01 00:00:00
1,2,402094,Women's Sperrys,3,Women/Shoes/Loafers & Slip-Ons,Sperrys,21.0,0,EUC,2022-01-01 00:01:00,2022-01-01 00:01:00
2,3,522439,Grey sweater dress,1,Women/Dresses/Other,Fashion Nova,20.0,1,This is a heather grey sweater dress from fash...,2022-01-01 00:01:00,2022-01-01 00:01:00
3,4,214455,Tory Burch 'Perry' Leather Wallet,3,Women/Women's Accessories/Wallets,Tory Burch,91.0,0,Tory Burch 'Perry' Leather Zip Continental Wal...,2022-01-01 00:03:00,2022-01-01 00:03:00
4,5,902755,Fujifilm Rainbow Instax Film,1,Electronics/Cameras & Photography/Film Photogr...,Fuji,14.0,0,No description yet,2022-01-01 00:05:00,2022-01-01 00:05:00


In [65]:
df_train_dump.tail()

Unnamed: 0,id,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,created_at,last_updated_at
972401,972402,700766,LF Floral Hooded Flannel,3,Women/Sweaters/Hooded,LF,154.0,0,LF Floral Hooded Flannel. One Size. EUC - blue...,2024-03-04 23:53:00,2024-03-04 23:53:00
972402,972403,1277823,Rae Dunn FAMILY & FEAST Mugs,1,Home/Kitchen & Dining/Coffee & Tea Accessories,,24.0,0,This is the exact item you will receive. I fin...,2024-03-04 23:57:00,2024-03-04 23:57:00
972403,972404,1446057,‼️LAST CHANCE TO GET‼️,3,"Women/Dresses/Above Knee, Mini",Brandy Melville,24.0,0,Cream and grey floral dress with low cut back,2024-03-04 23:57:00,2024-03-04 23:57:00
972404,972405,222392,⭐️OPI - Race Red⭐️,1,Beauty/Makeup/Nails,,7.0,1,❤️Brand New • Full Size • Authentic❤️ ✨In this...,2024-03-04 23:58:00,2024-03-04 23:58:00
972405,972406,151885,Boho Cardigan Vest,2,Women/Sweaters/Cardigan,,12.0,0,This cardigan is beautiful and can make any ou...,2024-03-04 23:58:00,2024-03-04 23:58:00


In [66]:
for batch_num in range(1, num_batches + 1):
    # Calculate start and end indices for the current batch
    start_index = (batch_num - 1) * batch_size
    end_index = min(batch_num * batch_size, total_rows)

    # Create a DataFrame slice for the current batch
    df_batch = df_train_dump.iloc[start_index:end_index].copy()

    # The name assigned to the CSV file on GCS
    blob = bucket.blob(f'data_dump_sql/data_{batch_num}.csv')
    
    # Upload the current batch to GCS
    blob.upload_from_string(df_batch.to_csv(header=False, index=False), 'text/csv')

In [None]:
# df_train_dump = df_train.copy()

# # Add a new column 'index_column' starting from 1
# df_train_dump.insert(0, 'id', range(1, len(df_test) + 1))

# # The bucket on GCS in which to write the CSV file
# bucket = client.bucket('price_alchemy')
# # The name assigned to the CSV file on GCS
# blob = bucket.blob('Data/data_1.csv')
# blob.upload_from_string(df_train_dump.to_csv(header=False, index=False), 'text/csv')

In [59]:
# blob = bucket.blob('Data/holdout_data.csv')
# blob.upload_from_string(df_holdout.to_csv(index=False), 'text/csv')

In [60]:
# df_test = df_train.head()
# df_test.dtypes

In [61]:
# import csv

# df_test = df_train.head(100)

# # Add a new column 'index_column' starting from 1
# df_test.insert(0, 'id', range(1, len(df_test) + 1))

# # The bucket on GCS in which to write the CSV file
# bucket = client.bucket('price_alchemy')
# # The name assigned to the CSV file on GCS
# blob = bucket.blob('Data/test_data_gcp.csv')
# blob.upload_from_string(df_test.to_csv(header=False, index=False, quoting=csv.QUOTE_MINIMAL), 'text/csv')


**Holdout set primary key add**

In [68]:
# URL where data is dumped

holdout_url = 'https://storage.googleapis.com/price_alchemy/Data/holdout_data.csv'

df_holdout = pd.read_csv(holdout_url, header=0)
df_holdout

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,530625,Kate Spade Sunglasses,2,Women/Women's Accessories/Sunglasses,Kate Spade,40.0,0,Kate Spade Cat Eye Sunglasses. Color black wit...
1,876172,Faux piercings ⚡️,1,Handmade/Jewelry/Jewelry,,4.0,1,Faux Nose Rings!! •Now you can look stylish wi...
2,919501,Women's Columbia Jacket,2,Women/Athletic Apparel/Jackets,Columbia,30.0,0,"Women's black, medium Columbia Jacket. Excelle..."
3,820303,Doctor Strange #166,3,Vintage & Collectibles/Book/Comics,,10.0,1,No description yet
4,186019,iPhone 5s waterproof case,3,"Electronics/Cell Phones & Accessories/Cases, C...",,10.0,1,Blue iPhone 5s/5c/5 case good condition. Teste...
...,...,...,...,...,...,...,...,...
510124,55846,Black Heels,2,Women/Shoes/Pumps,,15.0,1,Only worn once !
510125,821700,Elementary & Intermediate Algebra 6th Ed,3,Other/Books/Education & Teaching,,25.0,1,College textbook with zero to minimal writing ...
510126,931820,Lululemon Kung Fu Pants Size M,3,Men/Athletic Apparel/Pants,Lululemon,50.0,1,Size M regular Great condition! Only worn a co...
510127,1036984,Acacia Heliconia Kekaha Top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,60.0,0,"Size medium , few fuzzies but perfect conditio..."


In [70]:
# Add a new column 'index_column' starting from 1
df_holdout.insert(0, 'id', range(1, len(df_holdout) + 1))
df_holdout.head()

Unnamed: 0,id,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,1,530625,Kate Spade Sunglasses,2,Women/Women's Accessories/Sunglasses,Kate Spade,40.0,0,Kate Spade Cat Eye Sunglasses. Color black wit...
1,2,876172,Faux piercings ⚡️,1,Handmade/Jewelry/Jewelry,,4.0,1,Faux Nose Rings!! •Now you can look stylish wi...
2,3,919501,Women's Columbia Jacket,2,Women/Athletic Apparel/Jackets,Columbia,30.0,0,"Women's black, medium Columbia Jacket. Excelle..."
3,4,820303,Doctor Strange #166,3,Vintage & Collectibles/Book/Comics,,10.0,1,No description yet
4,5,186019,iPhone 5s waterproof case,3,"Electronics/Cell Phones & Accessories/Cases, C...",,10.0,1,Blue iPhone 5s/5c/5 case good condition. Teste...


In [71]:
    # The name assigned to the CSV file on GCS
    blob = bucket.blob('data_dump_sql/holdout_data.csv')
    
    # Upload the current batch to GCS
    blob.upload_from_string(df_holdout.to_csv(header=False, index=False), 'text/csv')