<a href="https://colab.research.google.com/github/manju1201/Flipkart_Data_Classification_using_Description/blob/main/2_Cleaning_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports



In [1]:
import numpy as np 
import pandas as pd 
import re 
import nltk 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

from gensim.models.fasttext import FastText
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

# Fetch the NLTK resources used by this notebook, then cache the English
# stop-word list as a set for fast membership checks.
for _resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(_resource)
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Loading Data

In [2]:
# Load the CSV produced by the previous (post data-analysis) step.
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive;
# no mount cell is visible in this notebook — confirm before a fresh run.
data = pd.read_csv('/content/drive/MyDrive/MIDAS/1_Post_Data_Analysis_flipkart_com-ecommerce_sample.csv')

Function to preprocess the text

In [3]:
# NOTE: despite its name this is a WordNet *lemmatizer*, not a stemmer; the name is
# kept so that any other cell referring to `stemmer` keeps working.
stemmer = WordNetLemmatizer()


def preprocess_string(text):
    """Normalise free text into a cleaned, space-separated string of tokens.

    Pipeline:
      1. cast the input to ``str``;
      2. replace every run of characters that are not ASCII letters with one space;
      3. lower-case and split on whitespace;
      4. drop English stop words (``en_stop``);
      5. lemmatize each remaining token with WordNet;
      6. drop stop words again and drop tokens of two characters or fewer.

    Stop words are removed *before* lemmatization as well as after it: the
    lemmatizer can turn a stop word into a different token (e.g. "does" ->
    "doe") which would then slip past a stop-word filter applied only afterwards.

    Parameters
    ----------
    text : object
        Raw text. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # One raw-string pattern replaces the former three passes (\W, [^a-z\s]+, \s+):
    # anything that is not a letter becomes a separator, and split() below already
    # collapses repeated whitespace. Raw strings avoid invalid-escape warnings.
    text = re.sub(r'[^a-z]+', ' ', str(text), flags=re.IGNORECASE)
    tokens = text.lower().split()
    tokens = [word for word in tokens if word not in en_stop]
    tokens = [stemmer.lemmatize(word) for word in tokens]
    # Second stop-word pass catches lemmas that are themselves stop words;
    # the length filter discards fragments of one or two characters.
    tokens = [word for word in tokens if word not in en_stop and len(word) > 2]
    return ' '.join(tokens)

In [4]:
# Peek at the first row to see the available columns before any cleaning.
data.head(1)

Unnamed: 0.1,Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications,crawl_year,Month,cat_level_1,cat_level_2,cat_level_3,cat_level_4,cat_level_5,cat_level_6,discounted_percentage
0,0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23+00:00,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",2016,3,Clothing,Women's Clothing,"Lingerie, Sleep & Swimwear",Shorts,Alisha Shorts,"Alisha Solid Women's Cycling Shorts""",62.1


# Dropping the null values and preprocessing the description column 

In [5]:
data['description'].isna().sum()  # how many rows have no description at all

2

In [6]:
# Drop the rows whose 'description' is null; they carry no text to classify.
# Note that `data` is rebound here, so the raw frame is no longer available afterwards.
data = data.dropna(subset=['description'])

In [7]:
# Replace every description with its cleaned form (see preprocess_string).
# This overwrites the column, so re-running the cell cleans already-cleaned text.
data['description'] = data['description'].apply(preprocess_string)

In [8]:
data['description'] # visual check that the column now holds the cleaned text

0        key feature alisha solid woman cycling short c...
1        fabhomedecor fabric double sofa bed finish col...
2        key feature belly sandal wedge heel casuals be...
3        key feature alisha solid woman cycling short c...
4        specification sicons purpose arnica dog shampo...
                               ...                        
19995    buy walldesign small vinyl sticker online wall...
19996    buy wallmantra large vinyl sticker sticker onl...
19997    buy elite collection medium acrylic sticker on...
19998    buy elite collection medium acrylic sticker on...
19999    buy elite collection medium acrylic sticker on...
Name: description, Length: 19998, dtype: object

# Finding the number of unique labels and "None" values in each of the 6 split category levels

In [9]:
# Show the first row again: 'description' now holds the cleaned text, and the
# cat_level_* columns inspected below are visible.
data.head(1)

Unnamed: 0.1,Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications,crawl_year,Month,cat_level_1,cat_level_2,cat_level_3,cat_level_4,cat_level_5,cat_level_6,discounted_percentage
0,0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23+00:00,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,key feature alisha solid woman cycling short c...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ...",2016,3,Clothing,Women's Clothing,"Lingerie, Sleep & Swimwear",Shorts,Alisha Shorts,"Alisha Solid Women's Cycling Shorts""",62.1


In [10]:
# Frequency table for cat_level_1, computed once and reused.
level_1_counts = data['cat_level_1'].value_counts()
print(level_1_counts.to_dict())  # label -> number of rows
print(len(level_1_counts))       # number of distinct labels
int((data['cat_level_1'] == 'None').sum())  # rows whose label is the literal string 'None'

{'Clothing': 6197, 'Jewellery': 3531, 'Footwear': 1227, 'Mobiles & Accessories': 1099, 'Automotive': 1012, 'Home Decor & Festive Needs': 929, 'Beauty and Personal Care': 710, 'Home Furnishing': 699, 'Kitchen & Dining': 647, 'Computers': 578, 'Watches': 530, 'Baby Care': 483, 'Tools & Hardware': 391, 'Toys & School Supplies': 330, 'Pens & Stationery': 313, 'Bags, Wallets & Belts': 265, 'Furniture': 180, 'Sports & Fitness': 166, 'Cameras & Accessories': 82, 'Home Improvement': 81, 'Health & Personal Care Appliances': 43, 'Sunglasses': 35, 'Gaming': 35, 'Pet Supplies': 30, 'Home & Kitchen': 24, 'Home Entertainment': 19, 'eBooks': 15, 'Eyewear': 10, 'Clovia Women\'s Full Coverage Bra"': 9, 'Lilliput Top Baby Girl\'s  Combo"': 8, 'Vishudh Printed Women\'s Straight Kurta"': 8, 'Olvin Aviator Sunglasses"': 7, 'Clovia Women\'s T-Shirt Bra"': 6, 'MASARA Solid Women\'s Straight Kurta"': 5, 'Sunglasses"': 5, 'FEET FLOW Women Flats"': 4, 'Pu-Good Women Flats"': 4, 'Household Supplies': 4, 'Olvin W

0

Category level 1 contains 266 labels and 0 None values

In [11]:
# Frequency table for cat_level_2, computed once and reused.
level_2_counts = data['cat_level_2'].value_counts()
print(level_2_counts.to_dict())  # label -> number of rows
print(len(level_2_counts))       # number of distinct labels
int((data['cat_level_2'] == 'None').sum())  # rows whose label is the literal string 'None'

{"Women's Clothing": 3900, "Men's Clothing": 1773, 'Necklaces & Chains': 1606, 'Accessories & Spare parts': 925, 'Tablet Accessories': 801, "Women's Footwear": 781, 'Bangles, Bracelets & Armlets': 724, 'Rings"': 710, 'Wrist Watches': 523, "Kids' Clothing": 520, 'Tools': 403, 'Fragrances': 391, 'Laptop Accessories': 343, "Men's Footwear": 340, 'None': 328, 'Mobile Accessories': 304, 'Showpieces': 303, 'Coffee Mugs': 302, 'School Supplies': 300, 'Wall Decor & Clocks': 278, 'Bed Linen': 217, 'Baby & Kids Gifts': 203, 'Network Components': 202, 'Bags': 195, 'Infant Wear': 187, 'Curtains & Accessories': 162, 'Jewellery Sets"': 161, 'Table Decor & Handicrafts': 150, 'Office Supplies': 140, 'Accessories': 114, "Kids' & Infant Footwear": 105, 'Bath Linen': 105, 'Cookware': 102, 'Cushions, Pillows & Covers': 94, 'Earrings"': 93, 'Decorative Lighting & Lamps': 87, 'Combos and Kits': 84, 'Camera Accessories': 82, 'Makeup': 78, 'Pet Furniture"': 76, 'Kitchen Tools': 74, 'Body and Skin Care': 74, '

328

Category level 2 contains 224 labels but 328 "None" values

In [12]:
# Frequency table for cat_level_3, computed once and reused.
level_3_counts = data['cat_level_3'].value_counts()
print(level_3_counts.to_dict())  # label -> number of rows
print(len(level_3_counts))       # number of distinct labels
int((data['cat_level_3'] == 'None').sum())  # rows whose label is the literal string 'None'

{'Western Wear': 1980, 'Necklaces"': 1567, 'None': 1457, 'Lingerie, Sleep & Swimwear': 1208, 'T-Shirts': 903, 'Cases & Covers': 796, 'Car Interior & Exterior': 677, 'Ethnic Wear': 485, 'Casual Shoes': 454, 'Bangles"': 430, 'Deodorants': 388, 'Gardening Tools': 343, 'Girls Wear': 287, 'Bracelets"': 251, 'Shirts': 234, 'Winter & Seasonal Wear': 225, 'Geometry & Pencil Boxes': 221, 'Routers': 199, 'Stickers': 194, 'Blankets, Quilts & Dohars"': 190, 'Wedges"': 186, 'Ethnic': 172, 'Boys Wear': 169, 'USB Gadgets': 154, 'Accessories & Combo Sets': 135, 'Curtains"': 131, 'Wall Decals & Stickers': 122, 'Brooches': 113, 'Showpieces': 112, 'Spares & Performance Parts': 111, "Baby Boys' Clothes": 99, 'Pots & Pans': 98, 'Heels"': 97, 'Prithish Coffee Mugs"': 94, 'Batteries': 93, 'Towels"': 93, 'Printland Coffee Mugs"': 89, "Baby Girls' Clothes": 88, 'Flats"': 87, 'Hand Bags': 85, 'Backpacks': 78, 'Inner Wear & Sleep Wear': 75, 'Clocks': 74, 'Fusion Wear': 73, 'Maxima Wrist Watches"': 73, 'For Boys'

1457

Category level 3 contains 900 labels and about 1457 "None" values

In [13]:
# Frequency table for cat_level_4, computed once and reused.
level_4_counts = data['cat_level_4'].value_counts()
print(level_4_counts.to_dict())  # label -> number of rows
print(len(level_4_counts))       # number of distinct labels
int((data['cat_level_4'] == 'None').sum())  # rows whose label is the literal string 'None'

{'None': 5875, 'Shirts, Tops & Tunics': 1248, 'Bras': 1036, 'Car Interior': 659, 'Dresses & Skirts': 620, 'Combos': 375, 'Plant Containers & Sets': 333, 'TheLostPuppy Cases & Covers"': 229, 'Leggings & Jeggings': 209, 'Kurtas & Kurtis': 202, 'DailyObjects Cases & Covers"': 144, 'Numero Uno T-Shirts"': 135, 'Formal Shirts': 128, 'Fabric': 122, 'Boots"': 118, 'Casual & Party Wear Shirts': 106, 'Enthopia Cases & Covers"': 101, 'Sweatshirts': 92, 'Ethnic Wear': 90, 'Woks & Kadhais': 89, 'Car Spare Parts': 88, 'Loafers"': 87, 'Oviyon T-Shirts"': 87, 'Pencil Boxes': 85, 'Okane T-Shirts"': 82, 'Religious Idols': 80, 'Wallmantra Stickers"': 75, 'Wall Clocks': 74, 'Sweaters': 71, 'DeStudio Wall Decals & Stickers"': 71, 'Ties': 70, 'WallDesign Stickers"': 65, 'Pizza Cutters': 61, 'Dungarees & Jumpsuits': 58, 'Polos & T-Shirts': 51, 'Nimya T-Shirts"': 50, 'Baby Boys': 49, 'Innerwear & Sleepwear': 47, 'Car Mobile Accessories': 46, 'Northern Lights T-Shirts"': 46, 'Camisoles & Slips': 45, 'Ocean Ra

5875

Category level 4 contains 2372 labels and 5875 "None" values, i.e. roughly 29% of the given data

In [14]:
# Frequency table for cat_level_5, computed once and reused.
level_5_counts = data['cat_level_5'].value_counts()
print(level_5_counts.to_dict())  # label -> number of rows
print(len(level_5_counts))       # number of distinct labels
int((data['cat_level_5'] == 'None').sum())  # rows whose label is the literal string 'None'

{'None': 10641, 'Shirts': 652, 'Dresses': 534, 'Tops': 523, 'Car Mats"': 522, 'Plant Container Sets"': 332, 'S4S Bras"': 197, 'Leggings & Jeggings': 184, 'Kurtis': 165, 'Grafion Bras"': 101, 'Polos & T-Shirts': 86, 'Skirts': 77, 'Blouse Material': 70, 'Ploomz Bras"': 70, 'Car Sun Shades': 68, 'Tia by Ten on Ten Bras"': 51, 'Kurtas': 48, 'Dungarees': 47, 'Vehicle Horns"': 47, 'Car Mobile Holders"': 46, 'Ethnic Sets': 41, 'Trousers': 41, 'Regent Wall Clocks"': 40, 'Denver Combos"': 39, 'Jeans': 39, 'Younky Bras"': 37, 'Vivity Bras"': 37, 'Tunics': 36, 'Shorts': 35, 'Vaishna Bras"': 35, 'Glus Bras"': 31, 'Candy House Solid Men\'s Polo Neck T-Shirt (Pack ..."': 30, 'Nike Combos"': 30, 'Vehicle Mirrors': 30, 'Status Bras"': 29, 'AdroitZ Mobile Holders': 28, 'GetAbhi Ties"': 27, 'Yardley Combos"': 27, 'Park Avenue Combos"': 26, 'Urbaano Bras"': 26, 'Engage Combos"': 25, 'Lovinoform Bras"': 25, 'Smartpro 19.5v,3.9a Replacement Charger for Vaio..."': 25, 'Alvaro Ties"': 23, 'Simrit Night Dress

10641

Category level 5 contains 2638 labels and 10641 "None" values, i.e. more than half of the given data. So we can reject it as a candidate for the primary label.

# Prediction of level 1 category with 266 labels



*   The other category levels contain a large number of labels and many "None" values.
*   Therefore the level 1 category is used as the primary category here.



In [15]:
# Primary Category processing.
# All 266 cat_level_1 labels are used for this baseline prediction.
# Labels ordered by row count (descending); 'uniq_id' is just the column used for counting.
sort_level_1 = list(
    data.groupby('cat_level_1').count().sort_values(by='uniq_id', ascending=False).index
)
# Keep only the label and text columns. A single .loc selection followed by .copy()
# yields an independent frame, so the column assignments below do not write into a
# chained-indexing slice of `data` (the SettingWithCopy pattern).
processed_df = data.loc[
    data['cat_level_1'].isin(sort_level_1), ['cat_level_1', 'description']
].copy()
# NOTE(review): `data['description']` has already been passed through
# preprocess_string earlier in this notebook; this second pass is kept so results
# are unchanged, but it looks redundant — confirm and drop if so.
processed_df['description'] = processed_df['description'].astype('str').apply(preprocess_string)
# Encode the target labels as integers 0 .. n_classes-1; `le` is kept so the
# integer codes can be mapped back with le.inverse_transform.
le = preprocessing.LabelEncoder()
category_encoded = le.fit_transform(processed_df['cat_level_1'])
processed_df['cat_level_1'] = category_encoded

In [16]:
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and therefore the reported scores, are reproducible across kernel restarts.
# NOTE(review): stratify= is not used; it would fail while some labels have a single row.
X_train, X_test, y_train, y_test = train_test_split(
    processed_df['description'],
    processed_df['cat_level_1'],
    test_size=0.2,
    random_state=42,
)

In [17]:
# Bag-of-words features: the vocabulary is learned from the training split only
# and then reused unchanged to encode the test split.
vect = CountVectorizer(stop_words='english')
X_train_matrix = vect.fit_transform(X_train)
X_test_matrix = vect.transform(X_test)

# Multinomial Naive Bayes baseline.
clf = MultinomialNB()
clf.fit(X_train_matrix, y_train)

# Accuracy in percent: training split first, then the held-out split.
for features, labels in ((X_train_matrix, y_train), (X_test_matrix, y_test)):
    print(clf.score(features, labels) * 100)

# Per-class precision / recall / F1 on the held-out split.
predicted_result = clf.predict(X_test_matrix)
print(classification_report(y_test, predicted_result))

93.63045380672584
91.325
              precision    recall  f1-score   support

           2       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         1
          21       0.91      0.99      0.95       218
          26       0.86      0.66      0.75       100
          27       0.69      0.48      0.57        52
          29       0.87      0.95      0.91       137
          33       0.00      0.00      0.00         1
          36       0.00      0.00      0.00         1
          37       1.00      0.25      0.40        16
          38       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         1
          43       0.96      0.99      0.98      1192
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Cleaning cat_level_1
*   These lists contain keywords taken from the labels; labels containing them are clubbed together to arrive at 27 labels.
*   This work was done manually by looking at the primary category column.


---


Relabeling the cat_level_1 into 27 labels

In [18]:
# Keyword lists, one per target category. The relabelling cell below splits each
# cat_level_1 value on whitespace and tests every resulting word for exact,
# case-sensitive membership in these lists; the first word that matches decides the label.
# NOTE(review): 'Viscose ' (trailing space) and 'Round Neck' (embedded space) can never
# equal a whitespace-split word, so they are effectively dead entries.
Clothing = ['Socks','Jeans','Viscose ','Top','Kurta','Bra','Blazer','Shorts','Gloves','Lingerie','Sweater','Jacket','Trousers','Jumpsuit','Brief','Cotton','Creations',
            'T-Shirt','Camisole',"Bottom",'Leggings','Stole','Boxer','Cufflink','Round Neck','Dress','Sari','Vest','Panty','Semi-stitched','Neck','Silk','Girls','Sleeve','Kurti','Printed']
Sunglasses =['Eyewear','Sunglasses']
Jewellery = ['Jewels','Rings','Bangle','Swarovski']
Bags_Wallets_Belts = ['Clutch','Pouch','Backpack','Zixtro']
Footwear = ['Flats','Bellies','Shoes','Slippers','Wedges','Lace','Shoes,']
Automotive = ['Hyundai','horn','Car','Mirror','Side','Stand', 'Automation', 'Enfield', 'Bajaj', 'Renault', 'Vehicles','Maruti','Arm','Honda','BikerZ']
Pens_Stationery = ['Pen','Notebook', 'Paper', 'Compartments']
Kitchen_Dining = ['Bowl', 'Cooler','Oddy','Grocery','Glass']
Computers = ['Keyboard', 'Sound','Mixer','Surge']
Home_Furnishing = ['Floor','Mat','Table','Cover','Furnishing']
Furniture = ['Mattress', 'Sofa']
Beauty_and_Personal_Care = ['Boreal', 'Roller', 'Brush','Foundation', 'Hair', 'Band', 'Clip','Wella','Shaving']
Home_Decor_Festive_Needs = ['Tapestry', 'Plant', 'Showpiece', 'Lantern', 'Seed', 'Candle', 'Herb','Fragrance','Sugandh']
# NOTE(review): 'Hair' also appears in Beauty_and_Personal_Care, which the relabelling
# cell checks first, so the 'Hair' entry below never decides a label.
Health_Personal_Care_Appliances = ['Hair','Dryer', 'Nail' ,'Cutter','Nutrition']
Baby_Care = ['Walker']
Sports_Fitness = ['Thigh','Guard','Toe']
Cameras_Accessories = ['Binoculars','Vanguard','Pia']
Gaming = ['Game']
Mobiles = ['Headset', 'Tablet','Samsung','Blackberry','Planet','Charging']
tools = ['PUMP','Dremel','Betagard']
Home_Kitchen = ['Water']
Home_Improvement = ['BuildTrack','Cartridge','Household','Faucet']
Watches = ['Wearable']

The row with index 3031 could not be clubbed into any of the primary categories, so it is dropped.

In [19]:
# Row 3031 is the only row whose label cannot be clubbed into one of the 27 main labels.
# errors='ignore' keeps the cell re-runnable: without it a second execution raises
# KeyError because index 3031 is already gone.
data = data.drop(index=3031, errors='ignore')

Renaming the labels 
*   The primary category (cat_level_1) contains about 266 labels.
*   Labels that hold only a handful of items (mostly product names) are renamed to the appropriate main label.
*   In the end, 27 main labels remain.

In [20]:
# Ordered (keywords, target label) rules. Order matters: for a given word the first
# rule whose keyword list contains it wins (e.g. 'Hair' resolves to Beauty and
# Personal Care, which precedes Health & Personal Care Appliances).
_RELABEL_RULES = [
    (Clothing, 'Clothing'),
    (Sunglasses, 'Sunglasses'),
    (Jewellery, 'Jewellery'),
    (Bags_Wallets_Belts, 'Bags, Wallets & Belts'),
    (Footwear, 'Footwear'),
    (Automotive, 'Automotive'),
    (Pens_Stationery, 'Pens & Stationery'),
    (Kitchen_Dining, 'Kitchen & Dining'),
    (Computers, 'Computers'),
    (Home_Furnishing, 'Home Furnishing'),
    (Furniture, 'Furniture'),
    (Beauty_and_Personal_Care, 'Beauty and Personal Care'),
    (Home_Decor_Festive_Needs, 'Home Decor & Festive Needs'),
    (Health_Personal_Care_Appliances, 'Health & Personal Care Appliances'),
    (Baby_Care, 'Baby Care'),
    (Sports_Fitness, 'Sports & Fitness'),
    (Cameras_Accessories, 'Cameras & Accessories'),
    (Gaming, 'Gaming'),
    (Mobiles, 'Mobiles & Accessories'),
    (tools, 'Tools & Hardware'),
    (Home_Kitchen, 'Home & Kitchen'),
    (Home_Improvement, 'Home Improvement'),
    (Watches, 'Watches'),
]


def _relabel_category(label):
    """Map one raw cat_level_1 value to its main category.

    The value is converted to ``str``, stray ``"`` and ``]`` characters are replaced
    by spaces, and the result is split on whitespace. Words are scanned left to
    right; the first word found in any keyword list decides the label (rules are
    tried in ``_RELABEL_RULES`` order). If no word matches, ``label`` is returned
    unchanged.
    """
    # Both characters are stripped before splitting. The previous loop computed the
    # ']'-stripped string but then split the un-stripped one, so ']' was never removed.
    cleaned = str(label).replace('"', ' ').replace(']', ' ')
    for word in cleaned.split():
        for keywords, target in _RELABEL_RULES:
            if word in keywords:
                return target
    return label


# One vectorised pass instead of a per-cell .loc write for every row.
data['cat_level_1_v2'] = data['cat_level_1'].map(_relabel_category)

In [21]:
# Frequency table for the relabelled column, computed once and reused.
relabelled_counts = data['cat_level_1_v2'].value_counts()
print(relabelled_counts.to_dict())  # label -> number of rows
print(len(relabelled_counts))       # number of distinct labels after clubbing
int((data['cat_level_1_v2'] == 'None').sum())  # rows whose label is the literal string 'None'

{'Clothing': 6341, 'Jewellery': 3545, 'Footwear': 1263, 'Mobiles & Accessories': 1110, 'Automotive': 1029, 'Home Decor & Festive Needs': 941, 'Beauty and Personal Care': 721, 'Home Furnishing': 707, 'Kitchen & Dining': 653, 'Computers': 582, 'Watches': 532, 'Baby Care': 484, 'Tools & Hardware': 395, 'Toys & School Supplies': 330, 'Pens & Stationery': 322, 'Bags, Wallets & Belts': 278, 'Furniture': 182, 'Sports & Fitness': 168, 'Cameras & Accessories': 88, 'Home Improvement': 88, 'Sunglasses': 67, 'Health & Personal Care Appliances': 46, 'Gaming': 36, 'Pet Supplies': 30, 'Home & Kitchen': 25, 'Home Entertainment': 19, 'eBooks': 15}
27


0

In [22]:
# Give the relabelled column its final name. Reassigning instead of inplace=True
# follows the pandas recommendation and keeps the lineage of `data` explicit.
data = data.rename(columns={'cat_level_1_v2': 'primary_category'})

# Required Data for Implementation

In [23]:
# Only the identifier, the cleaned text and the final label are needed downstream.
load_data = data.loc[:, ['uniq_id', 'description', 'primary_category']]

# Saving the Cleaned and Processed Dataset

In [24]:
# Persist the cleaned dataset to Google Drive.
# The DataFrame index is written too (pandas default), which is why an 'Unnamed: 0'
# column appears when the file is read back below.
# NOTE(review): pass index=False if no consumer needs that column.
load_data.to_csv('/content/drive/MyDrive/MIDAS/2_cleaned_flipkart_com_ecommerce_sample.csv')

# Data preparation for CNN model

In [25]:
# Read back the cleaned file written above to prepare the inputs for the CNN model.
product = pd.read_csv("/content/drive/MyDrive/MIDAS/2_cleaned_flipkart_com_ecommerce_sample.csv")

In [26]:
# Display the reloaded frame (pandas truncates the output to the first and last rows).
product

Unnamed: 0.1,Unnamed: 0,uniq_id,description,primary_category
0,0,c2d766ca982eca8304150849735ffef9,key feature alisha solid woman cycling short c...,Clothing
1,1,7f7036a6d550aaa89d34c77bd39a5e48,fabhomedecor fabric double sofa bed finish col...,Furniture
2,2,f449ec65dcbc041b6ae5e6a32717d01b,key feature belly sandal wedge heel casuals be...,Footwear
3,3,0973b37acd0c664e3de26e97e5571454,key feature alisha solid woman cycling short c...,Clothing
4,4,bc940ea42ee6bef5ac7cea3fb5cfbee7,specification sicons purpose arnica dog shampo...,Pet Supplies
...,...,...,...,...
19992,19995,7179d2f6c4ad50a17d014ca1d2815156,buy walldesign small vinyl sticker online wall...,Baby Care
19993,19996,71ac419198359d37b8fe5e3fffdfee09,buy wallmantra large vinyl sticker sticker onl...,Baby Care
19994,19997,93e9d343837400ce0d7980874ece471c,buy elite collection medium acrylic sticker on...,Baby Care
19995,19998,669e79b8fa5d9ae020841c0c97d5e935,buy elite collection medium acrylic sticker on...,Baby Care


In [27]:
# Text side of the dataset: identifier plus cleaned description.
desc = product.loc[:, ['uniq_id', 'description']]

In [28]:
# Display the identifier/description frame.
desc

Unnamed: 0,uniq_id,description
0,c2d766ca982eca8304150849735ffef9,key feature alisha solid woman cycling short c...
1,7f7036a6d550aaa89d34c77bd39a5e48,fabhomedecor fabric double sofa bed finish col...
2,f449ec65dcbc041b6ae5e6a32717d01b,key feature belly sandal wedge heel casuals be...
3,0973b37acd0c664e3de26e97e5571454,key feature alisha solid woman cycling short c...
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,specification sicons purpose arnica dog shampo...
...,...,...
19992,7179d2f6c4ad50a17d014ca1d2815156,buy walldesign small vinyl sticker online wall...
19993,71ac419198359d37b8fe5e3fffdfee09,buy wallmantra large vinyl sticker sticker onl...
19994,93e9d343837400ce0d7980874ece471c,buy elite collection medium acrylic sticker on...
19995,669e79b8fa5d9ae020841c0c97d5e935,buy elite collection medium acrylic sticker on...


In [29]:
# Save the identifier/description pairs; the index is written as well (pandas default).
# NOTE(review): pass index=False if consumers of desc.csv do not need it.
desc.to_csv('/content/drive/MyDrive/MIDAS/desc.csv')

In [30]:
# Label side of the dataset: identifier plus primary category.
cat = product.loc[:, ['uniq_id', 'primary_category']]

In [31]:
# Display the identifier/primary-category frame.
cat

Unnamed: 0,uniq_id,primary_category
0,c2d766ca982eca8304150849735ffef9,Clothing
1,7f7036a6d550aaa89d34c77bd39a5e48,Furniture
2,f449ec65dcbc041b6ae5e6a32717d01b,Footwear
3,0973b37acd0c664e3de26e97e5571454,Clothing
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,Pet Supplies
...,...,...
19992,7179d2f6c4ad50a17d014ca1d2815156,Baby Care
19993,71ac419198359d37b8fe5e3fffdfee09,Baby Care
19994,93e9d343837400ce0d7980874ece471c,Baby Care
19995,669e79b8fa5d9ae020841c0c97d5e935,Baby Care


In [32]:
# Save the identifier/primary-category pairs; the index is written as well (pandas default).
# NOTE(review): pass index=False if consumers of cat.csv do not need it.
cat.to_csv('/content/drive/MyDrive/MIDAS/cat.csv')