# Data preprocessing v1:
This notebook contains code for preprocessing the raw data and creating a cleaned dataframe for model building. 


In [1]:
# import libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle

Load the data 

In [2]:
gcp_url = 'https://storage.googleapis.com/price_alchemy/Data/data.csv'
df = pd.read_csv(gcp_url)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 972406 entries, 0 to 972405
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   train_id           972406 non-null  int64  
 1   name               972406 non-null  object 
 2   item_condition_id  972406 non-null  int64  
 3   category_name      967865 non-null  object 
 4   brand_name         534339 non-null  object 
 5   price              972406 non-null  float64
 6   shipping           972406 non-null  int64  
 7   item_description   972403 non-null  object 
 8   created_at         972406 non-null  object 
 9   last_updated_at    972406 non-null  object 
dtypes: float64(1), int64(3), object(6)
memory usage: 74.2+ MB


In [4]:
df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,created_at,last_updated_at
0,793697,Plaid Vest,2,Women/Coats & Jackets/Vest,Old Navy,11.0,1,Green and blue. Very thick and soft! Perfect f...,2022-01-01 00:00:00,2022-01-01 00:00:00
1,402094,Women's Sperrys,3,Women/Shoes/Loafers & Slip-Ons,Sperrys,21.0,0,EUC,2022-01-01 00:01:00,2022-01-01 00:01:00
2,522439,Grey sweater dress,1,Women/Dresses/Other,Fashion Nova,20.0,1,This is a heather grey sweater dress from fash...,2022-01-01 00:01:00,2022-01-01 00:01:00
3,214455,Tory Burch 'Perry' Leather Wallet,3,Women/Women's Accessories/Wallets,Tory Burch,91.0,0,Tory Burch 'Perry' Leather Zip Continental Wal...,2022-01-01 00:03:00,2022-01-01 00:03:00
4,902755,Fujifilm Rainbow Instax Film,1,Electronics/Cameras & Photography/Film Photogr...,Fuji,14.0,0,No description yet,2022-01-01 00:05:00,2022-01-01 00:05:00


Basic Preprocessing

In [5]:
df=df.iloc[:100,:]

In [6]:
# Preprocessing steps
# 1. Remove rows with missing values in the 'price' column
m_df=df.dropna(subset=['price','category_name'])

# 2. Convert 'price' to numeric
m_df['price'] = pd.to_numeric(m_df['price'], errors='coerce')

# 3. Remove rows with price <= 0
m_df = m_df[m_df['price'] > 0]

# 4. Convert 'shipping' to categorical
m_df['shipping'] = m_df['shipping'].astype('category')

# 5. Convert 'item_condition_id' to categorical
m_df['item_condition_id'] = m_df['item_condition_id'].astype('category')

# 6. Drop created and updated at
try:
    m_df = m_df.drop(columns=['created_at', 'last_updated_at'])
except:
    pass

# 7. fill null text values
m_df['brand_name']=m_df['brand_name'].fillna('Not known')
m_df['name']=m_df['name'].fillna('No name')
m_df['item_description']=m_df['item_description'].fillna('No description yet')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m_df['price'] = pd.to_numeric(m_df['price'], errors='coerce')


Let's try the split category function

In [7]:
def split_cat(category_str):

    # split by '/'
    l=category_str.split('/')

    # return list
    l_ret= l[:2]
    
    # rest of the categories
    rest= l[2:]
    rest_cat= '/'.join(rest)

    # add rest of categories as one category
    l_ret.append(rest_cat)
           
    return l_ret

In [8]:
split_cat('Electronics/Computers & Tablets/iPad/Tablet/eBook Access')

['Electronics', 'Computers & Tablets', 'iPad/Tablet/eBook Access']

Apply this function on the `category` column

In [9]:
m_df['category_split']= m_df['category_name'].apply(lambda x: split_cat(x))

In [10]:
m_df['parent_category']=m_df['category_split'].apply(lambda x: x[0])
m_df['child_category']=m_df['category_split'].apply(lambda x: x[1])
m_df['grandchild_category']=m_df['category_split'].apply(lambda x: x[2])

In [11]:
m_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_split,parent_category,child_category,grandchild_category
0,793697,Plaid Vest,2,Women/Coats & Jackets/Vest,Old Navy,11.0,1,Green and blue. Very thick and soft! Perfect f...,"[Women, Coats & Jackets, Vest]",Women,Coats & Jackets,Vest
1,402094,Women's Sperrys,3,Women/Shoes/Loafers & Slip-Ons,Sperrys,21.0,0,EUC,"[Women, Shoes, Loafers & Slip-Ons]",Women,Shoes,Loafers & Slip-Ons
2,522439,Grey sweater dress,1,Women/Dresses/Other,Fashion Nova,20.0,1,This is a heather grey sweater dress from fash...,"[Women, Dresses, Other]",Women,Dresses,Other
3,214455,Tory Burch 'Perry' Leather Wallet,3,Women/Women's Accessories/Wallets,Tory Burch,91.0,0,Tory Burch 'Perry' Leather Zip Continental Wal...,"[Women, Women's Accessories, Wallets]",Women,Women's Accessories,Wallets
4,902755,Fujifilm Rainbow Instax Film,1,Electronics/Cameras & Photography/Film Photogr...,Fuji,14.0,0,No description yet,"[Electronics, Cameras & Photography, Film Phot...",Electronics,Cameras & Photography,Film Photography


Selecting final columns

In [12]:
m_df=m_df[['train_id', 'name','item_condition_id','brand_name',
'parent_category','child_category','grandchild_category',
'shipping','item_description','price']]

Preprocess the text columns

In [13]:
def process_name(data):

    corpus=[]
    
    for i in tqdm(data):
   
        dis=i.lower() # converting into lower case
        corpus.append(dis)

    return corpus

In [14]:
def preprocess(data):
    
    corpus=[]
    ps=PorterStemmer()
    
    for i in tqdm(data):
        
        try:
            dis=re.sub(r'https?:\/\/.*[\r\n]*', '',i)  #removing hyperlinks
            dis=re.sub(r'http?:\/\/.*[\r\n]*', '',dis) #removing hyperlinks
            #dis=re.sub(r'\@[a-zA-Z0-9]\w+'," ",dis) # removing account mentions actually decreases the accuracy of the model 
            dis=re.sub('[^a-zA-Z]'," ",dis) #removing punctuation marks and numbers
            dis=dis.lower() # converting into lower case
            dis=dis.split() # splitting 
            dis=[ps.stem(word) for word in dis if not word in set(stopwords.words("english"))]  #stemming the words to trim down the number of words
            dis=' '.join(dis)
            corpus.append(dis)
            
        except:
            corpus.append(f"<BAD> {i}")
    
    return corpus

In [15]:
# process name column
raw_text= m_df['name'].to_list()
data_final= process_name(raw_text)

m_df['name']= data_final

# process item_description column
# raw_text= m_df['item_description'].to_list()
# data_final= preprocess(raw_text)

# m_df['item_description']= data_final

100%|██████████| 99/99 [00:00<00:00, 328769.67it/s]


In [16]:
m_df.drop(columns=['train_id'],inplace=True)

### Sklearn `ColumnTransformer`:

In [17]:
column_trans = ColumnTransformer([('categories', OrdinalEncoder(dtype='int'),['brand_name','parent_category', 'child_category', 'grandchild_category']),
('name', CountVectorizer(max_features=10000), 'name'),
('item_desc',TfidfVectorizer(max_features=10000),'item_description')
],
remainder='passthrough',
verbose_feature_names_out=True)

In [18]:
X=m_df.drop(columns=['price'])
y=m_df['price']

X=column_trans.fit_transform(X)

What is the shape of the final preprocessed sparse matrix?

In [19]:
X.shape

(99, 1420)