# Data preprocessing v1:
This notebook contains code for preprocessing the raw data and creating a cleaned dataframe for model building. 


In [2]:
# import libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle

Load the data 

In [3]:
gcp_url = 'https://storage.googleapis.com/price_alchemy/Data/data.csv'
df = pd.read_csv(gcp_url)

KeyboardInterrupt: 

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   train_id           1482535 non-null  int64  
 1   name               1482535 non-null  object 
 2   item_condition_id  1482535 non-null  int64  
 3   category_name      1476208 non-null  object 
 4   brand_name         849853 non-null   object 
 5   price              1482535 non-null  float64
 6   shipping           1482535 non-null  int64  
 7   item_description   1482529 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 90.5+ MB


In [5]:
df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


Basic Preprocessing

In [6]:
# Preprocessing steps
# 1. Remove rows with missing values in the 'price' column
m_df=df.dropna(subset=['price','category_name'])

# 2. Convert 'price' to numeric
m_df['price'] = pd.to_numeric(m_df['price'], errors='coerce')

# 3. Remove rows with price <= 0
m_df = m_df[m_df['price'] > 0]

# 4. Convert 'shipping' to categorical
m_df['shipping'] = m_df['shipping'].astype('category')

# 5. Convert 'item_condition_id' to categorical
m_df['item_condition_id'] = m_df['item_condition_id'].astype('category')

# 6. Drop created and updated at
try:
    m_df = m_df.drop(columns=['created_at', 'last_updated_at'])
except:
    pass

# 7. fill null text values
m_df['brand_name']=m_df['brand_name'].fillna('Not known')
m_df['name']=m_df['name'].fillna('No name')
m_df['item_description']=m_df['item_description'].fillna('No description yet')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m_df['price'] = pd.to_numeric(m_df['price'], errors='coerce')


Let's try the split category function

In [7]:
def split_cat(category_str):

    # split by '/'
    l=category_str.split('/')

    # return list
    l_ret= l[:2]
    
    # rest of the categories
    rest= l[2:]
    rest_cat= '/'.join(rest)

    # add rest of categories as one category
    l_ret.append(rest_cat)
           
    return l_ret

In [8]:
split_cat('Electronics/Computers & Tablets/iPad/Tablet/eBook Access')

['Electronics', 'Computers & Tablets', 'iPad/Tablet/eBook Access']

Apply this function on the `category` column

In [9]:
m_df['category_split']= m_df['category_name'].apply(lambda x: split_cat(x))

In [10]:
m_df['parent_category']=m_df['category_split'].apply(lambda x: x[0])
m_df['child_category']=m_df['category_split'].apply(lambda x: x[1])
m_df['grandchild_category']=m_df['category_split'].apply(lambda x: x[2])

In [11]:
m_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_split,parent_category,child_category,grandchild_category
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,Not known,10.0,1,No description yet,"[Men, Tops, T-shirts]",Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,"[Electronics, Computers & Tablets, Components ...",Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,"[Women, Tops & Blouses, Blouse]",Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,Not known,35.0,1,New with tags. Leather horses. Retail for [rm]...,"[Home, Home Décor, Home Décor Accents]",Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,Not known,44.0,0,Complete with certificate of authenticity,"[Women, Jewelry, Necklaces]",Women,Jewelry,Necklaces


Selecting final columns

In [12]:
m_df=m_df[['train_id', 'name','item_condition_id','brand_name',
'parent_category','child_category','grandchild_category',
'shipping','item_description','price']]

Preprocess the text columns

In [13]:
def process_name(data):

    corpus=[]
    
    for i in tqdm(data):
   
        dis=i.lower() # converting into lower case
        corpus.append(dis)

    return corpus

In [14]:
def preprocess(data):
    
    corpus=[]
    ps=PorterStemmer()
    
    for i in tqdm(data):
        
        try:
            dis=re.sub(r'https?:\/\/.*[\r\n]*', '',i)  #removing hyperlinks
            dis=re.sub(r'http?:\/\/.*[\r\n]*', '',dis) #removing hyperlinks
            #dis=re.sub(r'\@[a-zA-Z0-9]\w+'," ",dis) # removing account mentions actually decreases the accuracy of the model 
            dis=re.sub('[^a-zA-Z]'," ",dis) #removing punctuation marks and numbers
            dis=dis.lower() # converting into lower case
            dis=dis.split() # splitting 
            dis=[ps.stem(word) for word in dis if not word in set(stopwords.words("english"))]  #stemming the words to trim down the number of words
            dis=' '.join(dis)
            corpus.append(dis)
            
        except:
            corpus.append(f"<BAD> {i}")
    
    return corpus

In [15]:
# process name column
raw_text= m_df['name'].to_list()
data_final= process_name(raw_text)

m_df['name']= data_final

# process item_description column
# raw_text= m_df['item_description'].to_list()
# data_final= preprocess(raw_text)

# m_df['item_description']= data_final

100%|██████████| 1475347/1475347 [00:00<00:00, 4827262.90it/s]


In [16]:
m_df.drop(columns=['train_id'],inplace=True)

### Sklearn `ColumnTransformer`:

In [25]:
column_trans = ColumnTransformer([('categories', OrdinalEncoder(dtype='int'),['brand_name','parent_category', 'child_category', 'grandchild_category']),
('name', CountVectorizer(max_features=10000), 'name'),
('item_desc',TfidfVectorizer(max_features=10000),'item_description')
],
remainder='passthrough',
verbose_feature_names_out=True)

In [26]:
X=m_df.drop(columns=['price'])
y=m_df['price']

X=column_trans.fit_transform(X)

What is the shape of the final preprocessed sparse matrix?

In [27]:
X.shape

(1475347, 20006)