### Importing Libraries

In [51]:
# Importing Libraries

import numpy as np
import pandas as pd


import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#------ Additional Lib
import neattext as nt
import neattext.functions as nfx

from sklearn.feature_extraction.text import TfidfVectorizer   # Turning textual data into numeric for computation
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder               # For encoding categorical target attr
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn import svm   # Baseline

# ------- Validation metrics
from sklearn.metrics import accuracy_score    
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import hamming_loss                  
from sklearn.metrics import classification_report

### Data Preprocessing

In [53]:
nltk.download('stopwords')

def get_train_dataset():
    return pd.read_csv('data_train.csv', header=None, names=["country", "sku_id", "title", "category_lvl1","category_lvl2",
                                                       "category_lvl3", "description", "price", "type"])  

def preprocess(content):
    ps = PorterStemmer()
    CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    # Using str(content) because there are some float values in combined
    stemmed_content = re.sub('[^a-zA-Z]',' ',str(content))   # Dropping all encodings, numbers etc
    stemmed_content = re.sub(CLEANR, '',stemmed_content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [ps.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content


def feature_select(df):
    df.drop(['country','sku_id','price','type'],inplace=True,axis=1)
    df['combined'] = df['title']+" "+df['description']
    df.drop(['title', 'description'],inplace=True,axis=1)  #this line is not tested
    df['combined']
    return df
    
    

train_df = get_train_dataset()
train_df = feature_select(train_df)
train_df['combined'] = train_df['combined'].apply(preprocess)
train_df



[nltk_data] Downloading package stopwords to C:\Users\Computer
[nltk_data]     Point\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,category_lvl1,category_lvl2,category_lvl3,description,combined
0,Adana Gallery Suri Square Hijab – Light Pink,Fashion,Women,Muslim Wear,<ul><li>Material : Non sheer shimmer chiffon</...,adana galleri suri squar hijab light pink ul l...
1,Cuba Heartbreaker Eau De Parfum Spray 100ml/3.3oz,Health & Beauty,Bath & Body,Hand & Foot Care,Formulated with oil-free hydrating botanicals/...,cuba heartbreak eau de parfum spray ml oz form...
2,Andoer 150cm Cellphone Smartphone Mini Dual-He...,"TV, Audio / Video, Gaming & Wearables",Audio,Live Sound & Stage,<ul> <li>150cm mini microphone compatible for ...,andoer cm cellphon smartphon mini dual head om...
3,ANMYNA Complaint Silky Set 柔顺洗发配套 (Shampoo 520...,Health & Beauty,Hair Care,Shampoos & Conditioners,<ul> <li>ANMYNA Complaint Silky Set (Shampoo 5...,anmyna complaint silki set shampoo ml conditio...
4,Argital Argiltubo Green Clay For Face and Body...,Health & Beauty,Men's Care,Body and Skin Care,<ul> <li>100% Authentic</li> <li>Rrefresh and ...,argit argiltubo green clay face bodi ml ul li ...
...,...,...,...,...,...,...
36278,SADES K10 LED Backlit Wired USB Mechanical Gam...,Computers & Laptops,Computer Accessories,Keyboards,<ul> <li>No driver needed.Blue Switches is the...,sade k led backlit wire usb mechan game keyboa...
36279,SONA 20L Electric Oven SEO 2220,Home Appliances,Large Appliances,Microwaves & Ovens,<ul> <li>&nbsp;2 Years Warranty<br></li> <li>T...,sona l electr oven seo ul li nbsp year warrant...
36280,OP1001 Portable Wireless Bluetooth 2.1 Speaker...,Computers & Laptops,Computer Accessories,Speakers,"<ul> <li>With colorful lights on the button, l...",op portabl wireless bluetooth speaker hand fre...
36281,Woot-Woot TicTacToe Pillow Case (White),Home & Living,Bedding,Pillows & Bolsters,<ul> <li>100% Cotton</li> <li>Safe for Sensiti...,woot woot tictacto pillow case white ul li cot...


In [58]:

def print_missing_stats(df):
    missing_val = df.isnull().sum()
    print(missing_val)
    total_cells = np.product(df.shape)
    missing_percent = (missing_val.sum()/total_cells) *100
    print(f'\nThe missing data percent is: {missing_percent}')

    

def encode_utility(data):
    encoder = LabelEncoder()
    '''function to encode non-null data and replace it in the original data'''
    #retains only non-null values
    nonulls = np.array(data.dropna())
    #reshapes the data for encoding
    impute_reshape = nonulls.reshape(-1,1)
    #encode date
    impute_ordinal = encoder.fit_transform(impute_reshape)
#     impute_ordinal = pd.get_dummies(impute_reshape).values # one hot encoding   #Not working
    #Assign back encoded values to non-null values
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

def encode(target):
    for columns in target:
        encode_utility(train_df[columns])

    
def impute():
    imputer = KNNImputer(n_neighbors = 5)
    df_imputed = imputer.fit_transform(train_df[['category_lvl1', 'category_lvl2', 'category_lvl3']])
    return df_imputed

def clean_csv(df):
    df = pd.DataFrame(df, columns = ['category_lvl1','category_lvl2','category_lvl3'])
    df ['Title_desc'] = train_df['combined']
    df.to_csv('train_clean.csv')
    return df



print_missing_stats(train_df)
encode(['category_lvl1', 'category_lvl2', 'category_lvl3'])
df_imputed = impute()
df_imputed = clean_csv(df_imputed)
print_missing_stats(df_imputed)





title               0
category_lvl1       0
category_lvl2       0
category_lvl3    2135
description        33
combined            0
dtype: int64

The missing data percent is: 0.9958750195224577


  y = column_or_1d(y, warn=True)


category_lvl1    0
category_lvl2    0
category_lvl3    0
Title_desc       0
dtype: int64

The missing data percent is: 0.0


In [62]:
def extract_features(df):
    # Extract features prior to encoding to retain categorical data
    X = df['Title_desc']
    Y1 = df['category_lvl1']
    Y2 = df['category_lvl2']
    Y3 = df['category_lvl3']
    return X,Y1,Y2,Y3


def tf_idf(X):
    vectorizer = TfidfVectorizer()
    vectorizer.fit(X)
    X = vectorizer.transform(X)
    return X


def decode():    
    #np.argmax for decoding final predicted result
    labels_c1=sorted(df_imputed['category_lvl1'].unique())
    labels_c2=sorted(df_imputed['category_lvl2'].unique())
    labels_c3=sorted(df_imputed['category_lvl3'].unique())
    return labels_c1, labels_c2, labels_c3


X, Y1, Y2, Y3 = extract_features(df_imputed)
X = tf_idf(X)
labels_c1, labels_c2, labels_c3 = decode()
