### Importing Libraries

In [1]:
# Importing Libraries

import numpy as np
import pandas as pd


import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
#------ Additional Lib
import neattext as nt
import neattext.functions as nfx

from sklearn.feature_extraction.text import TfidfVectorizer   # Turning textual data into numeric for computation
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder               # For encoding categorical target attr
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn import svm   # Baseline

# ------- Validation metrics
from sklearn.metrics import accuracy_score    
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import hamming_loss                  
from sklearn.metrics import classification_report

### Data Preprocessing

In [2]:
nltk.download('stopwords')

def get_train_dataset():
    return pd.read_csv('data_train.csv', header=None, names=["country", "sku_id", "title", "category_lvl1","category_lvl2",
                                                       "category_lvl3", "description", "price", "type"])  

def preprocess(content):
    ps = PorterStemmer()
    CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    # Using str(content) because there are some float values in combined
    stemmed_content = re.sub('[^a-zA-Z]',' ',str(content))   # Dropping all encodings, numbers etc
    stemmed_content = re.sub(CLEANR, '',stemmed_content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [ps.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content


def feature_select(df):
    df.drop(['country','sku_id','price','type'],inplace=True,axis=1)
    df['combined'] = df['title']+" "+df['description']
    df.drop(['title', 'description'],inplace=True,axis=1)  #this line is not tested
    df['combined']
    return df
    
    

train_df = get_train_dataset()
train_df = feature_select(train_df)
train_df['combined'] = train_df['combined'].apply(preprocess)
train_df



[nltk_data] Downloading package stopwords to C:\Users\Computer
[nltk_data]     Point\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,category_lvl1,category_lvl2,category_lvl3,combined
0,Fashion,Women,Muslim Wear,adana galleri suri squar hijab light pink ul l...
1,Health & Beauty,Bath & Body,Hand & Foot Care,cuba heartbreak eau de parfum spray ml oz form...
2,"TV, Audio / Video, Gaming & Wearables",Audio,Live Sound & Stage,andoer cm cellphon smartphon mini dual head om...
3,Health & Beauty,Hair Care,Shampoos & Conditioners,anmyna complaint silki set shampoo ml conditio...
4,Health & Beauty,Men's Care,Body and Skin Care,argit argiltubo green clay face bodi ml ul li ...
...,...,...,...,...
36278,Computers & Laptops,Computer Accessories,Keyboards,sade k led backlit wire usb mechan game keyboa...
36279,Home Appliances,Large Appliances,Microwaves & Ovens,sona l electr oven seo ul li nbsp year warrant...
36280,Computers & Laptops,Computer Accessories,Speakers,op portabl wireless bluetooth speaker hand fre...
36281,Home & Living,Bedding,Pillows & Bolsters,woot woot tictacto pillow case white ul li cot...


In [10]:

def print_missing_stats(df):
    missing_val = df.isnull().sum()
    print(missing_val)
    total_cells = np.product(df.shape)
    missing_percent = (missing_val.sum()/total_cells) *100
    print(f'\nThe missing data percent is: {missing_percent}')

def preserve_label():    
    #np.argmax for decoding final predicted result
#     train_df['category_lvl3'].astype(str)
#     train_df['category_lvl3'] = train_df['category_lvl3'].values
    labels_c1=train_df['category_lvl1'].unique()
    labels_c2=train_df['category_lvl2'].unique()
    labels_c3=train_df['category_lvl3'].unique()
    return labels_c1, labels_c2, labels_c3
    

def encode_utility(data):
    encoder = LabelEncoder()
    '''function to encode non-null data and replace it in the original data'''
    #retains only non-null values
    nonulls = np.array(data.dropna())
    #reshapes the data for encoding
    impute_reshape = nonulls.reshape(-1,1)
    #encode date
    impute_ordinal = encoder.fit_transform(impute_reshape)
#     impute_ordinal = pd.get_dummies(impute_reshape).values # one hot encoding   #Not working
    #Assign back encoded values to non-null values
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

def encode(target):
    for columns in target:
        encode_utility(train_df[columns])

    
def impute():
    imputer = KNNImputer(n_neighbors = 5)
    df_imputed = np.round(imputer.fit_transform(train_df[['category_lvl1', 'category_lvl2', 'category_lvl3']]))
    return df_imputed

def clean_csv(df):
    df = pd.DataFrame(df, columns = ['category_lvl1','category_lvl2','category_lvl3'])
    df ['Title_desc'] = train_df['combined']
    df.to_csv('train_clean.csv')
    return df



print_missing_stats(train_df)
label_c1, label_c2, label_c3 = preserve_label()
print(label_c1)
print(label_c2)
print(label_c3)
encode(['category_lvl1', 'category_lvl2', 'category_lvl3'])
df_imputed = impute()
df_imputed = clean_csv(df_imputed)
print_missing_stats(df_imputed)





category_lvl1       0
category_lvl2       0
category_lvl3    2135
combined            0
dtype: int64

The missing data percent is: 1.4710746079431138
[2 3 7 1 0 4 8 6 5]
[56 3 1 23 36 29 7 28 27 55 33 35 54 0 16 10 24 44 21 5 19 52 8 43 47 4 46
 11 38 25 9 15 32 14 12 49 22 45 40 50 34 20 53 26 30 31 2 13 17 48 18 6
 41 39 51 42 37]
[113 67 95 147 23 165 91 162 37 179 2 46 149 106 125 26 180 49 170 111 nan
 9 59 17 143 27 24 103 31 172 99 55 78 14 58 15 50 150 64 10 124 145 152
 134 115 93 144 43 183 73 1 135 126 177 159 41 47 153 79 97 70 118 92 139
 39 87 7 171 94 48 131 52 11 105 120 108 83 63 71 69 128 122 133 65 34 182
 62 82 158 100 38 130 61 167 56 32 60 19 42 53 89 174 116 129 146 12 40 29
 0 148 28 25 21 140 160 157 132 68 18 30 85 112 161 175 96 109 90 76 164
 119 166 3 74 141 156 80 154 136 20 137 107 142 44 86 51 75 163 181 123 54
 5 77 104 121 35 45 173 66 36 4 155 117 138 114 13 168 22 88 16 102 57 84
 72 151 110 169 8 6 178 33 176 127 98 101 81]


  y = column_or_1d(y, warn=True)


category_lvl1    0
category_lvl2    0
category_lvl3    0
Title_desc       0
dtype: int64

The missing data percent is: 0.0


In [7]:
def extract_features(df):
    # Extract features prior to encoding to retain categorical data
    X = df['Title_desc']
    Y1 = df['category_lvl1']
    Y2 = df['category_lvl2']
    Y3 = df['category_lvl3']
    return X,Y1,Y2,Y3


def tf_idf(X):
    vectorizer = TfidfVectorizer()
    vectorizer.fit(X)
    X = vectorizer.transform(X)
    return X




X, Y1, Y2, Y3 = extract_features(df_imputed)
X = tf_idf(X)



NameError: name 'df_imputed' is not defined