In [12]:
import pandas as pd
import numpy as np
from model.text_normalizer import normalize_corpus, stopword_list
from model import evaluation
from model.utils import decoder
from scripts.build_df import build_df
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scripts import tree_utils
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import dump, load

import tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, SpatialDropout1D, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
df = build_df(json_path='data/products.json', 
             threshold=100, 
             preprocessed_csv=None
            )
df = pd.read_csv('data/products_v1.csv')

def normalization(input):
    output = normalize_corpus(
        input,
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=stopword_list
    )       
    return output

df['name'] = normalization(df['name'].apply(str))
df['description'] = normalization(df['description'].apply(str))
df['name_and_description'] = [' '.join(i) for i in zip(df['name'], df['description'])]
normalized_data = df.to_csv('data/normalized_data.csv', index=False)

In [3]:
cat = build_df(json_path='data/products.json', 
             threshold=100, 
             preprocessed_csv='data/normalized_data.csv'
            ) 
y = cat['leaf']

In [4]:
name = df['name']
description = df['description']
name_desc = df[['name', 'description']]
name_and_description = df['name_and_description']
X = name

In [37]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20, 
    random_state=42,
    stratify = y
)

In [38]:
tfid_vectorizer = TfidfVectorizer(max_features=5000, 
                                  ngram_range=(1, 2),
                                  use_idf=False,
                                  min_df=1,
                                  norm='l2',
                                  smooth_idf=True
                                 ) 
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

In [39]:
X_train = X_train.toarray()
X_test = X_test.toarray()
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1],1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1],1)

In [40]:
y_train = to_categorical(np.asarray(y_train.factorize()[0]),y_train.nunique())
y_test = to_categorical(np.asarray(y_test.factorize()[0]),y_test.nunique())

In [42]:
tf_len = len(tfid_vectorizer.vocabulary_)

In [69]:
input = Input(batch_shape=(None, tf_len, 1))
drop20 = SpatialDropout1D(0.3)(input)
conv2 = Conv1D(filters=128, kernel_size=5, activation='relu')(drop20)
drop21 = Dropout(0.5)(conv2)
conv22 = Conv1D(filters=64, kernel_size=5, activation='relu')(drop21)
drop22 = Dropout(0.5)(conv22)
pool2 = MaxPooling1D(pool_size=2)(drop22)
flat2 = Flatten()(pool2)
dense = Dense(10,activation='elu')(flat2)
drop23 = Dropout(0.1)(dense)
dense2 = Dense(500,activation='elu')(drop23)
drop24 = Dropout(0.1)(dense2)
out = Dense(y.nunique(), activation='softmax')(drop24)
model = Model(input, out)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

Model: "model_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_21 (InputLayer)       [(None, 5000, 1)]         0         
                                                                 
 spatial_dropout1d_20 (Spati  (None, 5000, 1)          0         
 alDropout1D)                                                    
                                                                 
 conv1d_40 (Conv1D)          (None, 4996, 128)         768       
                                                                 
 dropout_68 (Dropout)        (None, 4996, 128)         0         
                                                                 
 conv1d_41 (Conv1D)          (None, 4992, 64)          41024     
                                                                 
 dropout_69 (Dropout)        (None, 4992, 64)          0         
                                                          

In [70]:
model.fit(X_train, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

KeyboardInterrupt: 