<a href="https://colab.research.google.com/github/mariammaher550/text-classification-sepcnn/blob/main/text_classification_sepcnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Check the labels: 0 is alcohol.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from keras_preprocessing import text, sequence
from nltk.corpus import stopwords
import string

from sklearn.model_selection import train_test_split

# Vocabulary cap: passed to the tokenizer as num_words and used as the upper
# bound on the embedding input dimension in train_sequence_model.
top_k = 20000
# Upper limit on the length that token sequences are padded to in
# sequence_vectorization.
max_sequence_length = 500

In [None]:
def prepare_set(csv_path='items_cat_processed_data.csv',
                test_size=0.33,
                random_state=None):
    """Load the labelled titles from a CSV file and split them.

    The ``title`` column is used as the input text and the ``label`` column
    as the target. Every title is converted to ``str`` before splitting.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the file the
            notebook was written against.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.
        random_state: Value forwarded to ``train_test_split`` as
            ``random_state``; pass an int to make the split reproducible.
            The default (``None``) keeps the split random on each call.

    Returns:
        A pair ``((train_text, train_labels), (test_text, test_labels))``
        where the texts are lists of ``str`` and the labels are numpy arrays.
    """
    df = pd.read_csv(csv_path)

    # Coerce every title to str: cells that pandas parsed as something else
    # (for example missing values) would otherwise not be strings.
    data_text = [str(title) for title in df['title'].values.tolist()]  # input
    data_labels = np.array(df['label'].values.tolist())  # output

    train_text, test_text, train_labels, test_labels = train_test_split(
        data_text,
        data_labels,
        test_size=test_size,
        random_state=random_state)

    return ((train_text, train_labels), (test_text, test_labels))


def sequence_vectorization(train_text, validate_text):
    """Turn training and validation texts into padded index sequences.

    A tokenizer limited to ``top_k`` words is fitted on the training texts
    only and then applied to both sets. Both sets are passed through
    ``pad_sequences`` with the same ``maxlen``: the length of the longest
    training sequence, capped at ``max_sequence_length``.

    Args:
        train_text: Training texts; the tokenizer vocabulary is built from
            these.
        validate_text: Validation texts, encoded with the same tokenizer.

    Returns:
        A tuple ``(x_train, x_val, word_index)`` with the two padded
        sequence arrays and the tokenizer's word-to-index mapping.
    """
    vectorizer = text.Tokenizer(num_words=top_k)
    vectorizer.fit_on_texts(train_text)

    train_seqs = vectorizer.texts_to_sequences(train_text)
    val_seqs = vectorizer.texts_to_sequences(validate_text)

    # Pad to the longest training sequence, but never beyond the global cap.
    longest_train = max(len(seq) for seq in train_seqs)
    pad_to = min(longest_train, max_sequence_length)

    padded_train = sequence.pad_sequences(train_seqs, maxlen=pad_to)
    padded_val = sequence.pad_sequences(val_seqs, maxlen=pad_to)

    return padded_train, padded_val, vectorizer.word_index


def get_num_classes(labels):
    """Return the number of classes implied by integer labels.

    The labels are expected to cover every value in
    ``range(0, max(labels) + 1)`` and to contain at least two classes.

    Args:
        labels: Iterable of integer class labels (list or 1-D numpy array).

    Returns:
        ``max(labels) + 1``.

    Raises:
        ValueError: If ``labels`` is empty, if any value in
            ``range(0, max(labels) + 1)`` has no sample, or if there are
            fewer than two classes.
    """
    # Build the set once so each membership test below is O(1) instead of a
    # full scan of the label array per class.
    present = set(labels)
    if not present:
        raise ValueError('Cannot determine the number of classes: '
                         'no labels were provided.')

    num_classes = max(present) + 1
    missing_classes = [i for i in range(num_classes) if i not in present]
    if missing_classes:
        raise ValueError('Missing samples with label value(s) '
                         '{missing_classes}. Please make sure you have '
                         'at least one sample for every label value '
                         'in the range(0, {max_class})'.format(
                            missing_classes=missing_classes,
                            max_class=num_classes - 1))

    if num_classes <= 1:
        raise ValueError('Invalid number of labels: {num_classes}. '
                         'Please make sure there are at least two classes '
                         'of samples'.format(num_classes=num_classes))
    return num_classes

In [None]:
from keras import models
from keras.layers import Embedding, Dropout, SeparableConv1D, MaxPooling1D, GlobalAveragePooling1D, Dense


def sepcnn_model(blocks,
                 filters,
                 kernel_size,
                 embedding_dim,
                 dropout_rate,
                 pool_size,
                 input_shape,
                 num_classes,
                 num_features, ):
    """Assemble a separable-convolution text classifier.

    The network is an embedding layer, followed by ``blocks - 1`` repetitions
    of (dropout, two separable convolutions, max pooling), then two separable
    convolutions with twice as many filters, global average pooling, dropout
    and a dense output layer.

    Args:
        blocks: Number of convolution stages; ``blocks - 1`` pooled stages
            are added before the final stage.
        filters: Filter count of the convolutions in the pooled stages; the
            final stage uses ``filters * 2``.
        kernel_size: Kernel size of every separable convolution.
        embedding_dim: Output dimension of the embedding layer.
        dropout_rate: Rate used by every dropout layer.
        pool_size: Pool size of the max-pooling layers.
        input_shape: Shape of one input sample; its first entry is used as
            the embedding ``input_length``.
        num_classes: Number of output classes, used to pick the output
            layer's units and activation.
        num_features: Input dimension (vocabulary size) of the embedding.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    def separable_conv(n_filters):
        # Every convolution in the network shares these settings and differs
        # only in its number of filters.
        return SeparableConv1D(filters=n_filters,
                               kernel_size=kernel_size,
                               activation='relu',
                               bias_initializer='random_uniform',
                               depthwise_initializer='random_uniform',
                               padding='same')

    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)

    model = models.Sequential()
    model.add(Embedding(input_dim=num_features,
                        output_dim=embedding_dim,
                        input_length=input_shape[0]))

    # Pooled stages: dropout -> conv -> conv -> max pool.
    for _ in range(blocks - 1):
        model.add(Dropout(rate=dropout_rate))
        model.add(separable_conv(filters))
        model.add(separable_conv(filters))
        model.add(MaxPooling1D(pool_size=pool_size))

    # Final stage: two wider convolutions collapsed by global average pooling.
    wide_filters = filters * 2
    model.add(separable_conv(wide_filters))
    model.add(separable_conv(wide_filters))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(op_units, activation=op_activation))
    return model


def _get_last_layer_units_and_activation(num_classes):
    if num_classes == 2:
        activation = "sigmoid"
        units = 1
    else:
        activation = 'softmax'
        units = num_classes

    return units, activation


In [None]:

def train_sequence_model(data,
                         learning_rate=1e-3,
                         epochs=1000,
                         batch_size=128,
                         blocks=2,
                         filters=64,
                         dropout_rate=0.2,
                         embedding_dim=200,
                         kernel_size=3,
                         pool_size=3):
    """Train the sepCNN model on text data and save it to disk.

    The texts are vectorized with ``sequence_vectorization``, a model is
    built with ``sepcnn_model``, compiled with Adam, and trained with early
    stopping on the validation loss. The trained model is written to
    ``sepcnn_model.h5``.

    Args:
        data: ``((train_texts, train_labels), (val_texts, val_labels))``.
        learning_rate: Learning rate of the Adam optimizer.
        epochs: Maximum number of training epochs.
        batch_size: Number of samples per batch.
        blocks: Forwarded to ``sepcnn_model`` as ``blocks``.
        filters: Forwarded to ``sepcnn_model`` as ``filters``.
        dropout_rate: Forwarded to ``sepcnn_model`` as ``dropout_rate``.
        embedding_dim: Forwarded to ``sepcnn_model`` as ``embedding_dim``.
        kernel_size: Forwarded to ``sepcnn_model`` as ``kernel_size``.
        pool_size: Forwarded to ``sepcnn_model`` as ``pool_size``.

    Returns:
        A tuple ``(val_acc, val_loss)`` taken from the last trained epoch.

    Raises:
        ValueError: If the training labels are invalid (see
            ``get_num_classes``) or the validation labels contain values
            outside ``range(num_classes)``.
    """
    (train_texts, train_labels), (val_texts, val_labels) = data

    # Verify that validation labels are in the same range as training labels.
    num_classes = get_num_classes(train_labels)
    unexpected_labels = [v for v in val_labels if v not in range(num_classes)]
    if unexpected_labels:
        raise ValueError('Unexpected label values found in the validation set:'
                         ' {unexpected_labels}. Please make sure that the '
                         'labels in the validation set are in the same range '
                         'as training labels.'.format(
                             unexpected_labels=unexpected_labels))

    x_train, x_val, word_index = sequence_vectorization(
        train_texts, val_texts)

    # Index 0 is reserved for padding, hence the +1; the tokenizer itself
    # never emits indices at or above top_k.
    num_features = min(len(word_index) + 1, top_k)

    model = sepcnn_model(blocks=blocks,
                         filters=filters,
                         kernel_size=kernel_size,
                         embedding_dim=embedding_dim,
                         dropout_rate=dropout_rate,
                         pool_size=pool_size,
                         input_shape=x_train.shape[1:],
                         num_classes=num_classes,
                         num_features=num_features)

    # The loss has to match the output layer: sepcnn_model builds a single
    # sigmoid unit for two classes and a softmax over classes otherwise.
    if num_classes == 2:
        loss = 'binary_crossentropy'
    else:
        loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    # Stop once the validation loss has not improved for two epochs in a row.
    callbacks = [tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=2)]

    history = model.fit(
        x_train,
        train_labels,
        epochs=epochs,
        callbacks=callbacks,
        validation_data=(x_val, val_labels),
        verbose=2,  # Logs once per epoch.
        batch_size=batch_size)

    # Print results
    history = history.history
    final_val_acc = history['val_acc'][-1]
    final_val_loss = history['val_loss'][-1]
    print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=final_val_acc, loss=final_val_loss))

    # Save model.
    model.save('sepcnn_model.h5')

    return final_val_acc, final_val_loss



In [None]:
# Load and split the dataset, then train the model on it; training also
# prints the final validation metrics and saves the model to disk.
data = prepare_set()
train_sequence_model(data)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


133867 133867
Epoch 1/1000
701/701 - 56s - loss: 2.0588 - acc: 0.3182 - val_loss: 1.2754 - val_acc: 0.5694 - 56s/epoch - 80ms/step
Epoch 2/1000
701/701 - 54s - loss: 0.8335 - acc: 0.7614 - val_loss: 0.6552 - val_acc: 0.8231 - 54s/epoch - 77ms/step
Epoch 3/1000
701/701 - 56s - loss: 0.5237 - acc: 0.8609 - val_loss: 0.5264 - val_acc: 0.8641 - 56s/epoch - 79ms/step
Epoch 4/1000
701/701 - 54s - loss: 0.3806 - acc: 0.9012 - val_loss: 0.4815 - val_acc: 0.8785 - 54s/epoch - 76ms/step
Epoch 5/1000
701/701 - 53s - loss: 0.3118 - acc: 0.9191 - val_loss: 0.4845 - val_acc: 0.8797 - 53s/epoch - 75ms/step
Epoch 6/1000
701/701 - 53s - loss: 0.2685 - acc: 0.9299 - val_loss: 0.5050 - val_acc: 0.8805 - 53s/epoch - 75ms/step


ValueError: ignored