**Text classification using neural networks**

In [1]:
!pip install cupy

Collecting cupy
[?25l  Downloading https://files.pythonhosted.org/packages/e8/6c/8e9be391b761b0f92fa83ea654e5f6b828c1b6c52990c37231bf2dd33c58/cupy-7.5.0.tar.gz (3.7MB)
[K     |████████████████████████████████| 3.7MB 2.7MB/s 
Building wheels for collected packages: cupy
  Building wheel for cupy (setup.py) ... [?25l[?25hdone
  Created wheel for cupy: filename=cupy-7.5.0-cp36-cp36m-linux_x86_64.whl size=30641754 sha256=925a8b7a90352ff10152189d6b5f903af8423ebe5cba3db393d50880a29e0c1b
  Stored in directory: /root/.cache/pip/wheels/1a/73/d8/5525c15eecc1ad1b2d695899b7119c32f64b4da391efe98ef8
Successfully built cupy
Installing collected packages: cupy
Successfully installed cupy-7.5.0


In [2]:
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
nltk.download('wordnet')
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import cupy as cp
from tqdm.auto import tqdm
import zipfile
# Unpack the review corpus into the current working directory. The context
# manager closes the archive even when extraction fails part-way through.
with zipfile.ZipFile('movie_review.zip') as review_zip:
    review_zip.extractall('')

# Load the labelled reviews from the extracted `movie_review` folder.
review_data = load_files(r"movie_review")
X, y = review_data.data, review_data.target

# Normalise every review into a lower-cased, lemmatised, space-separated string.
documents = []
stemmer = WordNetLemmatizer()  # performs lemmatisation despite the variable name
for raw_review in X:
    # Remove all the special characters.
    # str() is applied to the raw item; presumably it is a bytes object, whose
    # "b'...'" repr explains the prefix clean-up further down — TODO confirm.
    document = re.sub(r'\W', ' ', str(raw_review))
    # Remove all single characters that are surrounded by whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove a single character at the start of the text. The caret has to be
    # an unescaped anchor: r'\^' matches a literal '^', and every such
    # character was already replaced by the first substitution.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document)
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    # Converting to Lowercase
    document = document.lower()
    # Lemmatization, word by word
    document = ' '.join(stemmer.lemmatize(word) for word in document.split())
    documents.append(document)

# Bag-of-words counts restricted to the 1500 most frequent terms that occur in
# at least 5 documents and in at most 70% of them, English stop words removed.
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents).toarray()

# Re-weight the raw counts with TF-IDF.
# NOTE(review): the transformer is fitted on all documents before the split,
# so test-set statistics influence the training features.
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

# Hold out the last 30% of the rows for testing (no additional shuffling here).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

class Sequence:
    """Feed-forward stack of layers trained on the whole data set per epoch.

    Every element of ``layers`` must offer ``forward(data)``, ``predict(data)``,
    ``backward(a, sigma, rate)`` and a ``Layer`` weight array. Samples are
    expected as columns: ``shape[1]`` of the activations is used as the
    number of samples.
    """

    def __init__(self, layers : list, rate, loss = 'binary'):
        """Keep the layers, the learning rate and the loss chosen by ``LossSelector``."""
        self.Layer = layers
        self.Loss = LossSelector(loss)
        self.Rate = rate

    def predict(self, data):
        """Run ``data`` through every layer's ``predict`` and return the result."""
        h = data
        for layer in self.Layer:
            h = layer.predict(h)
        return h

    def forward(self, data):
        """Run ``data`` through every layer's ``forward`` (the training pass)."""
        h = data
        for layer in self.Layer:
            h = layer.forward(h)
        return h

    def backpropagate(self, a, label):
        """Update all layers from the network output ``a`` and return the L2 penalty.

        The penalty is the sum of squared weights of all layers divided by
        twice the number of samples, multiplied by the module-level ``lamda``.
        """
        sigma = self.Loss.diff(label, a)
        penalty = 0  # the original local name shadowed the builtin ``sum``
        for layer in reversed(self.Layer):
            # Each layer applies its own update and hands back the error and
            # activation that belong to the layer below it.
            sigma, a = layer.backward(a, sigma, self.Rate)
            penalty += (layer.Layer * layer.Layer).sum()
        return penalty / (2 * a.shape[1]) * lamda

    def train(self, data, label, epoch, *test):
        """Train for ``epoch`` iterations and collect per-epoch metrics.

        ``test`` may hold an optional ``(test_data, test_label)`` pair that is
        evaluated on every epoch before the weights are updated.

        Returns ``(train_e, train_acc)`` or, when test data is supplied,
        ``(train_e, train_acc, test_e, test_acc)``. All entries are plain
        Python floats so they can be plotted directly; 0-d GPU scalars are
        not converted implicitly by plotting libraries.
        """
        has_test = len(test) != 0
        train_e = []
        train_acc = []
        test_e = []
        test_acc = []
        for i in tqdm(range(epoch)):
            h = self.forward(data)
            if has_test:
                t = self.predict(test[0])
            r = self.backpropagate(h, label)
            train_e.append(float(self.Loss.calc(label, h) + r))
            train_acc.append(float(accuracy(h > 0.5, label)))
            if has_test:
                # The same penalty value ``r`` is added to the test loss.
                test_e.append(float(self.Loss.calc(test[1], t) + r))
                test_acc.append(float(accuracy(t > 0.5, test[1])))

        if not has_test:
            return train_e, train_acc
        return train_e, train_acc, test_e, test_acc

class Linear:
    """Fully connected layer computing ``activation(Layer @ input)``.

    ``size`` is ``(outputs, inputs)``. With ``bias`` enabled the weight
    matrix gets one extra column and a row of ones is appended to the input.
    """

    def __init__(self, size, schedule = 'sgd', active = 'sigmoid', bias = False):
        """Draw random weights and pick the activation and update rule by name."""
        self.size = size
        # Normally distributed weights with scale 4*sqrt(2 / (outputs + inputs)).
        self.Layer = cp.random.normal(size = (size[0], size[1] + bias), scale = 4*cp.sqrt(2/(size[0] + size[1])))
        # Placeholder; replaced by the (bias-augmented) input on every forward pass.
        self.a = cp.zeros(size[0])
        self.bias = bias
        self.activation = ActivationSelector(active)
        self.scheduler = SchedulerSelector(schedule)

    def forward(self, data):
        """Training pass: remember the augmented input for ``backward`` and return the output."""
        self.a = generator(data, self.bias)
        return self.activation.calc(self.Layer@self.a)

    def predict(self, data):
        """Inference pass: same computation as ``forward`` without storing the input."""
        return self.activation.calc(self.Layer@generator(data, self.bias))

    def backward(self, a, sigma, rate):
        """Apply one weight update and return ``(sigma_below, a_below)``.

        ``a`` is this layer's output from the last ``forward`` call and
        ``sigma`` the loss derivative with respect to that output. The
        returned pair is the error and activation for the layer below,
        with the bias row (if any) removed.
        """
        sigma = sigma*self.activation.diff(a)
        n_inputs = len(self.a) - self.bias
        # Propagate the error through the weights that produced ``a``. This has
        # to happen before the update, otherwise the layer below receives a
        # gradient computed from weights the forward pass never used.
        sigma_below = (self.Layer.transpose()@sigma)[:n_inputs]
        # Mean gradient over the samples plus the L2 term scaled by the
        # module-level ``lamda``.
        dx = sigma@self.a.transpose()/self.a.shape[1] + lamda * self.Layer / a.shape[1]
        self.Layer -= rate*self.scheduler.grad(dx)
        return (sigma_below, self.a[:n_inputs])

def accuracy(pred, label):
    """Return the fraction of positions where ``pred`` equals ``label``."""
    matches = pred == label
    return matches.mean()

def generator(data, bias):
    """Return ``data`` unchanged, or with a row of ones stacked below it when ``bias`` is truthy."""
    if bias:
        ones_row = cp.ones((1, data.shape[1]))
        return cp.vstack((data, ones_row))
    return data


class Sigmoid:
    """Logistic activation together with its derivative."""

    def calc(self, x):
        """Return ``1 / (1 + exp(-x))`` element-wise."""
        denominator = np.exp(-x) + 1
        return cp.reciprocal(denominator)

    def diff(self, a):
        """Derivative expressed through the activation output ``a``."""
        return (1 - a)*a

class ReLU:
    """Leaky rectifier: ``x`` for positive inputs, ``0.1 * x`` otherwise.

    ``Linear.backward`` evaluates ``diff`` on the layer *output*. The output
    therefore has to keep the sign of the input; a negative-side output of
    ``-0.1 * x`` would always be positive and hide which branch was active.
    """

    def __init__(self):
        pass

    def calc(self, x):
        """Return the element-wise activation ``max(0.1 * x, x)``."""
        return np.maximum(0.1*x, x)

    def diff(self, x):
        """Return the slope: 1 where the value is positive, 0.1 elsewhere.

        Because the activation preserves the sign, this gives the same result
        whether it is called with the pre-activation or with the output.
        """
        return (x > 0) + (x <= 0)*0.1

def ActivationSelector(active):
    """Return a new activation object for the name ``'sigmoid'`` or ``'relu'``.

    Raises:
        ValueError: for any other name. Returning ``None`` here would only
            surface later as an unrelated attribute error inside a layer.
    """
    if active == 'sigmoid': return Sigmoid()
    if active == 'relu': return ReLU()
    raise ValueError('ActivationSelector : not found %s' % active)

class BinaryLoss:
    """Binary cross-entropy and its derivative with respect to the prediction."""

    # Predictions are kept inside [eps, 1 - eps] so that a saturated output
    # (exactly 0 or 1) cannot produce log(0) or a division by zero.
    eps = 1e-7

    def __init__(self):
        pass

    def calc(self, label, pred):
        """Return the cross-entropy summed over axis 0 and averaged over the rest."""
        pred = pred.clip(BinaryLoss.eps, 1 - BinaryLoss.eps)
        return -((label*cp.log(pred) + (1 - label)*cp.log(1 - pred)).sum(axis=0)).mean()

    def diff(self, label, pred):
        """Return the element-wise derivative of the cross-entropy w.r.t. ``pred``."""
        pred = pred.clip(BinaryLoss.eps, 1 - BinaryLoss.eps)
        return -label / pred + (1 - label) / (1 - pred)

def LossSelector(loss):
    """Return a new loss object for the name ``'binary'``.

    Raises:
        ValueError: for any other name, instead of silently returning ``None``.
    """
    if loss == 'binary': return BinaryLoss()
    raise ValueError('LossSelector : not found %s' % loss)

class SGD:
    """Plain gradient descent: the raw gradient is the update direction."""

    def grad(self, dx):
        """Return ``dx`` untouched."""
        return dx

class RMSprop:
    """Scales each gradient by a running root-mean-square of past gradients."""

    beta = 0.9   # weight given to the previous running average
    eps = 1e-8   # added to the denominator to avoid division by zero

    def __init__(self):
        # Running average of squared gradients; created on the first call.
        self.cache = None

    def grad(self, dx):
        """Fold ``dx`` into the running average and return the rescaled gradient."""
        if self.cache is None:
            # The first gradient seeds the average directly.
            self.cache = dx*dx
        else:
            decayed = RMSprop.beta * self.cache
            fresh = (1 - RMSprop.beta) * dx*dx
            self.cache = decayed + fresh
        return dx/(cp.sqrt(self.cache) + RMSprop.eps)

def SchedulerSelector(schedule):
    """Return a new update-rule object for the name ``'sgd'`` or ``'rmsprop'``.

    Raises:
        ValueError: for any other name, instead of silently returning ``None``.
    """
    if schedule == 'sgd': return SGD()
    if schedule == 'rmsprop': return RMSprop()
    raise ValueError('SchedulerSelector : not found %s' % schedule)

# L2 regularisation strength read as a global by Sequence.backpropagate and
# Linear.backward. It has to exist before training starts.
# NOTE(review): 0.0 switches the penalty off; choose the intended value.
lamda = 0.0

# Two sigmoid layers (1500 features -> 300 hidden units -> 1 output), both with
# a bias term and RMSprop updates, trained with learning rate 0.001.
Model = Sequence([Linear((300, 1500), active = 'sigmoid', bias = True, schedule = 'rmsprop'),
                  Linear((1, 300),   active = 'sigmoid', bias = True, schedule = 'rmsprop')],
                 0.001, 'binary')

# The layers work on CuPy arrays (cp.vstack / cp.log reject NumPy input), so the
# NumPy features and labels are copied to the device first. The network expects
# one sample per column, hence the transposes.
train_e, train_acc, test_e, test_acc= Model.train(cp.asarray(X_train.T), cp.asarray(y_train), 10000,
                                                  cp.asarray(X_test.T), cp.asarray(y_test))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


CUDARuntimeError: ignored

In [5]:
import matplotlib.pyplot as plt

# Loss per epoch for the training and the test split.
fig, ax = plt.subplots(figsize=(10, 10))
# float() turns 0-d array scalars (including GPU ones, which matplotlib cannot
# convert by itself) into plain numbers; plain floats pass through unchanged.
ax.plot(range(len(test_e)), [float(e) for e in test_e], c = 'r', label = 'test')
ax.plot(range(len(train_e)), [float(e) for e in train_e], c = 'b', label = 'train')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.set_title('Training and test loss per epoch')
ax.legend()
plt.show()

NameError: ignored

<Figure size 720x720 with 0 Axes>