# Preprocess data

In [1]:
# Get data from file
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
nltk.download('wordnet')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stop_words.update([',','\'','.','\"','...','`','#','$','%','&','*',';',':','/b','u','gt','lt','//','\'s','\'\'','-','reuter'])
classes = 4
training_filename = 'ag_news_csv/train.csv'
testing_filename = 'ag_news_csv/test.csv'
col_names = ['class','title','description']
training = pd.read_csv(training_filename, names=col_names)
testing = pd.read_csv(testing_filename, names=col_names)

tokenizer=nltk.tokenize.TreebankWordTokenizer()
stemmer = nltk.stem.PorterStemmer()
lemmer = nltk.stem.WordNetLemmatizer()

rgx_list = ['(\w+[A-Z]+.*\-+\s)','(\({1}\w+[\.*\s*\w*]*\){1})']

def token_stem_lem(text):
    # remove location of article before first - , and any source denoted by parenthesis
    new_text = text
    for rgx_match in rgx_list:
        new_text = re.sub(rgx_match, '', new_text)
    words_stemmed_lemmed = []
    for word in tokenizer.tokenize(new_text):
        new_word = stemmer.stem(lemmer.lemmatize(word.lower()))
        if new_word not in stop_words :
            words_stemmed_lemmed.append(new_word)
    return words_stemmed_lemmed

def to_onehot(y, class_rng):
    res = []
    for i in class_rng:
        if y is i:
            res.append(1)
        else:
            res.append(0)
    return res

testing['title_proc'] = testing['title'].apply(lambda title: token_stem_lem(title))
testing['descrip_proc'] = testing['description'].apply(lambda desc: token_stem_lem(desc))
testing['onehot'] = testing['class'].apply(lambda y: to_onehot(y, range(1, classes + 1)))
training['title_proc'] = training['title'].apply(lambda title: token_stem_lem(title))
training['descrip_proc'] = training['description'].apply(lambda desc: token_stem_lem(desc))
training['onehot'] = training['class'].apply(lambda y: to_onehot(y, range(1, classes + 1)))

[nltk_data] Downloading package wordnet to /home/mike/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
training

Unnamed: 0,class,title,description,title_proc,descrip_proc,onehot
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...","[wall, st., bear, claw, back, black]","[short-sel, wall, street, dwindling\band, ultr...","[0, 0, 1, 0]"
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,"[carlyl, look, toward, commerci, aerospac]","[privat, invest, firm, carlyl, group, \which, ...","[0, 0, 1, 0]"
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,"[oil, economi, cloud, stock, outlook]","[soar, crude, price, plu, worries\about, econo...","[0, 0, 1, 0]"
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,"[iraq, halt, oil, export, main, southern, pipe...","[author, halt, oil, export\flow, main, pipelin...","[0, 0, 1, 0]"
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","[oil, price, soar, all-tim, record, pose, new,...","[tearaway, world, oil, price, toppl, record, s...","[0, 0, 1, 0]"
...,...,...,...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...,"[pakistan, musharraf, say, wo, n't, quit, armi...","[pakistani, presid, pervez, musharraf, ha, sai...","[1, 0, 0, 0]"
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...,"[renteria, sign, top-shelf, deal]","[red, sox, gener, manag, theo, epstein, acknow...","[0, 1, 0, 0]"
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...,"[saban, go, dolphin, yet]","[miami, dolphin, put, courtship, lsu, coach, n...","[0, 1, 0, 0]"
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...,"[today, nfl, game]","[pittsburgh, ny, giant, time, 1:30, p.m., line...","[0, 1, 0, 0]"


# Feature Selection

In [3]:
word_dict = {}
def update_word_dict(row, col):
    for word in row[col]:
        if not word in word_dict:
            word_dict[word] = [0,0,0,0]
        word_dict[word][row['class'] - 1] += 1

for index, row in training.iterrows():
    update_word_dict(row, 'descrip_proc')
    


In [4]:
# Takes a very long time.
import operator
top_words_amount = 2000

def reduce_word_dict(wd, key):
    r = dict(wd)
    del r[key]
    return r

def sort_top_words(word_dict, top_words_amount, clas):
    top_words = {}
    for i in range(top_words_amount):
        word = max(word_dict, key=lambda word: word_dict[word][clas])
        top_words[word] = i
        word_dict[word][clas] = 0
    return top_words
        
classes_amount = 4
top_words = [sort_top_words(word_dict, top_words_amount, class_num) for class_num in range(classes_amount)]

# Construct Examples

In [113]:
from sklearn.feature_extraction.text import TfidfVectorizer

def identity_tokenizer(row):
    return row

tfidf = TfidfVectorizer(
    tokenizer=identity_tokenizer, 
    analyzer='word', 
    preprocessor=identity_tokenizer, 
    token_pattern=None,
    vocabulary=top_words[0]
)
tfidf.fit(training['descrip_proc'])
X_train = tfidf.transform(training['descrip_proc']).toarray()
Y_train = training['onehot']
X_test = tfidf.transform(testing['descrip_proc']).toarray()
Y_test = testing['onehot']


# Classifiers

## Logistic Regression

In [124]:
# import math
import random
from scipy import sparse
from sklearn.model_selection import GridSearchCV


class Logistic_regression():

    def __init__(self, classes, epochs = 10, validation_section = 0, batch_samples = 5000, learning_rate=0.01, gd_type='mini-batch', lambda_reg=0.01):
        self.classes = classes
        self.epochs = epochs
        self.validation_section = validation_section
        self.batch_samples = batch_samples
        self.learning_rate = learning_rate
        self.gd_type = gd_type
        self.lambda_reg = lambda_reg
        self.losses = []
        
    def get_params(self, deep=True):
        params = [{
                'batch_samples': self.batch_samples,
                'learning_rate': self.learning_rate,
                'gd_type': self.gd_type,
                'lambda_reg': self.lambda_reg
            }]
        return params
        
    def get_phi(self, x):
        return self.w.dot(x.T)

    def softmax(self, x):
        phi_sum = 0
        phi = self.get_phi(x)
        for i in phi:
            phi_sum += math.exp(i)
        return [math.exp(phie)/phi_sum for phie in phi]

    def gradient(self, p, y, x):
        cross_entropy = np.subtract(p,y)
        x = np.array(x)
        return np.asarray([i * x for i in cross_entropy])
    
    # accepts ground truth, y in one hot encoding and predicted vector p and returns float entropy
    def cross_entropy(self, y, p):
        p = np.array(list(map(lambda i: math.log(i,10), p)))
        return np.array(y).dot(p.T) * -1
    
    def find_loss(self,loss_list):
        return np.sum(loss_list) / len(loss_list)
    
    def mini_batch(self, x, y):
        for epoch in range(self.epochs):
            gradients = np.zeros(self.w.shape)
            for _ in range(self.batch_samples):
                i = np.random.randint(len(x))
                p = self.softmax(x[i])
                gradients = np.add(gradients, self.gradient(p, y[i], x[i]))
                self.losses.append(self.cross_entropy(y[i], p))
            reg = np.multiply(self.lambda_reg, self.w)
            self.w = self.w - self.learning_rate * np.add(gradients, reg)
            print('Epoch %d complete. Traning loss: %1.4f' % (epoch, self.find_loss(self.losses[-self.batch_samples:])))

    def stochastic(self, x, y):
        for epoch in range(self.epochs):
            for _ in range(self.batch_samples):
                i = np.random.randint(len(x))
                p = self.softmax(x[i])
                grad = self.gradient(p, y[i], x[i])
                self.losses.append(self.cross_entropy(y[i], p))
                self.w = self.w - self.learning_rate * np.add(grad, np.multiply(self.lambda_reg, self.w))
            print('Epoch %d complete. Traning loss: %1.4f' % (epoch, self.find_loss(self.losses[-self.batch_samples:])))

        
    def fit(self, x,y):
        features = x.shape[1]
        self.w = np.zeros((classes, features))
        val_range_min = 0
        val_range_max = 0
        if self.gd_type == 'mini-batch':
            self.mini_batch(x, y)
        if self.gd_type == 'stochastic':
            self.stochastic(x, y)
        return self
    
    def predict(self, x):
        return self.softmax(x)
    
    def score(self, x, y):
        p = self.predict(x)
        return self.cross_entropy(y, p)

In [125]:
lr = Logistic_regression(classes, gd_type='mini-batch', learning_rate=0.01)
lr.fit(X_train, Y_train)

Epoch 0 complete. Traning loss: 0.6021
Epoch 1 complete. Traning loss: 0.5613
Epoch 2 complete. Traning loss: 0.5259
Epoch 3 complete. Traning loss: 0.4961
Epoch 4 complete. Traning loss: 0.4698
Epoch 5 complete. Traning loss: 0.4507
Epoch 6 complete. Traning loss: 0.4338
Epoch 7 complete. Traning loss: 0.4142
Epoch 8 complete. Traning loss: 0.4031
Epoch 9 complete. Traning loss: 0.3887


<__main__.Logistic_regression at 0x7fa4c3de6470>

In [None]:
params_to_be_compared = [{
        'batch_samples':[i * 10000 for i in range(0,9,2)],
        'learning_rate':[1/10**i for i in range(1,6)],
        'gd_type':['mini-batch'],
        'lambda_reg':[10**i/100 for i in range(1,5)]
    }]
lrgs = GridSearchCV(Logistic_regression(4), param_grid=params_to_be_compared, scoring='accuracy')
lrgs_results = lrgs.fit(X_train, Y_train)

In [None]:
def old_mini_batch():
    for epoch in range(self.epochs):
        epoch_range_min = len(x)/self.epochs * epoch
        epoch_range_max = len(x)/self.epochs * (epoch + 1)
        if epoch == self.validation_section:
            val_range_min = epoch_range_min
            val_range_max = epoch_range_max
        else:
            for _ in range(self.batch_samples):
                i = random.randint(epoch_range_min, epoch_range_max - 1)
                p = self.softmax(x[i])
                grad = self.gradient(p, y[i], x[i])
                self.losses.append(self.cross_entropy(y[i], p))
                self.w = self.w - self.learning_rate * np.add(grad, np.multiply(self.lambda_reg, self.w))
            print('Epoch %d complete. Traning loss: %1.4f' % (epoch, self.find_loss(self.losses[-self.batch_samples:])))
    # validation batch
    for _ in range(self.batch_samples):
        i = random.randint(epoch_range_min, epoch_range_max - 1)
        p = self.softmax(x[i])
        grad = self.gradient(p, y[i], x[i])
        self.losses.append(self.cross_entropy(y[i], p))
    print('Epoch %d complete. Traning loss: %1.4f' % (epoch, self.find_loss(self.losses[-self.batch_samples:])))


## Multilayer Perceptron Network

In [214]:
class Multilayer_Perceptron():
    def __init__(self, epochs = 10, neurons = 50, batch_samples = 5000, learning_rate=0.01):
        self.epochs = epochs
        self.neurons = neurons
        self.batch_samples = batch_samples
        self.learning_rate = learning_rate
        
    def get_params(self, deep=True):
        params = [{
                'epochs' : self.epochs,
                'neurons' : self.neurons,
                'batch_samples' : self.batch_samples,
                'learning_rate' : self.learning_rate
            }]
        return params
    
    def get_z(self, w, x):
        return w.dot(x.T)
    
    def sigmoid(self, z):
        return np.array(list(map(lambda z: 1 / (1 + math.exp(z * -1)), z)))
    
    def softmax(self, z):
        z_sum = 0
        for i in z:
            z_sum += math.exp(i)
        return [math.exp(ze)/z_sum for ze in z]
    
        # accepts ground truth, y in one hot encoding and predicted vector p and returns float entropy
    def cross_entropy(self, y, p):
        p = np.array(list(map(lambda i: math.log(i,10), p)))
        return np.array(y).dot(p.T) * -1
    
    def find_loss(self,loss_list):
        return np.sum(loss_list) / len(loss_list)
    
    def fit(self, x, y):
        w1 = np.zeros((self.neurons, x.shape[1]))
        w2 = np.zeros((len(y[0]), self.neurons))
        w1g = np.zeros((w1.shape))
        w2g = np.zeros((w2.shape))
        losses = []
        for epoch in range(self.epochs):
            for _ in range(self.batch_samples):
                i = np.random.randint(len(x))
                #forward Prop
                z1 = self.get_z(w1, x[i])
                h1 = self.sigmoid(z1)
                z2 = self.get_z(w2, h1)
                h2 = self.softmax(z2)
                #backprop
                yi = np.array(y[i]).reshape(len(y[i]),1)
                w2_grad = yi.dot(h1.reshape(1, h1.shape[0]))
                w1_grad_sch = yi.T.dot(w2).dot(h1)
                w1_grad = np.array([h1e * w1_grad_sch * -1 for h1e in h1]).reshape(h1.shape[0],1).dot(x[i].reshape(x[i].shape[0],1).T)
                w1g = np.add(w1g, w1_grad)
                w2g = np.add(w2g, w2_grad)
                losses.append(self.cross_entropy(y[i], h2))
            w1 = np.subtract(w1, w1g * self.learning_rate)
            w2 = np.subtract(w2, w2g * self.learning_rate)
            print('Epoch %d complete. Traning loss: %1.4f' % (epoch, self.find_loss(losses[-self.batch_samples:])))


In [215]:
mlp = Multilayer_Perceptron()
mlp.fit(X_train, Y_train) 

(50, 2000)
Epoch 0 complete. Traning loss: 0.6021
(50, 2000)
Epoch 1 complete. Traning loss: 1.2039
(50, 2000)
Epoch 2 complete. Traning loss: 0.6021


ZeroDivisionError: float division by zero

In [157]:
len(Y_train[0])

4