# Preprocess data

In [15]:
# Get data from file
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
nltk.download('wordnet')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stop_words.update([',','\'','.','\"','...','`','#','$','%','&','*',';',':','/b','u','gt','lt','//','\'s','\'\'','-','reuter'])
classes = np.array([1,2,3,4])
training_filename = 'ag_news_csv/train.csv'
testing_filename = 'ag_news_csv/test.csv'
col_names = ['class','title','description']
training = pd.read_csv(training_filename, names=col_names)
testing = pd.read_csv(testing_filename, names=col_names)

tokenizer=nltk.tokenize.TreebankWordTokenizer()
stemmer = nltk.stem.PorterStemmer()
lemmer = nltk.stem.WordNetLemmatizer()

rgx_list = ['(\w+[A-Z]+.*\-+\s)','(\({1}\w+[\.*\s*\w*]*\){1})']

def token_stem_lem(text):
    # remove location of article before first - , and any source denoted by parenthesis
    new_text = text
    for rgx_match in rgx_list:
        new_text = re.sub(rgx_match, '', new_text)
    words_stemmed_lemmed = []
    for word in tokenizer.tokenize(new_text):
        new_word = stemmer.stem(lemmer.lemmatize(word.lower()))
        if new_word not in stop_words :
            words_stemmed_lemmed.append(new_word)
    return words_stemmed_lemmed

def to_onehot(y, class_rng):
    res = []
    for i in class_rng:
        if y is i:
            res.append(1)
        else:
            res.append(0)
    return res

testing['title_proc'] = testing['title'].apply(lambda title: token_stem_lem(title))
testing['descrip_proc'] = testing['description'].apply(lambda desc: token_stem_lem(desc))
testing['onehot'] = testing['class'].apply(lambda y: to_onehot(y, range(1, classes.shape[0] + 1)))
training['title_proc'] = training['title'].apply(lambda title: token_stem_lem(title))
training['descrip_proc'] = training['description'].apply(lambda desc: token_stem_lem(desc))
training['onehot'] = training['class'].apply(lambda y: to_onehot(y, range(1, classes.shape[0] + 1)))

[nltk_data] Downloading package wordnet to /home/mike/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
training

Unnamed: 0,class,title,description,title_proc,descrip_proc,onehot
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...","[wall, st., bear, claw, back, black]","[short-sel, wall, street, dwindling\band, ultr...","[0, 0, 1, 0]"
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,"[carlyl, look, toward, commerci, aerospac]","[privat, invest, firm, carlyl, group, \which, ...","[0, 0, 1, 0]"
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,"[oil, economi, cloud, stock, outlook]","[soar, crude, price, plu, worries\about, econo...","[0, 0, 1, 0]"
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,"[iraq, halt, oil, export, main, southern, pipe...","[author, halt, oil, export\flow, main, pipelin...","[0, 0, 1, 0]"
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","[oil, price, soar, all-tim, record, pose, new,...","[tearaway, world, oil, price, toppl, record, s...","[0, 0, 1, 0]"
...,...,...,...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...,"[pakistan, musharraf, say, wo, n't, quit, armi...","[pakistani, presid, pervez, musharraf, ha, sai...","[1, 0, 0, 0]"
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...,"[renteria, sign, top-shelf, deal]","[red, sox, gener, manag, theo, epstein, acknow...","[0, 1, 0, 0]"
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...,"[saban, go, dolphin, yet]","[miami, dolphin, put, courtship, lsu, coach, n...","[0, 1, 0, 0]"
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...,"[today, nfl, game]","[pittsburgh, ny, giant, time, 1:30, p.m., line...","[0, 1, 0, 0]"


# Feature Selection

Updates a dictionary of all words used and their term frequency per class as a list.

In [3]:
word_dict = {}
def update_word_dict(row, col):
    for word in row[col]:
        if not word in word_dict:
            word_dict[word] = [0,0,0,0]
        word_dict[word][row['class'] - 1] += 1

for index, row in training.iterrows():
    update_word_dict(row, 'descrip_proc')
    


The sort_top_words method will return the top n words as a dictionary where the value of the word is its place.

In [4]:
# Takes a very long time.
import operator
top_words_amount = 2000

def sort_top_words(word_dict, top_words_amount, clas):
    top_words = {}
    for i in range(top_words_amount):
        word = max(word_dict, key=lambda word: word_dict[word][clas])
        top_words[word] = i
        word_dict[word][clas] = 0
    return top_words
        
classes_amount = 4
top_words = [sort_top_words(word_dict, top_words_amount, class_num) for class_num in range(classes_amount)]

# Construct Examples

The following block uses the top_words dictionary as a vocabulary for the tfidf victorizer. The result is the tfidf value of each word in the dictionary corrisponding to its use in each sample article.

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

def identity_tokenizer(row):
    return row

tfidf = TfidfVectorizer(
    tokenizer=identity_tokenizer, 
    analyzer='word', 
    preprocessor=identity_tokenizer, 
    token_pattern=None,
    vocabulary=top_words[0]
)

tfidf.fit(training['descrip_proc'])
X_train = tfidf.transform(training['descrip_proc']).toarray()
Y_train = np.array(training['class'])
Y_train_oh = np.array(training['onehot'].tolist())
X_test = tfidf.transform(testing['descrip_proc']).toarray()
Y_test = np.array(testing['class'])
Y_test_oh = np.array(testing['onehot'].tolist())

# Classifiers

## Logistic Regression

The Logistic_regression function is my logistic regression algorithm which I attemt to adhere to the scikit-learn estimator scheme. By implementing the methods get_params, set_params, fit, predict and score, my logistic regression algorithm should be able to be used by GridsearchCV. It uses Mini-Batch Gradient Descent and Stochastic Gradient Descent.

In [44]:
# import math
import random
import math
from scipy import sparse
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator


class Logistic_regression(BaseEstimator):

    def __init__(self, 
                 epochs = 10, 
                 batch_samples = 5000, 
                 learning_rate=0.01, 
                 gd_type='mini-batch', 
                 lambda_reg=0.01,
                 verbose=True):
        self.epochs = epochs
        self.batch_samples = batch_samples
        self.learning_rate = learning_rate
        self.gd_type = gd_type
        self.lambda_reg = lambda_reg
        self.verbose = verbose
        
    def get_params(self, deep=True):
        params = {
                'epochs': self.epochs,
                'batch_samples': self.batch_samples,
                'learning_rate': self.learning_rate,
                'gd_type': self.gd_type,
                'lambda_reg': self.lambda_reg
            }
        return params

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
        
    def to_onehot(self, y, classes):
        result = np.zeros((y.shape[0], classes.shape[0]))
        for i, row in enumerate(result):
            row[y[i] - 1] = 1
        return result
    
    def from_onehot(self, y):
        return np.array([np.argmax(i) for i in y.T])
        
    def get_phi(self, x):
        return self.w.dot(x)

    def softmax(self, x):
        phi = self.get_phi(x)
        phi_sum = np.zeros(phi.shape[1])
        for i in range(phi.shape[0]):
            phi_sum += np.exp(phi[i,:])
        return phi / phi_sum[None, :]

    def gradient(self, p, y, x):
        cross_entropy = p - y
        return cross_entropy.dot(x.T) + self.lambda_reg * self.w
    
    # accepts ground truth, y in one hot encoding and predicted vector p and returns float entropy
    def cross_entropy(self, y, p):
        p = np.array(list(map(lambda i: math.log(i,10), p)))
        return np.array(y).dot(p.T) * -1
    
    def find_loss(self,loss_list):
        return np.sum(loss_list) / len(loss_list)
    
    def mini_batch(self, x, y):
        for epoch in range(self.epochs):
            gradients = np.zeros(self.w.shape)
            xrand = np.random.randint(x.shape[1], size=self.batch_samples)
            x = np.array([x[:,i] for i in xrand]).T
            y = np.array([y[:,i] for i in xrand]).T
            p = self.softmax(x)
            dw = self.gradient(p, y, x)
            #self.losses.append(self.cross_entropy(y, p))
            score = self.score(y, p)
            self.w = self.w - self.learning_rate * dw
            if self.verbose: print('Epoch %d complete. Score: %1.4f' % (epoch, score))
        return self

    def stochastic(self, x, y):
        for epoch in range(self.epochs):
            for _ in range(self.batch_samples):
                i = np.random.randint(x.shape[0])
                p = self.softmax(x[i])
                grad = self.gradient(p, y[i], x[i])
                self.losses.append(self.cross_entropy(y[i], p))
                self.w = self.w - self.learning_rate * np.add(grad, np.multiply(self.lambda_reg, self.w))
            print('Epoch %d complete. Traning loss: %1.4f' % (epoch, self.find_loss(self.losses[-self.batch_samples:])))
        return self
        
    def fit(self, x, y, classes=None):
        x = x.T
        if classes is None:
            y = y.T
        else:
            y = self.to_onehot(y, classes).T
        features = x.shape[0]
        classes = y.shape[0]
        self.w = np.zeros((classes, features))
        if self.gd_type == 'mini-batch':
            self.mini_batch(x, y)
        if self.gd_type == 'stochastic':
            self.stochastic(x, y)
        return self
    
    def predict(self, x):
        return self.softmax(x.T)
    
    def score(self, y, p):
        return accuracy_score(self.from_onehot(y), self.from_onehot(p))
    

In [26]:
lr = Logistic_regression(gd_type='mini-batch', learning_rate=0.01)
lr.fit(X_train, Y_train, classes)

(2000, 120000)
(4, 120000)
Epoch 0 complete. Score: 0.2456
Epoch 1 complete. Score: 0.7998
Epoch 2 complete. Score: 0.8228
Epoch 3 complete. Score: 0.8318
Epoch 4 complete. Score: 0.8484
Epoch 5 complete. Score: 0.8660
Epoch 6 complete. Score: 0.8630
Epoch 7 complete. Score: 0.8752
Epoch 8 complete. Score: 0.8780
Epoch 9 complete. Score: 0.8816


Logistic_regression()

In [None]:
from sklearn.metrics import fbeta_score, make_scorer
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.utils.estimator_checks import check_estimator
from sklearn.model_selection import GridSearchCV

clf = svm.SVC(random_state=0)

ftwo_scorer = make_scorer(fbeta_score, beta=2)

params_to_be_compared = [{
        'epochs' : [10, 30, 50],
        'batch_samples' : [i * 1000 for i in range(1,9,2)],
        'learning_rate' : [1/10**i for i in range(1,6)],
        'gd_type' : ['mini-batch'],
        'lambda_reg' : [10**i/100 for i in range(1,5)],
        'verbose' : False
    }]

test_p = [{
        'epochs': [10,20]
    }]

print(params_to_be_compared[0].items())
lrgs = GridSearchCV(Logistic_regression(), param_grid=test_p, cv=5, scoring='accuracy')
lrgs_results = lrgs.fit(X=X_train, y=Y_train, classes=classes)

In [194]:
lrgs_results.best_score_

nan

In [45]:
from sklearn.model_selection import cross_validate

cvresults = cross_validate(Logistic_regression(), X_train, Y_train_oh, cv=3)
cvresults['test_score']

Epoch 0 complete. Score: 0.2478
Epoch 1 complete. Score: 0.7822
Epoch 2 complete. Score: 0.8040
Epoch 3 complete. Score: 0.8122
Epoch 4 complete. Score: 0.8330
Epoch 5 complete. Score: 0.8422
Epoch 6 complete. Score: 0.8604
Epoch 7 complete. Score: 0.8572
Epoch 8 complete. Score: 0.8636
Epoch 9 complete. Score: 0.8666


Traceback (most recent call last):
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/metrics/_scorer.py", line 397, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "<ipython-input-44-c41c969617cf>", line 115, in score
    return accuracy_score(self.from_onehot(y), self.from_onehot(p))
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/metrics/_classification.py", line 202, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/metrics/_classification.py", line 83, in _check_targets
    check_consistent_length(y_true, y_pred)
  File "/home/mike/.local/lib/python3.6

Epoch 0 complete. Score: 0.2534
Epoch 1 complete. Score: 0.8058
Epoch 2 complete. Score: 0.8208
Epoch 3 complete. Score: 0.8374
Epoch 4 complete. Score: 0.8452
Epoch 5 complete. Score: 0.8456
Epoch 6 complete. Score: 0.8448
Epoch 7 complete. Score: 0.8490
Epoch 8 complete. Score: 0.8536
Epoch 9 complete. Score: 0.8558


Traceback (most recent call last):
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/metrics/_scorer.py", line 397, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "<ipython-input-44-c41c969617cf>", line 115, in score
    return accuracy_score(self.from_onehot(y), self.from_onehot(p))
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/metrics/_classification.py", line 202, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/metrics/_classification.py", line 83, in _check_targets
    check_consistent_length(y_true, y_pred)
  File "/home/mike/.local/lib/python3.6

Epoch 0 complete. Score: 0.2612
Epoch 1 complete. Score: 0.8070
Epoch 2 complete. Score: 0.8136
Epoch 3 complete. Score: 0.8194
Epoch 4 complete. Score: 0.8256
Epoch 5 complete. Score: 0.8358
Epoch 6 complete. Score: 0.8428
Epoch 7 complete. Score: 0.8400
Epoch 8 complete. Score: 0.8526
Epoch 9 complete. Score: 0.8522


Traceback (most recent call last):
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/metrics/_scorer.py", line 397, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "<ipython-input-44-c41c969617cf>", line 115, in score
    return accuracy_score(self.from_onehot(y), self.from_onehot(p))
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/metrics/_classification.py", line 202, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/home/mike/.local/lib/python3.6/site-packages/sklearn/metrics/_classification.py", line 83, in _check_targets
    check_consistent_length(y_true, y_pred)
  File "/home/mike/.local/lib/python3.6

array([nan, nan, nan])

## Multilayer Perceptron Network

Along the lines of the logistic regression class, my Multilayer_Perceptron class also adheres to the scikit-learn estimator scheme.

In [210]:
class Multilayer_Perceptron():
    def __init__(self, epochs = 20, neurons = 50, batch_samples = 500, learning_rate=0.01):
        self.epochs = epochs
        self.neurons = neurons
        self.batch_samples = batch_samples
        self.learning_rate = learning_rate
        
    def get_params(self, deep=True):
        params = {
                'epochs' : self.epochs,
                'neurons' : self.neurons,
                'batch_samples' : self.batch_samples,
                'learning_rate' : self.learning_rate
            }
        return params
    
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
    
    def get_z(self, w, x, b):
        return w.T.dot(x) + b
    
    def sigmoid(self, z):
        return np.array(list(map(lambda z: 1 / (1 + math.exp(z * -1)), z))).reshape(z.shape)
    
    def sigmoid_gradient(self, z):
        return self.sigmoid(z) * (1 - self.sigmoid(z))
    
    def softmax(self, z):
        z_sum = 0
        for i in z:
            z_sum += math.exp(i)
        if z_sum == 0:
            print(z)
        return np.array([math.exp(ze)/z_sum for ze in z]).reshape(z.shape)
    
        # accepts ground truth, y in one hot encoding and predicted vector p and returns float entropy
    def cross_entropy(self, y, p):
        p = np.array(list(map(lambda i: math.log(i,10), p)))
        return np.array(y).dot(p.T) * -1
    
    def find_loss(self,loss_list):
        return np.sum(loss_list) / len(loss_list)
    
    def fit(self, x, y):
        features = x.shape[1]
        classes = len(y[0])
        self.w1 = np.zeros((features, self.neurons))
        self.b1 = np.zeros((self.neurons, 1))
        self.w2 = np.zeros((self.neurons, classes))
        self.b2 = np.zeros((classes, 1))
        losses = []
        for epoch in range(self.epochs):
            dw1 = np.zeros((w1.shape))
            db1 = np.zeros((b1.shape))
            dw2 = np.zeros((w2.shape))
            db2 = np.zeros((b2.shape))
            for _ in range(self.batch_samples):
                i = np.random.randint(len(x))
                
                yi = np.array(y[i]).reshape(len(y[i]), 1)
                xi = np.array(x[i]).reshape(x[i].shape[0], 1)
                
                #forward Prop
                z1 = self.get_z(w1, xi, b1) #neuronsx1
                h1 = self.sigmoid(z1)       #neuronsx1
                z2 = self.get_z(w2, h1, b2) #classesx1
                h2 = self.softmax(z2)       #classesx1
                
                #backprop                
                dJ_dz2 = h2 - yi            #classsesx1
                dw2 += dJ_dz2.dot(h1.T).T    #classesxneurons
                db2 += dJ_dz2                #classesx1
                dz2_dh1 = w2                #neuronsxclasses
                dJ_dh1 = dJ_dz2.T.dot(dz2_dh1.T) #1xneurons
                dh1_dz1 = self.sigmoid_gradient(h1) #neuronsx1
                dz1_dw1 = xi                #featuresx1
                
                # Update epoch summations
                db1 += dJ_dh1.T * dh1_dz1
                dw1 += dz1_dw1.dot(db1.T)
                losses.append(self.cross_entropy(y[i], h2))
            
            # Update weights
            w1 = np.subtract(w1, dw1 * self.learning_rate)
            b1 = np.subtract(b1, db1 * self.learning_rate)
            w2 = np.subtract(w2, dw2 * self.learning_rate)
            b2 = np.subtract(b2, db2 * self.learning_rate)
            print('Epoch %d complete. Traning loss: %1.4f' % (epoch, self.find_loss(losses[-self.batch_samples:])))
            
            
    def predict(self, x):
        xi = np.array(x[i]).reshape(x[i].shape[0], 1)

        #forward Prop
        z1 = self.get_z(w1, xi, b1) #neuronsx1
        h1 = self.sigmoid(z1)       #neuronsx1
        z2 = self.get_z(w2, h1, b2) #classesx1
        h2 = self.softmax(z2)       #classesx1
        return h2

    def score(self, x, y):
        p = self.predict(x)
        return self.cross_entropy(y, p)

In [166]:
mlp = Multilayer_Perceptron()
mlp.fit(X_train, Y_train) 

Epoch 0 complete. Traning loss: 0.6021
Epoch 1 complete. Traning loss: 1.0276
Epoch 2 complete. Traning loss: 5.6948
Epoch 3 complete. Traning loss: 1.7459
Epoch 4 complete. Traning loss: 0.7804
Epoch 5 complete. Traning loss: 0.6735
Epoch 6 complete. Traning loss: 0.6070
Epoch 7 complete. Traning loss: 0.5977
Epoch 8 complete. Traning loss: 0.6035
Epoch 9 complete. Traning loss: 0.6034
Epoch 10 complete. Traning loss: 0.6053
Epoch 11 complete. Traning loss: 0.6078
Epoch 12 complete. Traning loss: 0.6032
Epoch 13 complete. Traning loss: 0.6034
Epoch 14 complete. Traning loss: 0.6102
Epoch 15 complete. Traning loss: 0.6086
Epoch 16 complete. Traning loss: 0.6000
Epoch 17 complete. Traning loss: 0.6051
Epoch 18 complete. Traning loss: 0.6119
Epoch 19 complete. Traning loss: 0.6062
