# Preprocess data

In [61]:
# Get data from file
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
nltk.download('wordnet')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stop_words.update([',','\'','.','\"','...','`','#','$','%','&','*',';',':','/b','u','gt','lt','//','\'s','\'\'','-','reuter'])
classes = 4
training_filename = 'ag_news_csv/train.csv'
testing_filename = 'ag_news_csv/test.csv'
col_names = ['class','title','description']
training = pd.read_csv(training_filename, names=col_names)
testing = pd.read_csv(testing_filename, names=col_names)

tokenizer=nltk.tokenize.TreebankWordTokenizer()
stemmer = nltk.stem.PorterStemmer()
lemmer = nltk.stem.WordNetLemmatizer()

rgx_list = ['(\w+[A-Z]+.*\-+\s)','(\({1}\w+[\.*\s*\w*]*\){1})']

def token_stem_lem(text):
    # remove location of article before first - , and any source denoted by parenthesis
    new_text = text
    for rgx_match in rgx_list:
        new_text = re.sub(rgx_match, '', new_text)
    words_stemmed_lemmed = []
    for word in tokenizer.tokenize(new_text):
        new_word = stemmer.stem(lemmer.lemmatize(word.lower()))
        if new_word not in stop_words :
            words_stemmed_lemmed.append(new_word)
    return words_stemmed_lemmed

def to_onehot(y, class_rng):
    res = []
    for i in class_rng:
        if y is i:
            res.append(1)
        else:
            res.append(0)
    return res

testing['title_proc'] = testing['title'].apply(lambda title: token_stem_lem(title))
testing['descrip_proc'] = testing['description'].apply(lambda desc: token_stem_lem(desc))
testing['onehot'] = testing['class'].apply(lambda y: to_onehot(y, range(1, classes + 1)))
training['title_proc'] = training['title'].apply(lambda title: token_stem_lem(title))
training['descrip_proc'] = training['description'].apply(lambda desc: token_stem_lem(desc))
training['onehot'] = training['class'].apply(lambda y: to_onehot(y, range(1, classes + 1)))

[nltk_data] Downloading package wordnet to /home/mike/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [62]:
training

Unnamed: 0,class,title,description,title_proc,descrip_proc,onehot
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...","[wall, st., bear, claw, back, black]","[reuter, -, short-sel, wall, street, dwindling...","[0, 0, 1, 0]"
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,"[carlyl, look, toward, commerci, aerospac]","[reuter, -, privat, invest, firm, carlyl, grou...","[0, 0, 1, 0]"
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,"[oil, economi, cloud, stock, outlook]","[reuter, -, soar, crude, price, plu, worries\a...","[0, 0, 1, 0]"
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,"[iraq, halt, oil, export, main, southern, pipe...","[reuter, -, author, halt, oil, export\flow, ma...","[0, 0, 1, 0]"
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","[oil, price, soar, all-tim, record, pose, new,...","[tearaway, world, oil, price, toppl, record, s...","[0, 0, 1, 0]"
...,...,...,...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...,"[pakistan, musharraf, say, wo, n't, quit, armi...","[pakistani, presid, pervez, musharraf, ha, sai...","[1, 0, 0, 0]"
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...,"[renteria, sign, top-shelf, deal]","[red, sox, gener, manag, theo, epstein, acknow...","[0, 1, 0, 0]"
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...,"[saban, go, dolphin, yet]","[miami, dolphin, put, courtship, lsu, coach, n...","[0, 1, 0, 0]"
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...,"[today, nfl, game]","[pittsburgh, ny, giant, time, 1:30, p.m., line...","[0, 1, 0, 0]"


# Feature Selection

In [7]:
word_dict = {}
def update_word_dict(row, col):
    for word in row[col]:
        if not word in word_dict:
            word_dict[word] = [0,0,0,0]
        word_dict[word][row['class'] - 1] += 1

for index, row in training.iterrows():
    update_word_dict(row, 'descrip_proc')
    


In [8]:
# Takes a very long time.
import operator
top_words_amount = 2000

def reduce_word_dict(wd, key):
    r = dict(wd)
    del r[key]
    return r

def sort_top_words(word_dict, top_words_amount, clas):
    top_words = {}
    for i in range(top_words_amount):
        word = max(word_dict, key=lambda word: word_dict[word][clas])
        top_words[word] = i
        word_dict[word][clas] = 0
    return top_words
        
classes_amount = 4
top_words = [sort_top_words(word_dict, top_words_amount, class_num) for class_num in range(classes_amount)]

# Construct Examples

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

def identity_tokenizer(row):
    return row

tfidf = TfidfVectorizer(
    tokenizer=identity_tokenizer, 
    analyzer='word', 
    preprocessor=identity_tokenizer, 
    token_pattern=None,
    vocabulary=top_words[0]
)
tfidf.fit(training['descrip_proc'])
X_train = tfidf.transform(training['descrip_proc'])
Y_train = training['onehot']
X_test = tfidf.transform(testing['descrip_proc'])
Y_test = testing['onehot']


# Classifiers

## Logistic Regression

In [150]:
import math
import random
from scipy import sparse

def get_phi(w, x):
    return w.dot(x.T)

def softmax(phi):
    phi_sum = 0
    for i in phi:
        phi_sum += math.exp(i)
    return [math.exp(phie)/phi_sum for phie in phi]

def gradient(p, y, x):
    cross_entropy = np.subtract(p,y)
    x = np.array(x)
    return np.asarray([i * x for i in cross_entropy])

    
def logistic_regression(w, x, y, classes, epochs = 10, batch_samples = 1000, learning_rate=0.01, gd_type='mini-batch'):
    if gd_type == 'mini-batch':
        for epoch in range(epochs):
            epoch_range_min = len(x)/epochs * epoch
            epoch_range_max = len(x)/epochs * (epoch + 1)
            for j in range(batch_samples):
                i = random.randrange(epoch_range_min, epoch_range_max)
                p = softmax(get_phi(w, x[i]))
                grad = gradient(p, y[i], x[i])
                w = w - learning_rate * grad
    return w
        

features = tfidf_train.shape[1]
w_lr = np.zeros((classes, features))
w_lr_gd = logistic_regression(w_lr, X_train.toarray(), Y_train, classes)


In [151]:
print(w_lr)
print(w_lr_gd)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[ 0.18380241 -0.25509605  0.03210785 ... -0.00230967  0.
   0.00343579]
 [-0.339867    0.3818803   0.58371119 ...  0.01527438  0.
   0.00878101]
 [ 0.27910273  0.00357492 -0.30169064 ... -0.00515327  0.
  -0.00199606]
 [-0.12303813 -0.13035917 -0.3141284  ... -0.00781144  0.
  -0.01022074]]


In [None]:
neurons = 50
w = np.zeros((neurons, tfidf_train.shape[1]))
b = np.zeros()
tfT = tfidf_train.transpose()
print(tfidf_train.shape)
print(tfT.shape)
print(w.shape)
