# Machine Learning Text Classification
This code trains a HT model using SVM.SVC or Multinomial Naive Bayes.
This code convert each document into its vector and only sotres vectors into memory for efficiency.

In [36]:
import os, json
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, f1_score
from scipy import sparse
from random import shuffle
from scipy.sparse import coo_matrix, vstack

vectorizer = CountVectorizer(min_df=1)
#89668 negative docs 
#203181 positive docs
count = 90000

### Find list of unique words from both negative and positive documents
To find unique words for each document, I used the word return from vectorizing, this methos may not be very efficient as we vectorize documents again in next step.

In [37]:
unique_words = []

def find_unique_word(words):
    for word in words:
        if word not in unique_words:
            unique_words.append(word)

def get_vector_words(text):
    vector = vectorizer.fit_transform([text])
    words_or_features = vectorizer.get_feature_names()
    return vector.toarray()[0], words_or_features

with open("cp1_negative_train_UPDATED.json", 'r') as f:
    c = 0
    for line in f:
        if c < count:
            extracted_text = json.loads(line)['extracted_text']
            vector, words = get_vector_words(extracted_text)
            find_unique_word(words)
            c += 1
            if c%20000 == 0:
                print "{0} negative processed (finding unique word)".format(c)
    f.close()

with open("CP1_train_ads.json", 'r') as f:
    c = 0
    for line in f:
        if c < count:
            extracted_text = json.loads(line)['extracted_text']
            vector, words = get_vector_words(extracted_text)
            find_unique_word(words)
            c += 1
            if c%20000 == 0:
                print "{0} positive processed (finding unique word)".format(c)
    f.close()

20000 negative processed (finding unique word)
40000 negative processed (finding unique word)
60000 negative processed (finding unique word)
80000 negative processed (finding unique word)
20000 positive processed (finding unique word)
40000 positive processed (finding unique word)
60000 positive processed (finding unique word)
80000 positive processed (finding unique word)


### Vectorize documents into same vector shape
Convert each document to vector with same shape ([num_unique]) and store with their lables (HT or not_HT)

In [38]:
all_data = []
num_unique = len(unique_words)

def add_vector_to_data(vector, words, num_unique, label):
    tmp = np.zeros(num_unique)
    for i, word in enumerate(words):
        try:
            index = unique_words.index(word.encode())
            tmp[index] = vector[i]
        except:
            pass
    all_data.append([sparse.csr_matrix(tmp), label])

with open("cp1_negative_train_UPDATED.json", 'r') as f:
    c = 0
    for line in f:
        if c < count:
            extracted_text = json.loads(line)['extracted_text']
            vector, words = get_vector_words(extracted_text)
            add_vector_to_data(vector, words, num_unique, 'not_ht')
            c += 1
            if c%20000 == 0:
                print "{0} negative processed (creating vector)".format(c)
    f.close()

with open("CP1_train_ads.json", 'r') as f:
    c = 0
    for line in f:
        if c < count:
            extracted_text = json.loads(line)['extracted_text']
            vector, words = get_vector_words(extracted_text)
            add_vector_to_data(vector, words, num_unique, 'ht')
            c += 1
            if c%20000 == 0:
                print "{0} postitve processed (creating vector)".format(c)
    f.close()

20000 negative processed (creating vector)
40000 negative processed (creating vector)
60000 negative processed (creating vector)
80000 negative processed (creating vector)
20000 postitve processed (creating vector)
40000 postitve processed (creating vector)
60000 postitve processed (creating vector)
80000 postitve processed (creating vector)


### Remove unnecessary unique words and shuffle vectors

In [39]:
del unique_words
shuffle(all_data)

### Separate vectors and labales

In [40]:
vectors = []
labels = []
for each in all_data:
    vectors.append(each[0])
    labels.append(each[1])
del all_data

### Separate training and test data in ratio of 80 to 20

In [41]:
len_labels = len(labels)
len_vectors = len(vectors)
if len_labels == len_vectors:
    train_num = int((80*len_labels)/100)
    test_num = len_labels - train_num
train_data = vectors[:train_num]
test_data = vectors[-1*(test_num):]
train_labels = labels[:train_num]
test_labels = labels[-1*(test_num):]

### Stack all training and test vectors together

In [42]:
train_vectors = vstack(train_data)
test_vectors = vstack(test_data)

### Train model using either SVM or NB
You can switch between SVC() or MultinomialNB()

In [43]:
clf = MultinomialNB().fit(train_vectors, train_labels)

In [44]:
predicted = clf.predict(test_vectors)

### Calculate confusion table

In [45]:
confusion = confusion_matrix(test_labels, predicted)
print confusion

[[15810  2199]
 [ 1847 16078]]


### Calculate score

In [46]:
score = f1_score(test_labels, predicted, pos_label="ht")
print score

0.886558627264


### Store trained model into python pickel for later use.

In [47]:
pickle.dump( clf, open( "ht_clf.p", "wb" ) )