In [69]:
import os
import re 
import numpy as np
from string import punctuation
import pickle
import math
import random
import time

In [70]:
'''
    @ogil7190:
     - We read files from a folder (assuming located in same directory as this is in)
         - file structure 
             text_data:
                 /: athism
                 /: electronics
                 .
                 .
     - We split files into test and train 
     - we calculate word list based on train_data
         - during training we move to each file, first clean headers ( logic for header removing is that Word 'Lines:' exists in almost every file, we remove everything on till that line)
         - get word list for that file using list_word_from_file()
         - compare against stop words 
         - store words
    - we need to calculate prior probabilites also, for that, we need :: word, how many times encountered in a folder,  how many total words in from that folder and count of files scanned in that folder
    - use laplace correction for smoothing
    - calculate probabilities for each word given a class
    - use predict() to predict on a file
    - use predict_bulk()  to predict test_data we prepared
    - use print_word_list() helper to print word_list to see how data is coming.
    
    structure of word list :
    
    { word : { class : frequencies, ... , prob : { class : probabililty}}
    each word contain frequencies against each class, and probability p (word | class) :: p of word given a class
    
    prior list keep count of files scanned and total words of each class
'''

"\n    @ogil7190:\n     - We read files from a folder (assuming located in same directory as this is in)\n         - file structure \n             text_data:\n                 /: athism\n                 /: electronics\n                 .\n                 .\n     - We split files into test and train \n     - we calculate word list based on train_data\n         - during training we move to each file, first clean headers ( logic for header removing is that Word 'Lines:' exists in almost every file, we remove everything on till that line)\n         - get word list for that file using list_word_from_file()\n         - compare against stop words \n         - store words\n    - we need to calculate prior probabilites also, for that, we need :: word, how many times encountered in a folder,  how many total words in from that folder and count of files scanned in that folder\n    - use laplace correction for smoothing\n    - calculate probabilities for each word given a class\n    - use predict

In [71]:
DATA_FOLDER = "/20_newsgroup/" #relative path of folder newsgroup
LIMIT = -1 # how many files to read from one folder, -1 is for all
ALPHA = 0.001 # Laplace correction alpha for smoothing
TEST_SIZE_PERCENT = 0.1 # how much to scale test data size 20%, 30 % etc.
PRUNE_TOP_WORD_COUNT = 25 # Remove top keys as these keys will remove variance among data while taking porbabilites
stop_words = ["article", "writes", "a", "about", "above", "across", "after", "afterwards","again", "all", "almost", "alone", "along", "already", "also","although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "as", "at", "be", "became", "because", "become","becomes", "becoming", "been", "before", "behind", "being", "beside", "besides", "between", "beyond", "both", "but", "by","can", "cannot", "cant", "could", "couldnt", "de", "describe", "do", "done", "dont", "don't", "each", "eg", "either", "else", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "find","for","found", "four", "from", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", "in", "indeed", "is", "it", "its", "itself", "keep", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mine", "more", "moreover", "most", "mostly", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next","no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part","perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "she", "should","since", "sincere","so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "take","than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they","this", "those", "though", "through", "throughout","thru", "thus", "to", "together", "too", "toward", "towards","under", "until", "up", "upon", "us","very", "was", "we", "well", "were", "what", "whatever", "when","whence", "whenever", "where", "whereafter", "whereas", "whereby","wherein", "whereupon", "wherever", "whether", "which", "while", "who", "whoever", "whom", "whose", "why", "will", "with","within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"]
#stop_words.append(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '<', '>', '', 'article', 'writes']) 

In [72]:
def get_files(path):
    files = list()
    file_list = os.listdir(os.getcwd()+path)
    for filename in file_list:
        files.append(filename)
    return files

In [73]:
def clean(s):
    return ''.join( c for c in s if c not in punctuation)

In [74]:
def list_words(files, FOLDER_LOC):
    my_word_list = {}
    prior = {}
    count = 0
    files_count = 0
    for f in files.keys():
        try:
            prior[f]
        except:
            prior[f] = {}
        for k in files[f]:
            files_count += 1
            try:
                prior[f]['files_count'] += 1
            except:
                prior[f]['files_count'] = 1
            file_path = FOLDER_LOC + f + "/" + k
            my_words = list_word_from_file(file_path)
            for w in my_words:
                try:
                    prior[f]['words_count'] += 1
                except:
                    prior[f]['words_count'] = 1
                try:
                    val = my_word_list[w] 
                    val['total_count'] += 1
                    try:
                        val[f] += 1
                    except:
                        val[f] = 1
                except:
                    my_word_list[w] = {'total_count' : len(files.keys()) + 1 }
                    for x in files.keys():
                        my_word_list[w][x] = ALPHA
                    my_word_list[w][f] += 1
        count += 1
        print('Processed:', count, 'Folders out of:', len(files.keys()))
    prior['total_count'] = files_count
    return my_word_list, prior


def list_word_from_file(file_path):
    my_words = list()
    with open(file_path[1:], 'r', errors='ignore') as doc:
                pos = False
                for line in doc:
                    if pos:
                        break_line = line.lower().split()
                        for x in break_line:
                            w = clean(x) #clean a word, sometimes contain other symbols
                            if not re.search("\d", w) and len(w) > 2: # word contain any number, then drop it
                                if w not in stop_words: # if it is a stop word, drop it
                                    my_words.append(w)
                    try:
                        line.index('Lines:') # found end of header in email
                        pos = True
                    except:
                        print('',end='')
    return my_words

In [75]:
def save(my_word_list, name):
    with open(name, 'wb') as fp:
        pickle.dump(my_word_list, fp, protocol=pickle.HIGHEST_PROTOCOL)

def load(name):
    with open(name, 'rb') as fp:
        data = pickle.load(fp)
        return data

In [76]:
def print_word_list(word_list, limit = -1):
    for k in list(word_list.keys())[0:limit]:
        print(k, word_list[k])

In [77]:
def removeHFData(my_word_list):
    for i in range(PRUNE_TOP_WORD_COUNT):
        my_word_list = prune(my_word_list)
    return my_word_list

def prune(my_words):
    max_key = '-1'
    max_len = 0
    for i in my_words:
        if max_len < my_words[i]['total_count']:
            max_len = my_words[i]['total_count']
            max_key = i
    del my_words[max_key]
    return my_words

In [78]:
def split_data(SIZE, files, shuffle = True):
    train = {}
    test = {}
    for k in files:
        file_list = files[k]
        if shuffle:
            random.shuffle(file_list)
        test_size = math.ceil(SIZE * len(file_list))
        train_size = len(file_list) - test_size
        train[k] = file_list[0: train_size]
        test[k] = file_list[train_size : train_size + test_size]
    return train, test

In [79]:
def read_data(FOLDER_LOC, FILES_LIMIT = 10):
    folders = get_files(FOLDER_LOC)
    files = {}
    for i in folders:
        if not i.startswith('.'): # not a file
            files[i] = get_files(FOLDER_LOC + i)[0:FILES_LIMIT]
    return files

In [80]:
def fit(files, FOLDER_LOC):
    start_time = time.time()
    words = list_words(files, FOLDER_LOC)
    finish_time = time.time()
    print('Time Taken:', finish_time - start_time, 'seconds')
    return words

In [81]:
def populate_prob(words_list, prior): #calculate prior probability for each word against each class
    for w in words_list:
        for k in list(words_list[w].keys()):
            if k == 'total_count':
                continue
            prob = words_list[w][k] / ( prior[k]['words_count'] + len(words_list))
            try:
                words_list[w]['prob'][k] = prob
            except:
                words_list[w]['prob'] = { k : prob }
    return words_list

In [82]:
def predict(data, prior, file, FILE_LOC):
    words = list_word_from_file(FILE_LOC + '/' + file)
    prob = dict()
    for w in words:
        try:
            word_data = data[w]
            for k in word_data:
                if k == 'total_count':
                    continue
                try:
                    prob[k] += np.log(word_data['prob'][k])
                except:
                    prob[k] = np.log(prior[k]['files_count'] / (ALPHA * prior['total_count'])) + np.log(word_data['prob'][k])
        except:
            print('', end = '')
    return prob

def predict_bulk(my_test_data, words_list, print_log = True): # return accuracy only
    acc = 0
    tot = 0
    for k in my_test_data:
        tot += 1
        count = 0
        for i in range(len(my_test_data[k])):
            max_val = - float("inf")
            max_i = ''
            ans = predict(words_list, prior, my_test_data[k][i], DATA_FOLDER + k)
            for x in ans:
                if max_val < ans[x]:
                    max_val = ans[x]
                    max_i = x
            if k == max_i:
                count += 1
        if print_log:
            print('Correctly Predicted:', count, 'Total:', len(my_test_data[k]), 'Class:', k)
        acc += count / len(my_test_data[k])
    acc = (acc / tot) * 100
    if print_log:
        print('Accuracy:', acc,'%')
    return acc

In [83]:
my_files_list = read_data(DATA_FOLDER, LIMIT) #list of files against folder, type : dictionary
train_data, test_data = split_data(TEST_SIZE_PERCENT, my_files_list) # split data into train and testing, type : dictionary
my_words_list, prior = fit(train_data, DATA_FOLDER) # fit on training data, type : dictionary
my_words_list = populate_prob(my_words_list, prior) # calculate prior probabilites, type : dictionary

Processed: 1 Folders out of: 20
Processed: 2 Folders out of: 20
Processed: 3 Folders out of: 20
Processed: 4 Folders out of: 20
Processed: 5 Folders out of: 20
Processed: 6 Folders out of: 20
Processed: 7 Folders out of: 20
Processed: 8 Folders out of: 20
Processed: 9 Folders out of: 20
Processed: 10 Folders out of: 20
Processed: 11 Folders out of: 20
Processed: 12 Folders out of: 20
Processed: 13 Folders out of: 20
Processed: 14 Folders out of: 20
Processed: 15 Folders out of: 20
Processed: 16 Folders out of: 20
Processed: 17 Folders out of: 20
Processed: 18 Folders out of: 20
Processed: 19 Folders out of: 20
Processed: 20 Folders out of: 20
Time Taken: 15.47261905670166 seconds


In [84]:
#my_words_list = removeHFData(my_words_list) # use it to remove highly frequent data from list mostly un-useful words
save(my_words_list, 'MyWordList.p')

In [85]:
predict_bulk(test_data, my_words_list)

Correctly Predicted: 9 Total: 10 Class: talk.politics.mideast
Correctly Predicted: 5 Total: 10 Class: rec.autos
Correctly Predicted: 7 Total: 10 Class: comp.sys.mac.hardware
Correctly Predicted: 6 Total: 10 Class: alt.atheism
Correctly Predicted: 6 Total: 10 Class: rec.sport.baseball
Correctly Predicted: 4 Total: 10 Class: comp.os.ms-windows.misc
Correctly Predicted: 9 Total: 10 Class: rec.sport.hockey
Correctly Predicted: 8 Total: 10 Class: sci.crypt
Correctly Predicted: 9 Total: 10 Class: sci.med
Correctly Predicted: 9 Total: 10 Class: talk.politics.misc
Correctly Predicted: 10 Total: 10 Class: rec.motorcycles
Correctly Predicted: 5 Total: 10 Class: comp.windows.x
Correctly Predicted: 7 Total: 10 Class: comp.graphics
Correctly Predicted: 3 Total: 10 Class: comp.sys.ibm.pc.hardware
Correctly Predicted: 4 Total: 10 Class: sci.electronics
Correctly Predicted: 6 Total: 10 Class: talk.politics.guns
Correctly Predicted: 8 Total: 10 Class: sci.space
Correctly Predicted: 8 Total: 10 Class: s

65.5