In [85]:
import re
import json
import numpy as np
import datetime
from os import listdir

class Regs: 
    specialChars = '' 
    digits = '' 
    singleChars = ''
    multipleWhiteSpaces = ''
    stopWords = list()
stopWords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours","ers", "yourself", "yourselves", "he","isnt","cant" "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "wasnt", "were", "be", "been", "being", "have", "havent", "has", "had", "having", "do", "does", "doesnt", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

#Precompile regexes for better performance
def compileRegexes():
    regexes = Regs()
    regexes.specialChars = re.compile('[^\w\s]')
    regexes.digits = re.compile('\d')
    regexes.singleChars = re.compile('\s.\s')
    regexes.multipleWhiteSpaces = re.compile('[ ]{2,}')
    for sw in stopWords:
        exp = '\\b' + sw + '+\W'
        regexes.stopWords.append(re.compile(exp))
    return(regexes)

#Format the index into string with length 7
def formatIndex(index):
    i = str(index)
    while len(i) < 7:
        i = "0" + i
    return(i)

#Gather the texts from the source, based on starting index, amount of article and class
#=> returns array of texts and array of classes
def gatherTexts(startingIndex, amountOfArticles, includeClasses):
    if len(includeClasses) < 1:
        print("There must be at least 1 class. eg: [0,1,2]")
    
    #Create two arrays, which will be returned from this function
    texts = []
    types = []
    
    #Get data for class 0 => in our project it is FINANCE
    if 0 in includeClasses:
        articles = listdir("DATA/Finance")
        nArticles = len(articles)
        #check if there are enough articles for the input paramteres
        if (nArticles > startingIndex + amountOfArticles) :
            for ind in range(startingIndex, startingIndex + amountOfArticles):
                with open("DATA/Finance/news_" + formatIndex(ind) + ".json", encoding="utf8") as json_data:
                    texts.append(json.load(json_data)["text"])
                    types.append("0")   

    #Get data for class 1 => in our project it is SPORT
    if 0 in includeClasses:
        articles = listdir("DATA/Sports")
        nArticles = len(articles)
        #check if there are enough articles for the input paramteres
        if (nArticles > startingIndex + amountOfArticles) :
            for ind in range(startingIndex, startingIndex + amountOfArticles):
                with open("DATA/Sports/news_" + formatIndex(ind) + ".json", encoding="utf8") as json_data:
                    texts.append(json.load(json_data)["text"])
                    types.append("1")   
                    
    #Get data for class 2 => in our project it is TECHNOLOGY
    if 0 in includeClasses:
        articles = listdir("DATA/Technology")
        nArticles = len(articles)
        #check if there are enough articles for the input paramteres
        if (nArticles > startingIndex + amountOfArticles) :
            for ind in range(startingIndex, startingIndex + amountOfArticles):
                with open("DATA/Technology/news_" + formatIndex(ind) + ".json", encoding="utf8") as json_data:
                    texts.append(json.load(json_data)["text"])
                    types.append("2")   
                    
    #Get data for class 3 => in our project it is ENTERTAINMENT
    if 0 in includeClasses:
        articles = listdir("DATA/Entertainment")
        nArticles = len(articles)
        #check if there are enough articles for the input paramteres
        if (nArticles > startingIndex + amountOfArticles) :
            for ind in range(startingIndex, startingIndex + amountOfArticles):
                with open("DATA/Entertainment/news_" + formatIndex(ind) + ".json", encoding="utf8") as json_data:
                    texts.append(json.load(json_data)["text"])
                    types.append("3") 
                    
    return {'texts':texts, 'types':types}

def preprocessTexts(texts,types, regexes, usePorterStemmer = 0, removeUnfrequent = 0, removeFrequent = 0):    
    articles = []
    for index, text in enumerate(texts, start=0):
        articles.append(preprocessArticle(text,regexes,usePorterStemmer))
        
    #create one array fromm all articles
    words = [item for sublist in articles for item in sublist]
    #remove duplicate values from words list
    words = list(set(words))
    words = sorted(words)
    wordsDictionary = dict((v, i) for i, v in enumerate(words))
    
    articleWords = np.zeros((len(articles), len(words) + 1))
    for index, article in enumerate(articles, start=0):
        articleWords[index, 0 ] = types[index] 
        for j, word in enumerate(article, start=0):
            if word != '':
                articleWords[index, wordsDictionary[word] + 1] = 1   
                
    #Remove words with occurance = removeUnfrequent
    if removeUnfrequent >= 1:    
        indexes = []
        for word in wordsDictionary:
            if(sum(articleWords[:,wordsDictionary[word]]) > removeUnfrequent):
                indexes.append(wordsDictionary[word])
        ind = np.array(indexes)
        words = np.array(words)
        words = words[ind].tolist()
        articleWords = articleWords[:,np.insert(ind + 1, 0,0, axis=0)]
    
    #Remove words, which are the most frequent => TOP removeFrequent will be removed
    if removeFrequent > 0:
        usage = np.zeros(len(words)+1)
        for i in range(1, len(words)+1):
            usage[i] = sum(articleWords[:,i])
        for i in range(0, removeFrequent):
            words = np.delete(words,usage.tolist().index(max(usage)),0)
            articleWords = np.delete(articleWords, usage.tolist().index(max(usage)),1)
            usage = np.delete(usage,usage.tolist().index(max(usage)),0)
            
    return{'articleWords':articleWords, 'words':words}


#Create matrix containg all Phi values
#returns array(nClasses x amountOfWords)
def createPhiMatrix(articleWords, includeClasses):
    numberOfClasses = len(includeClasses)
    Y = np.zeros((numberOfClasses, articleWords.shape[1]))
    
    sumClasses = np.zeros((numberOfClasses))
    #number of rows clasified to first / second group
    for c, index in enumerate(includeClasses, start = 0):
        sumClasses[index] = float(len(articleWords[articleWords[:,0] == c,0]))

    #Laplace => alfa = 1
    alfa = 1

    #calcualting Phi_{y=0} and Phi_{y=1}
    for j in range(0, numberOfClasses):
        Y[j,0] = (sumClasses[j] + alfa) / float(articleWords.shape[0] + numberOfClasses * alfa)

    for j in range(1, articleWords.shape[1]):
        #calcualting Phi_{j|y=0} and Phi_{j|y=1}
        for k in range(0, numberOfClasses):
            Y[k,j] = (np.sum(articleWords[articleWords[:,0]==includeClasses[k],j]) + alfa) / (sumClasses[k] + numberOfClasses * alfa)
    
    return(Y)
    
#Preprocess the article with the precompiled regexes, and optional stemmer 
#=> returns the text splitted into array of words
def preprocessArticle(text, regexes, usePorterStemmer):
    text = text.lower()
    #new lines
    text = text.replace('\n', ' ')
    text = text.replace('_', ' ')
    #special characters
    text = re.sub(regexes.specialChars, ' ', text)
    #digits
    text = re.sub(regexes.digits, '', text)
    #stopwords
    for sw in regexes.stopWords:
        text = re.sub(sw , '', text)
    #single characters (ex donald j trump => donald trump)
    text = re.sub(regexes.singleChars, ' ', text)
    #multiple white spaces
    text = re.sub(regexes.multipleWhiteSpaces, ' ', text)
    if usePorterStemmer == 1:
        splitted = text.split()
        for index, word in enumerate(splitted, start=0):
            splitted[index] = ps.stem(word)
        return splitted
    return(text.split())


#Make prediction based on 
def predict(phiMatrix, text,words, regexes, usePorterStemmer):
    nClasses = phiMatrix.shape[0]
    Y = phiMatrix
    textPorcessed = preprocessArticle(text,regexes,usePorterStemmer)
    testArticleWords = np.zeros((1, len(words)))
    for k, word in enumerate(textPorcessed, start=0):
            if word in words:
                testArticleWords[0, words.index(word)] = 1

    P = np.zeros(nClasses)
    for j in range(0, nClasses):
        P[j] = np.log(Y[j,0]) + ((np.log(np.array(Y[j,1:]*testArticleWords)[testArticleWords == 1]))).sum() + ((np.log(np.array((1 - Y[j,1:])*(testArticleWords - 1)*-1)[testArticleWords == 0]))).sum()

    classPossibilities = np.zeros(4)
    for i in range(0,phiMatrix.shape[0]):
        classPossibilities[i] = 1 / (sum(np.exp(P-P[i])))
    
    return(np.argmax(classPossibilities))

print("Done")
print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"))

Done
2018-11-08 22:03


In [69]:
ind = str(5)

while len(ind) < 5:
    ind = "0" + ind
    
print(ind)

00005


In [5]:
 for i in range(1, 5):
    print(i)

1
2
3
4


In [83]:
result =  gatherTexts(1,10,np.array([0,1,2,3]))
regs = compileRegexes()
r2 = preprocessTexts(result["texts"],result["types"],regs)
phi = createPhiMatrix(r2["articleWords"],np.array([0,1,2,3]))

In [66]:
print(sum(phi[0,:]))

373.4642857142895


In [67]:
phi.shape[0]

4