In [59]:
import re
import json
import numpy as np
import datetime
import time
from nltk.stem import PorterStemmer
from os import listdir

class Regs: 
    specialChars = '' 
    digits = '' 
    singleChars = ''
    multipleWhiteSpaces = ''
    stopWords = list()
stopWords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours","ers", "yourself", "yourselves", "he","isnt","cant" "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "wasnt", "were", "be", "been", "being", "have", "havent", "has", "had", "having", "do", "does", "doesnt", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
ps = PorterStemmer()

#Precompile regexes for better performance
def compileRegexes():
    regexes = Regs()
    regexes.specialChars = re.compile('[^\w\s]')
    regexes.digits = re.compile('\d')
    regexes.singleChars = re.compile('\s.\s')
    regexes.multipleWhiteSpaces = re.compile('[ ]{2,}')
    for sw in stopWords:
        exp = '\\b' + sw + '+\W'
        regexes.stopWords.append(re.compile(exp))
    return(regexes)

#Format the index into string with length 7
def formatIndex(index):
    i = str(index)
    while len(i) < 7:
        i = "0" + i
    return(i)

#Gather the texts from the source, based on starting index, amount of article and class
#=> returns array of texts and array of classes
def gatherTexts(startingIndex, amountOfArticles, includeClasses):
    if len(includeClasses) < 1:
        print("There must be at least 1 class. eg: [0,1,2]")
    
    #Create two arrays, which will be returned from this function
    texts = []
    types = []
    
    #Get data for class 0 => in our project it is FINANCE
    if 0 in includeClasses:
        articles = listdir("DATA/Finance")
        nArticles = len(articles)
        #check if there are enough articles for the input paramteres
        if (nArticles > startingIndex + amountOfArticles) :
            for ind in range(startingIndex, startingIndex + amountOfArticles):
                with open("DATA/Finance/news_" + formatIndex(ind) + ".json", encoding="utf8") as json_data:
                    texts.append(json.load(json_data)["text"])
                    types.append("0")   

    #Get data for class 1 => in our project it is SPORT
    if 1 in includeClasses:
        articles = listdir("DATA/Sports")
        nArticles = len(articles)
        #check if there are enough articles for the input paramteres
        if (nArticles > startingIndex + amountOfArticles) :
            for ind in range(startingIndex, startingIndex + amountOfArticles):
                with open("DATA/Sports/news_" + formatIndex(ind) + ".json", encoding="utf8") as json_data:
                    texts.append(json.load(json_data)["text"])
                    types.append("1")   
                    
    #Get data for class 2 => in our project it is TECHNOLOGY
    if 2 in includeClasses:
        articles = listdir("DATA/Technology")
        nArticles = len(articles)
        #check if there are enough articles for the input paramteres
        if (nArticles > startingIndex + amountOfArticles) :
            for ind in range(startingIndex, startingIndex + amountOfArticles):
                with open("DATA/Technology/news_" + formatIndex(ind) + ".json", encoding="utf8") as json_data:
                    texts.append(json.load(json_data)["text"])
                    types.append("2")   
                    
    #Get data for class 3 => in our project it is ENTERTAINMENT
    if 3 in includeClasses:
        articles = listdir("DATA/Entertainment")
        nArticles = len(articles)
        #check if there are enough articles for the input paramteres
        if (nArticles > startingIndex + amountOfArticles) :
            for ind in range(startingIndex, startingIndex + amountOfArticles):
                with open("DATA/Entertainment/news_" + formatIndex(ind) + ".json", encoding="utf8") as json_data:
                    texts.append(json.load(json_data)["text"])
                    types.append("3") 
                    
    return {'texts':texts, 'types':types}

def preprocessTexts(texts,types, regexes, usePorterStemmer = 0, removeUnfrequent = 0, removeFrequent = 0):    
    articles = []
    for index, text in enumerate(texts, start=0):
        articles.append(preprocessArticle(text,regexes,usePorterStemmer))
        
    #create one array fromm all articles
    words = []
    words = [item for sublist in articles for item in sublist]
    #remove duplicate values from words list
    words = list(set(words))
    words = sorted(words)
    wordsDictionary = dict((v, i) for i, v in enumerate(words))
    
    articleWords = np.zeros((len(articles), len(words) + 1))
    for index, article in enumerate(articles, start=0):
        articleWords[index, 0 ] = types[index] 
        for j, word in enumerate(article, start=0):
            if word != '':
                articleWords[index, wordsDictionary[word] + 1] = 1   
                
    #Remove words with occurance = removeUnfrequent
    if removeUnfrequent >= 1:    
        indexes = []
        for word in wordsDictionary:
            if(sum(articleWords[:,wordsDictionary[word]]) > removeUnfrequent):
                indexes.append(wordsDictionary[word])
        ind = np.array(indexes)
        words = np.array(words)
        words = words[ind].tolist()
        articleWords = articleWords[:,np.insert(ind + 1, 0,0, axis=0)]
    
    #Remove words, which are the most frequent => TOP removeFrequent will be removed
    if removeFrequent > 0:
        usage = np.zeros(len(words)+1)
        for i in range(1, len(words)+1):
            usage[i] = sum(articleWords[:,i])
        for i in range(0, removeFrequent):
            words = np.delete(words,usage.tolist().index(max(usage)),0)
            articleWords = np.delete(articleWords, usage.tolist().index(max(usage)),1)
            usage = np.delete(usage,usage.tolist().index(max(usage)),0)
            
    return{'articleWords':articleWords, 'words':words}


#Create matrix containg all Phi values
#returns array(nClasses x amountOfWords)
def createPhiMatrix(articleWords, includeClasses):
    numberOfClasses = len(includeClasses)
    Y = np.zeros((numberOfClasses, articleWords.shape[1]))
    
    sumClasses = np.zeros((numberOfClasses))
    #number of rows clasified to first / second group
    for c, index in enumerate(includeClasses, start = 0):
        sumClasses[index] = float(len(articleWords[articleWords[:,0] == c,0]))

    #Laplace => alfa = 1
    alfa = 1

    #calcualting Phi_{y=0} and Phi_{y=1}
    for j in range(0, numberOfClasses):
        Y[j,0] = (sumClasses[j] + alfa) / float(articleWords.shape[0] + numberOfClasses * alfa)

    for j in range(1, articleWords.shape[1]):
        #calcualting Phi_{j|y=0} and Phi_{j|y=1}
        for k in range(0, numberOfClasses):
            Y[k,j] = (np.sum(articleWords[articleWords[:,0]==includeClasses[k],j]) + alfa) / (sumClasses[k] + numberOfClasses * alfa)
    
    return(Y)
    
#Preprocess the article with the precompiled regexes, and optional stemmer 
#=> returns the text splitted into array of words
def preprocessArticle(text, regexes, usePorterStemmer):
    text = text.lower()
    #new lines
    text = text.replace('\n', ' ')
    text = text.replace('_', ' ')
    #special characters
    text = re.sub(regexes.specialChars, ' ', text)
    #digits
    text = re.sub(regexes.digits, '', text)
    #stopwords
    for sw in regexes.stopWords:
        text = re.sub(sw , '', text)
    #single characters (ex donald j trump => donald trump)
    text = re.sub(regexes.singleChars, ' ', text)
    #multiple white spaces
    text = re.sub(regexes.multipleWhiteSpaces, ' ', text)
    if usePorterStemmer == 1:
        splitted = text.split()
        for index, word in enumerate(splitted, start=0):
            splitted[index] = ps.stem(word)
        return splitted
    return(text.split())


#Make prediction based on 
def predict(phiMatrix, text,words, regexes, usePorterStemmer):
    nClasses = phiMatrix.shape[0]
    Y = phiMatrix
    textPorcessed = preprocessArticle(text,regexes,usePorterStemmer)
    testArticleWords = np.zeros((1, len(words)))
    for k, word in enumerate(textPorcessed, start=0):
            if word in words:
                testArticleWords[0, words.index(word)] = 1

    P = np.zeros(nClasses)
    for j in range(0, nClasses):
        P[j] = np.log(Y[j,0]) + ((np.log(np.array(Y[j,1:]*testArticleWords)[testArticleWords == 1]))).sum() + ((np.log(np.array((1 - Y[j,1:])*(testArticleWords - 1)*-1)[testArticleWords == 0]))).sum()

    classPossibilities = np.zeros(4)
    for i in range(0,phiMatrix.shape[0]):
        classPossibilities[i] = 1 / (sum(np.exp(P-P[i])))
    
    return(np.argmax(classPossibilities))

print("Done")
print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M"))

Done
2018-11-24 09:32


In [4]:
rawTexts =  gatherTexts(1,100,np.array([0,1,2,3]))
regs = compileRegexes()
preprocessedTexts = preprocessTexts(rawTexts["texts"],rawTexts["types"],regs)
phi = createPhiMatrix(preprocessedTexts["articleWords"],np.array([0,1,2,3]))
with open("DATA/Sport/news_0002100.json", encoding="utf8") as json_data:
    #print(index)
    testtext = json.load(json_data)["text"]
    start = time.time()
    print(predict(phi,testtext,preprocessedTexts["words"],regs,0))
    end = time.time()
    print(end-start)

1
0.015621185302734375


In [78]:
preprocessedTexts["articleWords"][:,np.insert(np.array(np.sum(preprocessedTexts["articleWords"][preprocessedTexts["articleWords"][:,0]==0,1:], axis=0) != 0),0, False)][preprocessedTexts["articleWords"][:,0]==0,:]


#print(len(preprocessedTexts["articleWords"][1,:]))

array([[0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1.,
        0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
        1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0.,
        0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1.,
        1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 1.,
        0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1.,
        0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 0., 0.,
        0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1.,
        1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
        0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0.,
        1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1.,
        1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 

In [56]:
#print(preprocessedTexts["articleWords"])
rawTexts =  gatherTexts(1,50,np.array([0,1,2,3]))
#regs = compileRegexes()
preprocessedTexts = preprocessTexts(rawTexts["texts"],rawTexts["types"],regs)
#for i in np.array([0,1]):
    #print(np.mean(np.sum(preprocessedTexts["articleWords"][:,np.insert(np.array(np.sum(preprocessedTexts["articleWords"][preprocessedTexts["articleWords"][:,0]==i,1:], axis=0) != 0),0, False)][preprocessedTexts["articleWords"][:,0]==i,:], axis=0)))
    #print(np.sum(np.sum(preprocessedTexts["articleWords"][:,np.insert(np.array(np.sum(preprocessedTexts["articleWords"][preprocessedTexts["articleWords"][:,0]==i,1:], axis=0) != 0),0, False)][preprocessedTexts["articleWords"][:,0]==i,:], axis=0) == 1))
    #print(np.sum(np.sum(preprocessedTexts["articleWords"][:,np.insert(np.array(np.sum(preprocessedTexts["articleWords"][preprocessedTexts["articleWords"][:,0]==i,1:], axis=0) != 0),0, False)][preprocessedTexts["articleWords"][:,0]==i,:], axis=0) == 2))
    #print(np.mean(np.sum(preprocessedTexts["articleWords"][preprocessedTexts["articleWords"][:,0]==i,1:], axis=1)))

articleWords = preprocessedTexts["articleWords"]
#print(articleWords)

wordsOfGrous0 = np.sum(articleWords[articleWords[:,0] == 0,1:], axis=0)
wordsOfGrous1 = np.sum(articleWords[articleWords[:,0] == 1,1:], axis=0)

print(np.array((np.where( np.logical_and( wordsOfGrous0 > 0, wordsOfGrous1 > 0 ) ))).size)

1230


In [27]:
with open("DATA/Finance/news_0002010.json", encoding="utf8") as json_data:
    #print(index)
    testtext = json.load(json_data)["text"]
    start = time.time()
    print(predict(phi,testtext,preprocessedTexts["words"],regs,0))
    end = time.time()
    print(end-start)

NameError: name 'phi' is not defined

In [81]:
#print(preprocessedTexts["articleWords"])
rawTexts =  gatherTexts(1,100,np.array([1]))
regs = compileRegexes()
preprocessedTexts = preprocessTexts(rawTexts["texts"],rawTexts["types"],regs)
print(np.mean(np.sum(preprocessedTexts["articleWords"][preprocessedTexts["articleWords"][:,0]==1,1:], axis=0)))
print(np.mean(np.sum(preprocessedTexts["articleWords"][preprocessedTexts["articleWords"][:,0]==1,1:], axis=1)))

2.5194274028629855
98.56


In [57]:
def createPropertiesOfTrainingSet(startIndex, endIndex, usePorterStemmer = 0, removeUnfrequent = 0, removeFrequent = 0):
    #STEPS:
    #---1. Gather texts from all groups
    #---2. Preprocess them
    #---3. Create properties:
    #------- Average number of words in the texts for each group
    #------- Average texts where each word 
    
    print("Starting the method for creating properties...")
    #Needed variables:    
    regs = compileRegexes()
    
    #Get and preprocess texts 
    print("Starting the gathering of texts from " + str(startIndex) + " to " + str(endIndex) + "...")
    start = time.time()
    rawTexts =  gatherTexts(startIndex,endIndex,np.array([0,1,2,3]))
    end = time.time()
    print("Gathering finished in " + str(end-start) + " ms")
    print("Starting the preprocess...")
    start = time.time()
    preprocessedTexts = preprocessTexts(rawTexts["texts"],rawTexts["types"],regs, usePorterStemmer, removeUnfrequent, removeFrequent)
    end = time.time()
    print("Preprocessment finished in " + str(end-start) + " ms")
    
    print("Starting the properties calculation for each group...")
    start = time.time()
    averageAmountOfWords = np.zeros(4)
    averageUsageOfWords = np.zeros(4)
    amountOfRareWords1 = np.zeros(4)
    amountOfRareWords2 = np.zeros(4)
    totalAmountOfWords = np.zeros(4)
    
    articleWords = preprocessedTexts["articleWords"]
    
    for i in np.array([0,1,2,3]):        
        averageUsageOfWords[i] = np.mean(np.sum(articleWords[:,np.insert(np.array(np.sum(articleWords[articleWords[:,0]==i,1:], axis=0) != 0),0, False)][articleWords[:,0]==i,:], axis=0))
        averageAmountOfWords[i] = np.mean(np.sum(articleWords[articleWords[:,0]==i,1:], axis=1))
        amountOfRareWords1[i] = np.sum(np.sum(articleWords[:,np.insert(np.array(np.sum(articleWords[articleWords[:,0]==i,1:], axis=0) != 0),0, False)][articleWords[:,0]==i,:], axis=0) == 1)
        amountOfRareWords2[i] = np.sum(np.sum(articleWords[:,np.insert(np.array(np.sum(articleWords[articleWords[:,0]==i,1:], axis=0) != 0),0, False)][articleWords[:,0]==i,:], axis=0) == 2)
        totalAmountOfWords[i] = len(np.sum(articleWords[:,np.insert(np.array(np.sum(articleWords[articleWords[:,0]==i,1:], axis=0) != 0),0, False)][articleWords[:,0]==i,:], axis=0))

    end = time.time()
    print("Properties for each group finished in " + str(end-start) + " ms")
    
    print("Starting counting the common words trough the groups...")
    commonWords = np.zeros((4,4))
    start = time.time()
    for i in np.array([0,1,2,3]):
        for j in np.array([0,1,2,3]): 
            wordsOfGrous0 = np.sum(articleWords[articleWords[:,0] == i,1:], axis=0)
            wordsOfGrous1 = np.sum(articleWords[articleWords[:,0] == j,1:], axis=0)
            commonWords[i,j] = np.array((np.where( np.logical_and( wordsOfGrous0 > 0, wordsOfGrous1 > 0 ) ))).size
    end = time.time()
    print("Common words counting finished in " + str(end-start) + " ms")

    return{'averageAmountOfWords':averageAmountOfWords, 
           'averageUsageOfWords':averageUsageOfWords,
           'amountOfRareWords1':amountOfRareWords1,
           'amountOfRareWords1Percentage':amountOfRareWords1 / totalAmountOfWords,
           'amountOfRareWords2':amountOfRareWords2,
           'amountOfRareWords2Percentage':amountOfRareWords2 / totalAmountOfWords,
           'totalAmountOfWords':totalAmountOfWords,
           'commonWords':commonWords}

In [55]:
props = createPropertiesOfTrainingSet(1,50)
print(props)

Starting the method for creating properties...
Starting the gathering of texts from 1 to 50...
Gathering finished in 1.7856512069702148 ms
Starting the preprocess...
Preprocessment finished in 5.527027130126953 ms
Starting the properties calculation for each group...
Properties for each group finished in 0.2899503707885742 ms
Starting counting the common words trough the groups...
Common words counting finished in 0.16361737251281738 ms
{'averageAmountOfWords': array([192.08, 120.78, 176.78, 141.96]), 'averageUsageOfWords': array([2.01341719, 2.17074047, 1.870292  , 1.80610687]), 'amountOfRareWords1': array([3070., 1804., 3197., 2716.]), 'amountOfRareWords1Percentage': array([0.64360587, 0.64845435, 0.67647059, 0.69109415]), 'amountOfRareWords2': array([739., 374., 712., 553.]), 'amountOfRareWords2Percentage': array([0.15492662, 0.13443566, 0.15065595, 0.14071247]), 'totalAmountOfWords': array([4770., 2782., 4726., 3930.]), 'commonWords': array([[4770., 1230., 2095., 1464.],
       [12

In [60]:
props = createPropertiesOfTrainingSet(1,50,1)
print(props)

Starting the method for creating properties...
Starting the gathering of texts from 1 to 50...
Gathering finished in 0.19225478172302246 ms
Starting the preprocess...
Preprocessment finished in 5.4446494579315186 ms
Starting the properties calculation for each group...
Properties for each group finished in 0.17907261848449707 ms
Starting counting the common words trough the groups...
Common words counting finished in 0.1216425895690918 ms
{'averageAmountOfWords': array([175.06, 114.22, 162.86, 135.04]), 'averageUsageOfWords': array([2.52466109, 2.47873264, 2.31072645, 2.11528822]), 'amountOfRareWords1': array([2002., 1400., 2161., 2025.]), 'amountOfRareWords1Percentage': array([0.57744448, 0.60763889, 0.61322361, 0.6343985 ]), 'amountOfRareWords2': array([531., 314., 561., 482.]), 'amountOfRareWords2Percentage': array([0.15315835, 0.13628472, 0.1591941 , 0.15100251]), 'totalAmountOfWords': array([3467., 2304., 3524., 3192.]), 'commonWords': array([[3467., 1145., 1742., 1373.],
       [

In [None]:
props = createPropertiesOfTrainingSet(1,50,1,1)
print(props)

Starting the method for creating properties...
Starting the gathering of texts from 1 to 50...
Gathering finished in 0.15046310424804688 ms
Starting the preprocess...
