Import the necessary libraries

In [1]:

from xml.dom import minidom
import re
import numpy as np
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import math


Data Cleaning Functions

In [2]:

# Matches either a markup tag of the form ``<...>`` or a single apostrophe.
TAG_RE = re.compile(r'<[^>]+>|\'')


def remove_tags(text):
    """Return ``text`` with every ``TAG_RE`` match (tags and apostrophes) deleted."""
    stripped = re.sub(TAG_RE, '', text)
    return stripped

# Matches any single character that is not an ASCII letter, digit or underscore.
PUNC_TAG = re.compile(r'[^a-zA-Z0-9_]')


def remove_punctuation(text):
    """Return ``text`` with every ``PUNC_TAG`` match replaced by one space.

    Whitespace characters also match the pattern, so tabs and newlines
    become plain spaces as well.
    """
    cleaned = re.sub(PUNC_TAG, ' ', text)
    return cleaned

# Matches a run of one or more space characters (a space followed by any
# number of further spaces).
Spaces = re.compile(r'  *')


def remove_spaces(text):
    """Return ``text`` with each run of spaces collapsed to a single space."""
    collapsed = re.sub(Spaces, ' ', text)
    return collapsed

def remove_junk(string):
    """Run the full cleaning pipeline on ``string`` and lower-case the result.

    The steps are applied in this order: ``remove_tags``,
    ``remove_punctuation``, ``remove_spaces``, then ``str.lower``.
    """
    without_tags = remove_tags(string)
    without_punctuation = remove_punctuation(without_tags)
    single_spaced = remove_spaces(without_punctuation)
    return single_spaced.lower()


Load the stop words into a list

In [3]:

# Stop-word list source:
# https://github.com/Alir3z4/stop-words/blob/master/english.txt
# Other stop-word collections:
# https://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/
stopwords = []

# ``with`` closes the handle even if reading raises part-way through.
with open('english.txt', 'r') as file:
    for line in file:
        # A line may carry several whitespace-separated words. Extend the
        # list in place instead of rebuilding it with ``+`` on every line.
        stopwords.extend(line.split())


<h5>Word Stemming and Lemmatization Functions</h5>
<a href="https://gist.github.com/mmmayo13/07252b4eb27e5495b6032888b38e5333#file-text_data_preprocessing_5-py" target="_blank">more on this link</a>

In [4]:

# https://gist.github.com/mmmayo13/07252b4eb27e5495b6032888b38e5333#file-text_data_preprocessing_5-py
def stem_word(word):
    """Return the Lancaster stem of a single ``word``.

    A new ``LancasterStemmer`` is instantiated on every call.
    """
    return LancasterStemmer().stem(word)

def lemmatize_verb(word):
    """Return the WordNet lemma of ``word`` treated as a verb (``pos='v'``).

    A new ``WordNetLemmatizer`` is instantiated on every call.
    """
    return WordNetLemmatizer().lemmatize(word, pos='v')

def lemm(word):
    """Return the WordNet lemma of ``word`` using the lemmatizer's default
    part of speech.

    A new ``WordNetLemmatizer`` is instantiated on every call.
    """
    return WordNetLemmatizer().lemmatize(word)

def pot(word):
    """Return the Porter stem of a single ``word``.

    A new ``PorterStemmer`` is instantiated on every call.
    """
    return PorterStemmer().stem(word)

In [5]:
def build_wordlist(filelocation, stopwords, wordmap, index, MAX_ROWS = 200):
    """Extend ``wordmap`` with the lemmatized vocabulary of one XML file.

    Every ``<row>`` element's ``Body`` attribute is cleaned with
    ``remove_junk`` and split on single spaces. Tokens that are numeric or
    shorter than two characters are ignored; the rest are lemmatized (verb
    lemma first, then the default lemma). A lemma that is neither already in
    ``wordmap`` nor in ``stopwords`` is assigned the next free ``index``.

    Parameters
    ----------
    filelocation : path or file object accepted by ``minidom.parse``.
    stopwords : iterable of str
        Words to exclude. The check is applied to the lemma, not the raw token.
    wordmap : dict
        Maps word -> column index. It is mutated in place and also returned.
    index : int
        First index to hand out to a new word.
    MAX_ROWS : int
        Row counter at which processing stops. The loop breaks *before*
        handling the row that brings the counter to ``MAX_ROWS``, so at most
        ``MAX_ROWS - 1`` non-empty rows are processed (callers in this
        notebook pass ``MAX_ROWS + 1`` to compensate).

    Returns
    -------
    (dict, int)
        The updated ``wordmap`` and the next unused index.
    """
    # Hoisted out of the token loop: a set gives O(1) membership tests,
    # whereas the list passed in would be scanned for every token.
    stopword_set = set(stopwords)

    # parse the xml file and fetch each row
    rows = minidom.parse(filelocation).getElementsByTagName('row')
    count = 0

    for item in rows:
        # remove unnecessary things
        string = remove_junk(item.attributes['Body'].value)

        # rows with (almost) no text do not count towards MAX_ROWS
        if len(string) < 2:
            continue

        count += 1
        if count == MAX_ROWS:
            break

        for word in string.split(" "):
            # skip numbers and one-character tokens
            # https://docs.python.org/3/library/stdtypes.html#str.isnumeric
            if word.isnumeric() or len(word) <= 1:
                continue

            word = lemm(lemmatize_verb(word))

            if word not in wordmap and word not in stopword_set:
                wordmap[word] = index
                index += 1

    return wordmap, index

In [6]:
def vectormapping_train(filelocation, wordmap, MAX_ROWS = 200):
    """Build one term-count vector per non-empty row of a training XML file.

    Each ``<row>`` element's ``Body`` attribute is cleaned with
    ``remove_junk`` (the same pipeline ``build_wordlist`` uses) and split on
    single spaces. Tokens that are numeric, shorter than two characters, or
    present in the module-level ``stopwords`` list are skipped. The remaining
    tokens are lemmatized and, when the lemma is a key of ``wordmap``, the
    matching vector position is incremented.

    Parameters
    ----------
    filelocation : path or file object accepted by ``minidom.parse``.
    wordmap : dict
        Maps word -> column index; its length is the vector length.
    MAX_ROWS : int
        Row counter at which processing stops. The loop breaks before
        handling the row that brings the counter to ``MAX_ROWS``, so at most
        ``MAX_ROWS - 1`` vectors are returned.

    Returns
    -------
    list of list of int
        One count vector per processed row.

    Notes
    -----
    The stop-word filter is applied to the raw token here, whereas
    ``build_wordlist`` applies it to the lemma.
    """
    # NOTE: reads the module-level ``stopwords``; converted once to a set so
    # the per-token membership test does not scan the whole list.
    stopword_set = set(stopwords)

    count = 0
    mainvec = []

    rows = minidom.parse(filelocation).getElementsByTagName('row')

    for row in rows:
        # same cleaning steps as build_wordlist, via the shared helper
        string = remove_junk(row.attributes['Body'].value)

        # rows with (almost) no text do not count towards MAX_ROWS
        if len(string) < 2:
            continue

        count += 1
        if count == MAX_ROWS:
            break

        vector = [0] * len(wordmap)

        for w in string.split(" "):
            if w.isnumeric() or len(w) <= 1 or w in stopword_set:
                continue

            w = lemm(lemmatize_verb(w))

            # count occurrences of vocabulary words only
            position = wordmap.get(w)
            if position is not None:
                vector[position] += 1

        # append this row's vector to the result
        mainvec.append(vector)

    return mainvec

In [7]:
def vectormapping_test(filelocation, wordmap, MAX_ROWS = 200):
    """Build term-count vectors for a test XML file, tracking unknown words.

    Rows are cleaned and tokenized exactly as in training (``remove_junk``,
    split on single spaces, skip numeric / one-character / stop-word tokens,
    lemmatize). A lemma found in ``wordmap`` increments the row's count
    vector; any other lemma is recorded as out-of-vocabulary for that row.

    Parameters
    ----------
    filelocation : path or file object accepted by ``minidom.parse``.
    wordmap : dict
        Maps word -> column index; its length is the vector length.
    MAX_ROWS : int
        Row counter at which processing stops. The loop breaks before
        handling the row that brings the counter to ``MAX_ROWS``, so at most
        ``MAX_ROWS - 1`` rows are returned.

    Returns
    -------
    (mainvec, notinwordmap, extwordmap)
        ``mainvec[i]`` is the count vector of row ``i``;
        ``extwordmap[i]`` lists the distinct out-of-vocabulary lemmas of row
        ``i`` in the order ``np.unique`` returns them;
        ``notinwordmap[i]`` holds the occurrence count of each of those
        lemmas, aligned with ``extwordmap[i]``.
        The per-row lists generally have different lengths.
    """
    # NOTE: reads the module-level ``stopwords``; converted once to a set so
    # the per-token membership test does not scan the whole list.
    stopword_set = set(stopwords)

    count = 0
    mainvec = []
    notinwordmap = []
    extwordmap = []

    rows = minidom.parse(filelocation).getElementsByTagName('row')

    for row in rows:
        # same cleaning steps as build_wordlist, via the shared helper
        string = remove_junk(row.attributes['Body'].value)

        # rows with (almost) no text do not count towards MAX_ROWS
        if len(string) < 2:
            continue

        count += 1
        if count == MAX_ROWS:
            break

        vector = [0] * len(wordmap)
        unknown_lemmas = []

        for w in string.split(" "):
            if w.isnumeric() or len(w) <= 1 or w in stopword_set:
                continue

            w = lemm(lemmatize_verb(w))

            position = wordmap.get(w)
            if position is not None:
                vector[position] += 1
            else:
                unknown_lemmas.append(w)

        # https://stackoverflow.com/questions/12282232/how-do-i-count-unique-values-inside-a-list
        # One np.unique call yields both the distinct lemmas and their counts.
        distinct, occurrences = np.unique(unknown_lemmas, return_counts=True)

        mainvec.append(vector)
        notinwordmap.append(occurrences.tolist())
        extwordmap.append(distinct.tolist())

    return mainvec, notinwordmap, extwordmap

In [8]:
# if we need the total wordmap in a list....
def features_name(wordmap):
    """Invert ``wordmap`` into a list where position ``i`` holds the word
    mapped to index ``i``.

    Parameters
    ----------
    wordmap : dict
        Maps word -> integer index.

    Returns
    -------
    list
        A list of ``len(wordmap)`` entries. A position that no word maps to
        keeps the placeholder ``0``; indices outside ``0..len(wordmap)-1``
        are ignored.
    """
    size = len(wordmap)
    features = [0] * size

    # Single pass over the mapping instead of scanning every item for every
    # position (which was quadratic in the vocabulary size).
    for word, position in wordmap.items():
        if 0 <= position < size:
            features[position] = word

    return features

# Wordmap

In [9]:
# Every whitespace-separated token in topics.txt is taken as a topic name;
# each topic is expected to have a matching ./Dataset/Training/<name>.xml.
names = []

# ``with`` makes sure the handle is closed (it was previously left open).
with open('./Dataset/topics.txt', 'r') as filenames:
    for line in filenames:
        names.extend(line.split())

print(names)

wordmap = {}
index = 0
MAX_ROWS = 80

# build the total wordMap of all the files and rows
# (MAX_ROWS + 1 because build_wordlist stops one row short of its limit)
for name in names:
    fileloc = './Dataset/Training/' + name + '.xml'
    wordmap, index = build_wordlist(fileloc, stopwords, wordmap, index, MAX_ROWS + 1)


print('wordmap generated')
print('total number of features:', len(wordmap))

['Coffee', 'Cooking', 'Law', 'Space', 'Windows_Phone', 'Wood_Working']
wordmap generated
total number of features: 4129


## Vector Mapping for Training Data


In [10]:
train_rows = []

# create a list of vectors of all the training datasets
# (MAX_ROWS + 1 because vectormapping_train stops one row short of its limit)
for name in names:
    fileloc = './Dataset/Training/' + name + '.xml'
    print('>>', name, end='--')
    # extend in place rather than rebuilding the list with ``+`` per file
    train_rows.extend(vectormapping_train(fileloc, wordmap, MAX_ROWS + 1))
    print('Done')

total_vector = np.array(train_rows)

print('total number of features in wordMap:', len(total_vector[0]))
print('training vectors of total rows:', len(total_vector[:,0]))
features = features_name(wordmap)

# Column totals are the same for every printed line, so compute them once
# instead of re-summing the whole matrix inside the loop.
column_totals = total_vector.sum(axis=0)

print('\nMost Frequent Words:')
for i in np.argsort(-column_totals)[:10]:
    print(features[i], column_totals[i])
    

>> Coffee--Done
>> Cooking--Done
>> Law--Done
>> Space--Done
>> Windows_Phone--Done
>> Wood_Working--Done
total number of features in wordMap: 4129
training vectors of total rows: 480

Most Frequent Words:
coffee 260
phone 94
water 94
law 93
cut 84
time 80
brew 72
space 69
wood 64
roast 61


## TF-IDF vector conversion for Training Data

In [11]:
# Training
# tf(i, j)  = count of word j in row i / total vocabulary words in row i
# idf(j)    = ln(N / (1 + number of training rows containing word j))
idf_nu = len(total_vector)

# The IDF of a column does not depend on the row, so compute it once per
# column instead of once per (row, column) pair.
doc_freq = (total_vector > 0).sum(axis=0)
idf_train = np.array(
    [math.log(idf_nu / (1 + df)) for df in doc_freq], dtype=np.float64
)

# Rows with no vocabulary hits have a zero total; leave them as all-zero
# vectors instead of letting 0/0 fill them with NaN.
row_totals = total_vector.sum(axis=1, keepdims=True)
tf_idf_train = np.divide(
    total_vector,
    row_totals,
    out=np.zeros(total_vector.shape, dtype=np.float64),
    where=row_totals > 0,
)
tf_idf_train *= idf_train

Additional functions for prediction

In [12]:
# prediction
def prediction(vec, KNN=5):
    """Predict a class name from a vector of similarities to the training rows.

    The ``KNN`` largest entries of ``vec`` (flattened) are taken as the
    nearest neighbours. A neighbour's row index is mapped to a class through
    the module-level ``names`` and ``MAX_ROWS``: the first class ``k`` with
    ``index < MAX_ROWS * (k + 1)`` wins, i.e. the training rows are assumed to
    be stored in consecutive blocks of ``MAX_ROWS`` per class, in ``names``
    order. An index past the last block contributes no vote.

    Returns the class with the most votes. When two or more classes share the
    highest vote count, the class of the single nearest neighbour is returned
    instead.
    """
    nearest_rows = np.argsort(-vec, axis=None)[:KNN]

    neighbour_labels = []
    for row in nearest_rows:
        for class_pos, class_name in enumerate(names):
            # upper bound (exclusive) of this class's block of row indices
            if row < MAX_ROWS * (class_pos + 1):
                neighbour_labels.append(class_name)
                break

    labels, votes = np.unique(neighbour_labels, return_counts=True)
    winner = np.argmax(votes)

    # tie for first place -> fall back to the closest neighbour's class
    if np.count_nonzero(votes == votes[winner]) > 1:
        return neighbour_labels[0]

    return labels[winner]



def actual_result_from_id(names, total_numbers, TEST_MAX_ROWS):
    """Map a running test-row number to the class it belongs to.

    Test rows are taken to be laid out in consecutive blocks of
    ``TEST_MAX_ROWS`` per class, in ``names`` order. Returns the first name
    whose block upper bound exceeds ``total_numbers``, or ``None`` when
    ``total_numbers`` lies beyond the last block.
    """
    for block_number, class_name in enumerate(names, start=1):
        if total_numbers < block_number * TEST_MAX_ROWS:
            return class_name
    return None

### Let's try this on one test file

In [13]:

namet = 'Coffee'
fileloc = './Dataset/Test/' + namet + '.xml'
actual_result = namet
print('>>>>>>>>>>>>>>>>>>>', namet, '<<<<<<<<<<<<<<<<<<<')

TEST_MAX_ROWS = 5

# TEST_MAX_ROWS + 1 because vectormapping_test stops one row short of its limit
test_vectors, extvector, extwordmap = vectormapping_test(fileloc, wordmap, TEST_MAX_ROWS + 1)
test_vectors = np.array(test_vectors)
# The out-of-vocabulary lists differ in length from row to row. Recent NumPy
# releases refuse to build an array from such ragged input unless
# dtype=object is requested explicitly.
extvector = np.array(extvector, dtype=object)
extwordmap = np.array(extwordmap, dtype=object)

# Sum the columns once rather than on every printed line.
test_totals = test_vectors.sum(axis=0)
for i in np.argsort(-test_totals)[:10]:
    print(features[i], test_totals[i])


>>>>>>>>>>>>>>>>>>> Coffee <<<<<<<<<<<<<<<<<<<
coffee 14
grind 11
bean 7
brew 5
espresso 5
air 4
time 4
ground 3
dose 3
process 3


In [14]:

# test data
# tf uses the full document length: vocabulary hits plus out-of-vocabulary
# words. idf comes from the TRAINING matrix, as in the training cell.
idf_nu = len(total_vector)

# The IDF of a column does not depend on the test row, so compute it once.
doc_freq = (total_vector > 0).sum(axis=0)
idf_vocab = np.array(
    [math.log(idf_nu / (1 + df)) for df in doc_freq], dtype=np.float64
)

test_counts = np.asarray(test_vectors)
tf_idf_test = np.zeros(test_counts.shape, dtype=np.float64)

# not in wordlist: such a word occurs in no training row, so its idf is
# ln(N / (1 + 0)) = ln(N)
idf = math.log(idf_nu)
# tf_idf_ext[i] is the squared norm of row i's out-of-vocabulary tf-idf part
tf_idf_ext = np.zeros([len(extvector), 1], dtype=np.float64)

for i in range(len(test_counts)):
    unseen_counts = np.asarray(extvector[i], dtype=np.float64)
    doc_length = test_counts[i].sum() + unseen_counts.sum()

    # A row with no usable words would give 0/0; keep it as a zero vector.
    if doc_length == 0:
        continue

    tf_idf_test[i] = (test_counts[i] / doc_length) * idf_vocab
    tf_idf_ext[i] = np.sum(((unseen_counts / doc_length) * idf) ** 2)




In [15]:

accurate = 0
KNN = 5
print('KNN = ', KNN)

# Norm of every training row: independent of the test row, so computed once.
train_norms = np.sqrt((tf_idf_train ** 2).sum(axis=1))

for i in range(len(tf_idf_test)):
    # dot product of the test row with every training row
    numerators = tf_idf_train @ tf_idf_test[i]

    # The test-row norm includes the out-of-vocabulary part. The scalar is
    # pulled out explicitly because math.sqrt on a 1-element array is
    # deprecated in recent NumPy.
    ext_sq_norm = float(np.ravel(tf_idf_ext[i])[0])
    test_norm = math.sqrt(float(np.sum(tf_idf_test[i] ** 2)) + ext_sq_norm)

    # cosine similarity; rows with a zero (or undefined) norm get 0
    denominators = train_norms * test_norm
    resulting_vec = np.divide(
        numerators,
        denominators,
        out=np.zeros_like(numerators),
        where=denominators > 0,
    )

    predicted_result = prediction(resulting_vec, KNN)
    print(actual_result,"-->", predicted_result)
    if actual_result == predicted_result:
        accurate += 1

# Divide by the number of rows actually tested, not by TEST_MAX_ROWS: the
# file may hold fewer rows, and TEST_MAX_ROWS is reassigned later on.
tested_rows = len(tf_idf_test)
print('accuracy:', (accurate/tested_rows) * 100 if tested_rows else 0.0, '%')
# https://www.python-course.eu/python3_formatted_output.php

KNN =  5
Coffee --> Coffee
Coffee --> Coffee
Coffee --> Coffee
Coffee --> Coffee
Coffee --> Coffee
accuracy: 100.0 %


<h2><center>Cosine similarity for all the classes</center></h2>

In [16]:
TEST_MAX_ROWS = 20
test_vectors = []
extvector = []
extwordmap = []

for name in names:
    fileloc = './Dataset/Test/' + name + '.xml'
    print('>>>>>>>>>>>>>>>>>>>', name, '<<<<<<<<<<<<<<<<<')

    # Parse and vectorize each file once and unpack all three results,
    # instead of calling vectormapping_test three times per file.
    # (TEST_MAX_ROWS + 1 because the function stops one row short of its limit.)
    file_vectors, file_ext_counts, file_ext_words = vectormapping_test(
        fileloc, wordmap, TEST_MAX_ROWS + 1
    )
    test_vectors += file_vectors
    extvector += file_ext_counts
    extwordmap += file_ext_words

test_vectors = np.array(test_vectors)
# The out-of-vocabulary lists differ in length from row to row. Recent NumPy
# releases refuse to build an array from such ragged input unless
# dtype=object is requested explicitly.
extvector = np.array(extvector, dtype=object)
extwordmap = np.array(extwordmap, dtype=object)

# Sum the columns once rather than on every printed line.
test_totals = test_vectors.sum(axis=0)

print('\nMost Frequent Words:')
for i in np.argsort(-test_totals)[:10]:
    print(features[i], test_totals[i])

>>>>>>>>>>>>>>>>>>> Coffee <<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>> Cooking <<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>> Law <<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>> Space <<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>> Windows_Phone <<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>> Wood_Working <<<<<<<<<<<<<<<<<

Most Frequent Words:
coffee 70
brew 43
time 38
wood 34
grind 31
water 28
temperature 23
phone 23
power 19
launch 18


In [17]:
# test data tf_idf vector creation
# tf uses the full document length: vocabulary hits plus out-of-vocabulary
# words. idf comes from the TRAINING matrix.
idf_nu = len(total_vector)

# The IDF of a column does not depend on the test row, so compute it once.
# Natural log is used here to match both the training tf-idf and the
# out-of-vocabulary term below. The previous log10 put the two parts of the
# test-vector norm on different scales. Because that only rescales each test
# row's similarities by a constant, neighbour ranking is unaffected.
doc_freq = (total_vector > 0).sum(axis=0)
idf_vocab = np.array(
    [math.log(idf_nu / (1 + df)) for df in doc_freq], dtype=np.float64
)

test_counts = np.asarray(test_vectors)
tf_idf_test = np.zeros(test_counts.shape, dtype=np.float64)

# not in wordlist: such a word occurs in no training row, so its idf is
# ln(N / (1 + 0)) = ln(N)
idf = math.log(idf_nu)
# tf_idf_ext[i] is the squared norm of row i's out-of-vocabulary tf-idf part
tf_idf_ext = np.zeros([len(extvector), 1], dtype=np.float64)

for i in range(len(test_counts)):
    unseen_counts = np.asarray(extvector[i], dtype=np.float64)
    doc_length = test_counts[i].sum() + unseen_counts.sum()

    # A row with no usable words would give 0/0; keep it as a zero vector.
    if doc_length == 0:
        continue

    tf_idf_test[i] = (test_counts[i] / doc_length) * idf_vocab
    tf_idf_ext[i] = np.sum(((unseen_counts / doc_length) * idf) ** 2)
    

In [18]:
KNNs = [1, 3, 5]

# The cosine similarities do not depend on K, so compute the whole
# (test rows x training rows) matrix once instead of once per K.
train_norms = np.sqrt((tf_idf_train ** 2).sum(axis=1))
# The test-row norm includes the out-of-vocabulary part stored in tf_idf_ext.
test_norms = np.sqrt(
    (tf_idf_test ** 2).sum(axis=1)
    + np.asarray(tf_idf_ext, dtype=np.float64).reshape(-1)
)
numerators = tf_idf_test @ tf_idf_train.T
denominators = np.outer(test_norms, train_norms)
# pairs where either norm is zero (or undefined) get similarity 0
similarities = np.divide(
    numerators,
    denominators,
    out=np.zeros_like(numerators),
    where=denominators > 0,
)

# ``with`` closes the report even if a prediction raises part-way through.
with open('KNN cosine similarity report.txt', 'w') as file:
    file.write('KNN \taccuracy\n')

    for KNN in KNNs:

        accurate = 0
        total_numbers = 0

        print('KNN = ', KNN)
        file.write(str(KNN))

        for i in range(len(similarities)):
            # the true class follows from the row's position: test rows are
            # stored in blocks of TEST_MAX_ROWS per class
            actual_result = actual_result_from_id(names, total_numbers, TEST_MAX_ROWS)
            predicted_result = prediction(similarities[i], KNN)

            total_numbers += 1
            if actual_result == predicted_result:
                accurate += 1

        # guard against an empty test set
        accuracy = round((accurate/total_numbers) * 100, 3) if total_numbers else 0.0
        print('-----------------------------------------------')
        print('accuracy:', accuracy, '%')
        print('-----------------------------------------------')
        file.write('\t'+ str(accuracy) + '\n')


KNN =  1
-----------------------------------------------
accuracy: 77.5 %
-----------------------------------------------
KNN =  3
-----------------------------------------------
accuracy: 79.167 %
-----------------------------------------------
KNN =  5
-----------------------------------------------
accuracy: 81.667 %
-----------------------------------------------
