Import all needed libraries

In [1]:
import re
import nltk
import numpy as np
import math
import matplotlib.pyplot as plt

Tokenizing function:

This function takes a list of texts and tokenizes each one.

In [2]:
def tokenize(list_of_contexts):
    """Lower-case every document and split it into tokens.

    A token is either a number (digits, optionally with comma-separated groups
    and a decimal part, e.g. ``1,234.5``) or a run of word characters.

    Note: the input list is modified in place -- every entry is replaced by
    its lower-cased text.

    Args:
        list_of_contexts: mutable list of document strings.

    Returns:
        A list holding one list of tokens per document, in input order.
    """
    token_pattern = r'\d+(?:,\d+)*(?:\.\d+)?|\w+'
    tokens_per_doc = []
    for position, text in enumerate(list_of_contexts):
        # The caller's list is overwritten with the lower-cased document.
        list_of_contexts[position] = text.lower()
        tokens_per_doc.append(re.findall(token_pattern, list_of_contexts[position]))

    return tokens_per_doc

Filtering tokens:

This function preprocesses the tokens: it removes stopwords and single-letter tokens, then stems the remaining tokens.

In [3]:
def token_filtering(all_tokens):
    """Remove stop words and single-letter tokens, then Porter-stem the rest.

    A token is dropped when it is a single alphabetic character or when it is
    in the module-level ``stop_words`` set. Single-character tokens that are
    not alphabetic (e.g. a digit) are kept.

    Note: ``all_tokens`` is modified in place -- each inner list is replaced by
    its filtered and stemmed version -- and the same outer list is returned.

    Args:
        all_tokens: list of token lists, one per document.

    Returns:
        ``all_tokens`` itself, after filtering and stemming.
    """
    # Built once: constructing the stemmer does not depend on the document.
    stemmer = nltk.stem.PorterStemmer()
    for i, doc in enumerate(all_tokens):
        kept = [
            token for token in doc
            if not ((len(token) < 2 and token.isalpha()) or token in stop_words)
        ]
        all_tokens[i] = [stemmer.stem(token) for token in kept]

    return all_tokens

# Fetch the NLTK data packages used by this notebook.
nltk.download('stopwords')
nltk.download('punkt')  # NOTE(review): no punkt-based tokenizer is visible in this part of the notebook -- confirm it is still needed
# English stop words, stored as a set; token_filtering reads this module-level name.
stop_words = set(nltk.corpus.stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Create a mapping from terms to term IDs: each unique term is mapped to a unique integer (its index in the returned list).

In [4]:
def build_term_id(all_tokens):
    """Build the vocabulary: every distinct term, in first-seen order.

    The index of a term in the returned list serves as its term ID.

    Args:
        all_tokens: list of token lists, one per document. Tokens must be
            hashable (they are strings in this notebook).

    Returns:
        A list of unique terms ordered by first occurrence in the collection.
    """
    termID = []
    seen = set()   # O(1) membership test instead of rescanning the growing list
    for doc in all_tokens:
        for term in doc:
            if term not in seen:
                seen.add(term)
                termID.append(term)

    return termID

Create a 2D array (the term-document matrix) whose entry (i, j) is the TF weight of term i in document j.

In [5]:
def calculate_TF(tf, option, max_tf_d, max_tf_t):
    """Turn a raw term frequency into a TF weight.

    Args:
        tf: raw frequency of the term in the document.
        option: weighting scheme selector:
            1 -> binary: 1 if tf > 0 else 0
            2 -> n: the raw frequency
            3 -> l: 1 + log2(tf) for tf > 0, otherwise 0.1
            4 -> m: 0.4 + 0.6 * tf / max_tf_d
            anything else -> a: 0.5 + 0.5 * tf / max_tf_t
        max_tf_d: normaliser for option 4 (the caller passes the largest raw
            frequency found in the document).
        max_tf_t: normaliser for the ``a`` scheme (the caller passes the
            largest raw frequency of the term over all documents).

    Returns:
        The weight as a number. When the normaliser is 0 (an empty document or
        a term that never occurs) the ratio is taken as 0, so the schemes
        return their base value (0.4 / 0.5) instead of dividing by zero.
    """
    if option == 1:   # binary model {0, 1}
        return 1 if tf > 0 else 0
    if option == 2:   # n: TF_t,d
        return tf
    if option == 3:   # l: 1 + log2(TF)
        if tf > 0:
            return 1 + math.log2(tf)
        # NOTE(review): absent terms get 0.1 rather than 0 here; kept as-is --
        # confirm this is intentional.
        return 0.1
    if option == 4:   # m: normalised by max_tf_d
        ratio = tf / max_tf_d if max_tf_d else 0.0   # guard against a zero maximum
        return 0.4 + 0.6 * ratio
    # a: normalised by max_tf_t
    ratio = tf / max_tf_t if max_tf_t else 0.0       # guard against a zero maximum
    return 0.5 + 0.5 * ratio

In [6]:
def build_term_document_matrix(all_tokens, termID, option):
    """Build the weighted term-document matrix.

    Raw counts are accumulated first (rows follow ``termID``, columns follow
    ``all_tokens``), then every cell is converted to a weight with
    ``calculate_TF`` using the scheme selected by ``option``.

    Args:
        all_tokens: list of token lists, one per document.
        termID: list of vocabulary terms; a term's position is its row index.
            If a term appears more than once, its first position is used.
        option: weighting scheme selector forwarded to ``calculate_TF``.

    Returns:
        A float numpy array of shape ``(len(termID), len(all_tokens))``. Tokens
        that are not in ``termID`` are ignored. If either dimension is 0, the
        empty matrix is returned as-is.
    """
    # One dict lookup per token instead of a linear termID.index() scan.
    # setdefault keeps the first position of a repeated term, like list.index.
    row_of = {}
    for row, term in enumerate(termID):
        row_of.setdefault(term, row)

    counts = np.zeros((len(termID), len(all_tokens)))
    for col, doc in enumerate(all_tokens):
        for term in doc:
            row = row_of.get(term)
            if row is None:      # out-of-vocabulary token: skip it explicitly
                continue
            counts[row, col] += 1

    if counts.size == 0:         # no terms or no documents: nothing to weight
        return counts

    max_tf_per_term = counts.max(axis=1)   # largest count in each row (term)
    max_tf_per_doc = counts.max(axis=0)    # largest count in each column (document)

    weights = np.zeros_like(counts)
    for row in range(counts.shape[0]):
        for col in range(counts.shape[1]):
            weights[row, col] = calculate_TF(
                counts[row, col], option, max_tf_per_doc[col], max_tf_per_term[row]
            )

    return weights


Create an IDF array that stores the inverse document frequency (IDF) weight of each term.

In [7]:
# calculate idf based on given algorithm
def calculate_IDF(N, DF_t, option):
    """Compute the IDF weight of a term.

    Args:
        N: number of documents in the collection.
        DF_t: number of documents that contain the term.
        option: scheme selector:
            1 -> n: constant 1
            2 -> t: log2(N / DF_t)
            anything else -> p: max(0, log2((N - DF_t) / DF_t))

    Returns:
        The IDF weight. For the ``t`` and ``p`` schemes a term that occurs in
        no document (``DF_t == 0``) gets 0 instead of a ZeroDivisionError, and
        for ``p`` a term that occurs in every document (``N - DF_t <= 0``)
        gets 0 instead of a math domain error from ``log2(0)``.
    """
    if option == 1:  # n
        return 1
    if DF_t == 0:    # term never occurs: the ratios below are undefined
        return 0
    if option == 2:   # t
        return math.log2(N / DF_t)
    # p
    if N - DF_t <= 0:   # log2 is undefined for values <= 0; clamp to 0
        return 0
    return max(0, math.log2((N - DF_t) / DF_t))

In [None]:
def build_idf(all_tokens, term_ID, option):
    """Compute one IDF weight per vocabulary term.

    Args:
        all_tokens: list of token lists, one per document.
        term_ID: list of vocabulary terms; the result is aligned with it.
        option: IDF scheme selector forwarded to ``calculate_IDF``.

    Returns:
        A float numpy array of length ``len(term_ID)`` where entry ``i`` is
        ``calculate_IDF(number_of_documents, document_frequency, option)`` for
        ``term_ID[i]``. A term found in no document is passed on with a
        document frequency of 0.
    """
    # Count each term's document frequency in one pass over the collection.
    # set(doc) makes a term count at most once per document.
    doc_freq = {}
    for doc in all_tokens:
        for term in set(doc):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    n_docs = len(all_tokens)
    idf = np.zeros(len(term_ID))
    for i, term in enumerate(term_ID):
        idf[i] = calculate_IDF(n_docs, doc_freq.get(term, 0), option)

    return idf