In [None]:
from bs4 import BeautifulSoup
import os
import csv

In [None]:
REUTERS_SOURCE_DIR = '../corpus/reuters21578/'
REUTERS_CORPUS_OUTPUT = '../corpus/reuters_corpus.csv'

In [None]:
def preprocess_reuters_corpus():
    """Convert the Reuters SGML files into a single CSV corpus.

    Every ``.sgm`` file in ``REUTERS_SOURCE_DIR`` is parsed in sorted filename
    order. Each ``reuters`` element found becomes one row of
    ``REUTERS_CORPUS_OUTPUT`` with the columns, in this order::

        doc_id, title, content, topics

    * ``doc_id``  -- the element's ``newid`` attribute.
    * ``title``   -- text of the ``title`` tag inside ``text`` ('' if absent).
    * ``content`` -- stripped text of the ``body`` tag inside ``text``
      ('' if absent).
    * ``topics``  -- list with the text of every ``d`` tag under ``topics``;
      the csv writer stores it using the list's string form.

    Returns:
        None. The function's only effect is writing the CSV file.
    """

    def _tag_text(tag):
        """Return the text held by ``tag``, or '' when the tag is missing."""
        if tag is None:
            return ''
        text = tag.string
        # ``.string`` is None when the tag has more than one child; fall back
        # to the concatenated text so the later ``.strip()`` never sees None.
        return text if text is not None else tag.get_text()

    target_files = sorted(f for f in os.listdir(REUTERS_SOURCE_DIR) if f.endswith('.sgm'))

    # Extract information: one [doc_id, title, content, topics] row per article.
    rows = []
    for target_file in target_files:
        source_path = os.path.join(REUTERS_SOURCE_DIR, target_file)
        with open(source_path, 'r', encoding='ISO-8859-1') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        for e in soup.find_all('reuters'):
            doc_id = e.get('newid')

            text_tag = e.find('text')
            body_tag = text_tag.find('body') if text_tag is not None else None
            title_tag = text_tag.find('title') if text_tag is not None else None

            topics_tag = e.find('topics')
            if topics_tag is not None:
                topics = [d.string for d in topics_tag.find_all('d')]
            else:
                topics = []

            # Building the row here keeps the four fields paired with the right
            # column; the previous parallel lists were zipped in one order and
            # unpacked in another, which shifted title/content/topics.
            rows.append([doc_id, _tag_text(title_tag), _tag_text(body_tag).strip(), topics])

    # Write corpus
    # NOTE(review): the output is written with the platform default encoding
    # while the input is read as ISO-8859-1 -- confirm this matches what the
    # downstream readers expect.
    with open(REUTERS_CORPUS_OUTPUT, 'w', newline='') as o:
        csv.writer(o).writerows(rows)


In [1]:
from time import time

In [2]:
import numpy as np
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

from text_processing import process
from index_configuration import IndexConfiguration

In [3]:
# Load the course corpus. Each CSV row is read as (doc_id, title, content);
# the text kept for a document is its title and content joined by one space.
corpus = []
doc_id_list = []
# newline='' hands line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are parsed as a single field.
with open('../course_corpus_full.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing on row[0].
            continue
        doc_id = row[0]
        title = row[1]
        content = row[2]
        doc_id_list.append(doc_id)
        corpus.append(title + ' ' + content)

In [4]:
def preprocessor(s):
    """Run ``process`` on ``s`` and return its results joined by single spaces.

    A fresh ``IndexConfiguration(True, True, True)`` is built for every call.
    NOTE(review): the meaning of the three flags is defined in
    ``index_configuration`` and is not visible here -- confirm before changing.
    """
    config = IndexConfiguration(True, True, True)
    tokens = process(s, config=config)
    return ' '.join(tokens)

In [5]:
# Fit a two-step pipeline on the corpus: raw term counts, then tf-idf weights.
pipe = Pipeline([
    ('count', CountVectorizer(preprocessor=preprocessor)),
    ('tfidf', TfidfTransformer(use_idf=True, smooth_idf=True, sublinear_tf=True)),
]).fit(corpus)

# Vectorise the corpus once and reuse the counts for both statistics below,
# so the preprocessor is not run over every document a second time.
count_matrix = pipe['count'].transform(corpus)

# total term frequency over corpus
# Summing the sparse matrix column-wise avoids materialising a dense
# terms x documents list of lists just to add up each row.
tf_over_corpus = np.asarray(count_matrix.sum(axis=0)).ravel().tolist()

# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); support both and keep `terms` a plain list.
if hasattr(pipe['count'], 'get_feature_names_out'):
    terms = list(pipe['count'].get_feature_names_out())
else:
    terms = pipe['count'].get_feature_names()

# term -> total number of occurrences across the whole corpus
tf_over_corpus_dct = dict(zip(terms, tf_over_corpus))

# Same result as pipe.transform(corpus): the pipeline applies the count step
# and then the tf-idf step in sequence.
tf_idf_matrix = pipe['tfidf'].transform(count_matrix)

In [26]:


def construct_inverted_index(corpus_path: str):
    """Build an inverted index and tf-idf statistics for a CSV corpus.

    Each CSV row is read as ``doc_id, title, content`` (columns 0-2) and the
    indexed text of a document is ``title + ' ' + content``. Blank lines in
    the file are skipped.

    Args:
        corpus_path: Path of the CSV corpus file.

    Returns:
        A 4-tuple ``(inverted_index, tfidf_matrix, terms, tf_over_corpus_dct)``:

        * ``inverted_index`` -- dict mapping each term to the list of doc ids
          (in file order) whose tf-idf weight for that term is > 0.
        * ``tfidf_matrix`` -- sparse documents x terms tf-idf matrix produced
          by the fitted pipeline.
        * ``terms`` -- list of vocabulary terms in matrix column order.
        * ``tf_over_corpus_dct`` -- dict mapping each term to its total raw
          count over the whole corpus.
    """

    # Read corpus
    corpus = []
    doc_id_list = []
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain embedded newlines are parsed as a single field.
    with open(corpus_path, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for a blank line; nothing to index.
                continue
            doc_id_list.append(row[0])
            corpus.append(row[1] + ' ' + row[2])

    def preprocessor(s):
        return ' '.join(process(s, config=IndexConfiguration(True, True, True)))

    pipe = Pipeline([
        ('count', CountVectorizer(preprocessor=preprocessor)),
        ('tfidf', TfidfTransformer(use_idf=True, smooth_idf=True, sublinear_tf=True)),
    ]).fit(corpus)
    vectorizer = pipe['count']

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support both and return a plain list.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = list(vectorizer.get_feature_names_out())
    else:
        terms = vectorizer.get_feature_names()

    # Vectorise once; the counts feed both the totals and the tf-idf step.
    count_matrix = vectorizer.transform(corpus)

    # total term frequency over corpus (column sums of the sparse counts)
    tf_over_corpus = np.asarray(count_matrix.sum(axis=0)).ravel().tolist()
    tf_over_corpus_dct = dict(zip(terms, tf_over_corpus))

    # tf-idf matrix (equivalent to pipe.transform(corpus))
    tfidf_matrix = pipe['tfidf'].transform(count_matrix)

    # inverted index
    # In CSC form, column j's stored entries live at indptr[j]:indptr[j + 1],
    # so each term's postings can be read directly instead of scanning a dense
    # terms x documents list. tocsc() returns a new matrix, so sorting its
    # indices leaves the returned tfidf_matrix untouched.
    tfidf_csc = tfidf_matrix.tocsc()
    tfidf_csc.sort_indices()  # row indices ascending -> postings in file order
    indptr, indices, data = tfidf_csc.indptr, tfidf_csc.indices, tfidf_csc.data

    inverted_index = {}
    for col, term in enumerate(terms):
        start, end = indptr[col], indptr[col + 1]
        rows = indices[start:end]
        weights = data[start:end]
        # Keep the original "> 0" filter in case a stored entry is zero.
        inverted_index[term] = [doc_id_list[r] for r in rows[weights > 0]]

    return inverted_index, tfidf_matrix, terms, tf_over_corpus_dct

In [27]:
# Wall-clock timing of one full index build over the course corpus.
# `tmp` keeps the returned 4-tuple so it can be inspected in the next cell.
start = time()
tmp = construct_inverted_index('../course_corpus_full.csv')
elapsed = time() - start
print('Time elapsed: %s' % elapsed)

Time elapsed: 0.07427811622619629


In [28]:
# Fourth element of the tuple returned by construct_inverted_index:
# the dict mapping each term to its total count over the corpus.
tmp[3]

{'4106': 1,
 '4107': 1,
 'adversari': 1,
 'agent': 1,
 'approach': 1,
 'artifici': 4,
 'aspect': 1,
 'basic': 2,
 'browser': 1,
 'client': 1,
 'csi': 2,
 'deduct': 1,
 'element': 1,
 'engin': 1,
 'index': 1,
 'inform': 7,
 'intellig': 4,
 'internet': 1,
 'introduct': 2,
 'knowledg': 2,
 'languag': 1,
 'learn': 1,
 'linguist': 1,
 'machin': 1,
 'method': 1,
 'natur': 1,
 'plan': 1,
 'principl': 1,
 'process': 2,
 'program': 1,
 'queri': 1,
 'reason': 1,
 'relat': 1,
 'represent': 1,
 'retriev': 6,
 'root': 1,
 'scope': 1,
 'search': 4,
 'server': 2,
 'side': 1,
 'the': 1,
 'uncertainti': 1,
 'unit': 2,
 'web': 1,
 'wide': 1,
 'world': 1}

In [31]:
# Scratch check: log10 of 12/13 cast to half precision (np.half).
# NOTE(review): presumably probing how much precision a log-ratio weight keeps
# in float16 -- confirm intent or remove.
np.half(np.log10(np.int32(12)/np.int32(13)))

-0.03476

In [33]:
from scipy.sparse import csr_matrix

In [63]:
# Scratch 1x3 CSR matrix whose stored values are 32-bit integers.
t = csr_matrix([1, 2, 3]).astype(np.int32)

In [81]:
# Rebind t to a copy of the sparse matrix with its values cast to np.half.
t = t.astype(np.half)

In [82]:
# Show the matrix repr to check its dtype and number of stored elements.
t

<1x3 sparse matrix of type '<class 'numpy.float16'>'
	with 3 stored elements in Compressed Sparse Row format>