In [1]:
import pandas as pd
import string
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import ray
import numpy as np
from os import cpu_count

In [2]:
import os, sys, inspect

# Make the parent of the notebook's working directory importable so the
# project-level `global_variable` module can be found.
# os.getcwd() replaces inspect.getfile(inspect.currentframe()): inside a notebook
# the current frame's "file" is a synthetic cell name (or a temporary file), so
# its directory is not a reliable stand-in for the notebook's folder. The other
# paths in this notebook ('../...') are already relative to the working directory.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
if sys.path[:1] != [parentdir]:  # re-running the cell must not stack duplicates
    sys.path.insert(0, parentdir)
from global_variable import REUTERS_CORPUS

In [4]:
from string import digits
_remove_digits = str.maketrans('', '', digits)

# Single shared Porter stemmer instance; `_process` uses it to stem every kept token.
STEMMER = PorterStemmer()

In [None]:
# Location of the corpus CSV: REUTERS_CORPUS (imported from global_variable),
# prefixed with '../'. This cell must run before the cell that reads `corpus_path`.
corpus_path = '../' + REUTERS_CORPUS

In [5]:
# Load the corpus CSV. The file's lines are read as data under the four column
# names given here, every value is kept as a plain string, empty fields are not
# converted to NaN, and no column is used as the index.
corpus_df = pd.read_csv(
    corpus_path,
    names=['doc_id', 'title', 'content', 'topics'],
    index_col=False,
    na_filter=False,
    dtype=str,
)

NameError: name 'corpus_path' is not defined

In [None]:
# Flatten the 'topics' strings: delete '[', ']' and single quotes, and turn each
# comma into a space, so topics end up whitespace-separated.
# (Presumably the column holds a stringified Python list — verify against the CSV.)
_topic_cleanup = str.maketrans({'[': None, ']': None, "'": None, ',': ' '})
corpus_df['topics'] = corpus_df['topics'].apply(lambda raw: raw.translate(_topic_cleanup))

In [None]:
# Split the corpus into one chunk per CPU core for parallel processing.
# os.cpu_count() may return None when the core count cannot be determined, so
# fall back to a single chunk in that case. Row positions are split (instead of
# passing the DataFrame itself to np.array_split) so that each chunk is built by
# plain positional indexing and is always a DataFrame, including empty chunks.
_n_chunks = cpu_count() or 1
chunks = [
    corpus_df.iloc[rows]
    for rows in np.array_split(np.arange(len(corpus_df)), _n_chunks)
]

In [None]:
# Maps every ASCII punctuation character to a space. Built once here instead of
# being rebuilt on every `_process` call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Cache for the stop-word set, filled on first use (see `_english_stopwords`).
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def _process(s):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: newlines and hyphens become spaces; periods and "'s" are
    removed; digits are deleted; the text is lower-cased and stripped; remaining
    punctuation becomes spaces; English stop words are dropped; every surviving
    token is stemmed with the shared Porter stemmer.

    Parameters
    ----------
    s : str
        Raw text to clean.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces ('' when no token survives).
    """
    s = s.replace('\n', ' ')
    s = s.replace('.', '')
    s = s.replace('-', ' ')
    s = s.replace("'s", '')
    s = s.translate(_remove_digits)
    s = s.lower().strip().translate(_PUNCT_TO_SPACE)
    # Look the stop words up once per call, as a set: the original re-read the
    # stop-word list and scanned it linearly for every single token.
    stop_words = _english_stopwords()
    return ' '.join(STEMMER.stem(token) for token in s.split() if token not in stop_words)

@ray.remote
def worker(df_chunk):
    """Ray task: clean one slice of the corpus and fuse its text columns.

    'content' and 'title' are each passed through `_process`. The cleaned values
    are then joined (title first, one space in between) into a new 'merged'
    column and the two source columns are dropped. The chunk is modified in
    place and returned.
    """
    for column in ('content', 'title'):
        df_chunk[column] = df_chunk[column].apply(_process)
    df_chunk['merged'] = df_chunk['title'] + ' ' + df_chunk['content']
    df_chunk.drop(['title', 'content'], axis=1, inplace=True)
    return df_chunk

In [None]:
# Fan the chunks out to Ray workers and gather the processed frames, in order, in `r`.
ray.init(num_cpus=cpu_count())
try:
    # Named `chunk_refs` rather than `input` so the builtin input() is not shadowed.
    chunk_refs = [ray.put(chunk) for chunk in chunks]
    r = ray.get([worker.remote(ref) for ref in chunk_refs])
finally:
    # Always release the Ray runtime, even when a task raises.
    ray.shutdown()

# corpus_df['content'] = corpus_df['content'].apply(lambda x: _process(x))
# corpus_df['title'] = corpus_df['title'].apply(lambda x: _process(x))

# corpus_df['merged'] = corpus_df['title'] + ' ' + corpus_df['content']
# corpus_df.drop(columns=['title', 'content'], inplace=True)

In [None]:
# Stitch the per-chunk frames returned by the Ray workers back into one frame.
corpus_df = pd.concat(r, axis=0)

In [None]:
# Filled by later cells: each doc_id is mapped to the list of its topic labels.
topics_dict = {} # k: doc_id, v: [topics]

In [None]:
# find out those have multiple topics
# Flag documents that carry two or more whitespace-separated topics.
corpus_df['has_multiple_topics'] = corpus_df['topics'].apply(lambda x: len(x.split()) >= 2)

# One plain record per (document, topic) pair of every multi-topic document.
tmp = []


def f(row):
    """Record a row's topics and expand a multi-topic row into per-topic records.

    Side effects: stores the row's topic list in `topics_dict` under its doc_id
    and, when the row is flagged as multi-topic, appends one record with the
    keys 'doc_id', 'topics' and 'merged' to `tmp` for each of its topics.
    """
    topics = row['topics'].split()
    topics_dict[row['doc_id']] = topics
    if row['has_multiple_topics']:
        for topic in topics:
            tmp.append({'doc_id': row['doc_id'], 'topics': topic, 'merged': row['merged']})


corpus_df.apply(f, axis=1)

# Remove the multi-topic rows with a boolean mask. Dropping by index label would
# also remove unrelated rows whenever index labels are duplicated.
corpus_df = corpus_df.loc[~corpus_df['has_multiple_topics']].drop(columns=['has_multiple_topics'])

# Build the expanded rows in a single DataFrame call. Unlike pd.concat on a list
# of one-row frames, this also works when there are no multi-topic documents.
df_ = pd.DataFrame(tmp, columns=['doc_id', 'topics', 'merged'])
# ignore_index gives the combined frame a fresh, duplicate-free index.
corpus_df = pd.concat([corpus_df, df_], ignore_index=True)

In [None]:
# Partition the corpus by whether a row has a non-empty 'topics' value: rows with
# a topic go to `training`, rows whose topic is '' go to `to_do`.
has_topic = corpus_df['topics'] != ''
training = corpus_df[has_topic]
to_do = corpus_df[~has_topic]

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Bag-of-words features: the vocabulary is fitted on the labelled documents only
# and then reused to encode the unlabelled ones.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(training['merged'])
X2 = vectorizer.transform(to_do['merged'])

# Labels as a positional array, aligned row-for-row with X.
y = training['topics'].to_numpy()

# k-nearest-neighbours model with the library's default settings.
neigh = KNeighborsClassifier().fit(X, y)

In [None]:
# For each unlabelled document, take the second element of the kneighbors result
# (the indices of its two nearest training rows) as columns 'topic1' and 'topic2'.
neighbor_idx = neigh.kneighbors(X2, n_neighbors=2)[1]
prediction_df = pd.DataFrame(neighbor_idx, columns=['topic1', 'topic2'], dtype=int)
prediction_df = prediction_df.reset_index(drop=True)

# Renumber the unlabelled rows 0..n-1 so both frames line up for the
# column-wise concatenation.
to_do = to_do.reset_index(drop=True)
to_do = pd.concat([prediction_df, to_do], axis=1)

In [None]:
def i2topic(i):
    """Return the entry at position `i` of the module-level label array `y`."""
    return y[i]

def f2(row):
    """Store the topics of a row's two neighbour indices in `topics_dict`.

    `row['topic1']` and `row['topic2']` are positions into the label array `y`
    and are resolved through `i2topic`. The result is stored under
    `row['doc_id']` as a list without duplicates.
    """
    neighbour_topics = [i2topic(row['topic1']), i2topic(row['topic2'])]
    # dict.fromkeys removes duplicates while keeping topic1 ahead of topic2.
    # list(set(...)) produced an arbitrary order that could change between runs.
    topics_dict[row['doc_id']] = list(dict.fromkeys(neighbour_topics))

# Run f2 on every row of `to_do`; f2 works through its side effect on topics_dict.
to_do.apply(f2, axis=1)

In [None]:
# Inverted index: each topic is mapped to the list of doc_ids tagged with it.
topic_index = {}


def update_topic_index(topic, doc_id):
    """Append `doc_id` to the list stored under `topic` in `topic_index`.

    The list is created on first use of a topic. The previous version wrote
    into `topics_dict` instead, leaving `topic_index` permanently empty.
    """
    topic_index.setdefault(topic, []).append(doc_id)

# Feed every (topic, doc_id) pair recorded in topics_dict to update_topic_index.
# The loop previously iterated over `topic_dict`, a name that is never defined.
for doc_id, topics in topics_dict.items():
    for topic in topics:
        update_topic_index(topic=topic, doc_id=doc_id)

In [None]:
import pickle  # imported here because no earlier cell imports pickle

# Persist `topic_index` with pickle.
# NOTE(review): this path climbs two levels ('../../corpus'), while the cell
# that writes topics.dict uses '../corpus' — confirm which directory is intended.
# The handle is not called `f`, so the row helper `f` is not overwritten.
with open('../../corpus/topic.idx', 'wb') as index_file:
    pickle.dump(topic_index, index_file)

In [None]:
# Display `topic_index` as this cell's output.
topic_index

In [None]:
import pickle

# Persist `topics_dict` with pickle.
path = '../corpus/topics.dict'
# The handle is not called `f`, so the row helper `f` is not overwritten.
with open(path, 'wb') as dict_file:
    pickle.dump(topics_dict, dict_file)

In [None]:
# with open(path, 'rb') as f:
#     dct = pickle.load(f)