In [18]:
import pickle
import string
from os import cpu_count

import nltk
import numpy as np
import pandas as pd
import ray
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

In [19]:
import os,sys,inspect
# Put the parent of the directory this code runs from at the front of the
# import search path, so modules living one level up can be imported.
_this_file = inspect.getfile(inspect.currentframe())
currentdir = os.path.dirname(os.path.abspath(_this_file))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
from global_variable import REUTERS_CORPUS

In [20]:
from string import digits
# Translation table for str.translate that deletes every ASCII digit
# (each digit's code point maps to None).
_remove_digits = {ord(digit): None for digit in digits}

# Single Porter stemmer instance reused for every token.
STEMMER = PorterStemmer()

In [21]:
# The corpus location is given relative to the parent directory, hence the '../' prefix.
corpus_path = '../{}'.format(REUTERS_CORPUS)

In [22]:
# Load the corpus: column names are supplied here, every column is read as a
# plain string, and empty fields stay '' instead of becoming NaN.
corpus_df = pd.read_csv(
    corpus_path,
    names=['doc_id', 'title', 'content', 'topics'],
    index_col=False,
    na_filter=False,
    dtype=str,
)

In [23]:
# Strip list punctuation from the topics column: brackets and single quotes
# are deleted, commas become spaces, leaving space-separated topic names.
_topic_cleanup = str.maketrans({'[': '', ']': '', "'": '', ',': ' '})
corpus_df['topics'] = corpus_df['topics'].apply(lambda raw: raw.translate(_topic_cleanup))

In [24]:
# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to a single chunk instead of letting the split fail.
n_chunks = cpu_count() or 1

# Split by row position and slice with .iloc, so each chunk is an ordinary
# DataFrame that keeps its original index labels. Chunk sizes follow
# np.array_split: as even as possible, larger chunks first.
chunks = [
    corpus_df.iloc[positions]
    for positions in np.array_split(np.arange(len(corpus_df)), n_chunks)
]

In [25]:
# Maps every ASCII punctuation character to a space; built once, not per call.
_PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once.

    The set is memoised on the function object, so the word list is fetched
    on the first call only and later membership tests are O(1).
    """
    cached = getattr(_english_stopwords, '_cached', None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words('english'))
        _english_stopwords._cached = cached
    return cached


def _process(s):
    """Normalise a raw text field into a string of stemmed, non-stop-word tokens.

    Steps, in order: newlines become spaces, '.' is removed, '-' becomes a
    space, "'s" is removed, digits are removed, the text is lower-cased and
    stripped, remaining punctuation becomes spaces, English stop words are
    dropped, and each surviving token is stemmed with ``STEMMER``.

    Parameters
    ----------
    s : str
        Raw text (a title or a document body).

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ('' if nothing survives).
    """
    s = s.replace('\n', ' ')
    s = s.replace('.', '')
    s = s.replace('-', ' ')
    s = s.replace("'s", '')
    s = s.translate(_remove_digits)
    s = s.lower().strip().translate(_PUNCTUATION_TO_SPACE)

    # The stop-word list used to be re-evaluated for every token and searched
    # as a list; fetch it once here and test membership against a set.
    stop_words = _english_stopwords()
    return ' '.join(STEMMER.stem(token) for token in s.split() if token not in stop_words)

@ray.remote
def worker(df_chunk):
    """Ray task that preprocesses one chunk of the corpus.

    Runs ``_process`` over the 'content' and 'title' columns, stores
    "<title> <content>" in a new 'merged' column, drops the two source
    columns in place and returns the same (modified) frame.
    """
    clean_content = df_chunk['content'].apply(_process)
    clean_title = df_chunk['title'].apply(_process)
    df_chunk['merged'] = clean_title + ' ' + clean_content
    df_chunk.drop(columns=['title', 'content'], inplace=True)
    return df_chunk

In [26]:
# Fan the chunks out to Ray workers and gather the processed frames in `r`
# (same order as `chunks`).
ray.init(num_cpus=cpu_count())
try:
    # Named `chunk_refs` rather than `input`, which would shadow the builtin
    # for every later cell in the session.
    chunk_refs = [ray.put(chunk) for chunk in chunks]
    r = ray.get([worker.remote(ref) for ref in chunk_refs])
finally:
    # Always release the Ray runtime, even when a task raises; otherwise a
    # re-run of this cell fails because Ray is still initialised.
    ray.shutdown()

2020-03-21 15:34:22,768	INFO resource_spec.py:212 -- Starting Ray with 2.64 GiB memory available for workers and up to 1.33 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-21 15:34:23,067	INFO services.py:1078 -- View the Ray dashboard at localhost:8265


In [27]:
# Stack the per-worker result frames back into one corpus (row-wise concat).
corpus_df = pd.concat(r)

In [28]:
# Maps doc_id -> list of topic names for that document.
topics_dict = dict()

In [29]:
# Expand multi-topic documents into one row per (doc_id, topic) and record
# every document's topic list in `topics_dict`.
# NOTE: this cell rebinds `corpus_df`; re-run it only after re-running the
# cell that builds `corpus_df` from the worker results.

# True for documents whose space-separated topic string has 2+ topics.
multi_topic_mask = corpus_df['topics'].apply(lambda x: len(x.split()) >= 2)

# One plain record per (doc_id, topic) pair of the multi-topic documents.
tmp = []


def f(row):
    """Register one document's topics and queue its per-topic rows.

    Stores the document's topic list in ``topics_dict`` under its doc_id.
    If the document has two or more topics, appends one record per topic
    (same doc_id and merged text) to ``tmp``.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide 'doc_id', 'topics' (space-separated string) and 'merged'.
    """
    topics = row['topics'].split()
    topics_dict[row['doc_id']] = topics
    if len(topics) >= 2:
        tmp.extend(
            {'doc_id': row['doc_id'], 'topics': topic, 'merged': row['merged']}
            for topic in topics
        )


# Iterate explicitly instead of DataFrame.apply: `f` has side effects, and
# pandas releases before 1.1 may call an apply callback twice on the first
# row, which would queue that document's rows twice.
for _row_label, row in corpus_df.iterrows():
    f(row)

# Keep the single-/no-topic rows, then append the expanded rows. Building the
# frame from records once also works when `tmp` is empty (pd.concat on an
# empty list raises), and ignore_index avoids duplicate index labels.
df_ = pd.DataFrame(tmp, columns=['doc_id', 'topics', 'merged'])
corpus_df = pd.concat([corpus_df[~multi_topic_mask], df_], ignore_index=True)

In [30]:
# Rows with a non-empty topic string are the labelled training set;
# rows with an empty topic string still need a topic assigned.
is_labelled = corpus_df['topics'] != ''
training = corpus_df[is_labelled]
to_do = corpus_df[~is_labelled]

In [31]:
# Bag-of-words counts for the labelled documents; `y` holds their topic
# labels in the same row order as `X`.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(training['merged'])
y = training['topics'].to_numpy()

# Nearest-neighbour model over the labelled documents
# (KNeighborsClassifier is imported in the notebook's import cell).
neigh = KNeighborsClassifier()
neigh.fit(X, y)

# Encode the unlabelled documents with the vocabulary learned above.
X2 = vectorizer.transform(to_do['merged'])

In [32]:
# For every unlabelled document, take the index of its single nearest
# labelled neighbour (second element of the kneighbors result). A
# two-neighbour variant would use n_neighbors=2 and columns ['topic1', 'topic2'].
nearest_rows = neigh.kneighbors(X2, n_neighbors=1)[1]
prediction_df = pd.DataFrame(nearest_rows, columns=['topic1'], dtype=int)

# Give `to_do` a fresh 0..n-1 index so the column-wise concat lines the
# predictions up with the documents by position.
to_do = pd.concat([prediction_df, to_do.reset_index(drop=True)], axis=1)

In [33]:
def i2topic(i):
    """Return the topic label stored at position ``i`` of the training label array ``y``."""
    label = y[i]
    return label

def f2(row):
    """Store the predicted topic of one unlabelled document in ``topics_dict``.

    ``row['topic1']`` is translated to a topic name via ``i2topic`` and saved
    as a one-element list under ``row['doc_id']``. (A two-neighbour variant
    would also translate ``row['topic2']`` and store the de-duplicated pair.)
    """
    predicted_topic = i2topic(row['topic1'])
    topics_dict[row['doc_id']] = [predicted_topic]

# Run f2 on every row; only its side effect on topics_dict matters.
to_do.apply(f2, axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
244    None
245    None
246    None
247    None
248    None
Length: 249, dtype: object

In [34]:
# Inverted index: topic name -> list of doc_ids carrying that topic.
topic_index = dict()


def update_topic_index(topic, doc_id):
    """Append ``doc_id`` to the list kept for ``topic``, creating the list on first use."""
    topic_index.setdefault(topic, []).append(doc_id)

# Invert topics_dict (doc_id -> topics) into topic_index (topic -> doc_ids).
for document_id, document_topics in topics_dict.items():
    for topic_name in document_topics:
        update_topic_index(topic=topic_name, doc_id=document_id)

In [35]:
# Persist the topic index as a pickle file.
with open('../../corpus/topic.idx', 'wb') as index_file:
    pickle.dump(topic_index, index_file)

In [36]:
# Show the finished topic -> doc_ids index as the cell output.
topic_index

{'crude': ['21001',
  '21002',
  '21006',
  '21008',
  '21009',
  '21010',
  '21011',
  '21013',
  '21018',
  '21020',
  '21021',
  '21025',
  '21026',
  '21033',
  '21035',
  '21048',
  '21052',
  '21054',
  '21056',
  '21058',
  '21062',
  '21064',
  '21066',
  '21067',
  '21068',
  '21069',
  '21072',
  '21074',
  '21076',
  '21079',
  '21084',
  '21085',
  '21086',
  '21087',
  '21088',
  '21089',
  '21090',
  '21092',
  '21094',
  '21098',
  '21099',
  '21104',
  '21108',
  '21110',
  '21111',
  '21118',
  '21119',
  '21121',
  '21125',
  '21127',
  '21131',
  '21137',
  '21139',
  '21144',
  '21149',
  '21154',
  '21155',
  '21156',
  '21160',
  '21161',
  '21162',
  '21170',
  '21171',
  '21174',
  '21180',
  '21183',
  '21185',
  '21197',
  '21198',
  '21204',
  '21213',
  '21216',
  '21217',
  '21223',
  '21225',
  '21226',
  '21231',
  '21235',
  '21239',
  '21246',
  '21250',
  '21258',
  '21263',
  '21267',
  '21268',
  '21274',
  '21276',
  '21279',
  '21280',
  '21284',
 