In [1]:
import pandas as pd
import string
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import ray
import numpy as np
from os import cpu_count

In [2]:
from string import digits
_remove_digits = str.maketrans('', '', digits)

# Single PorterStemmer instance created once and used by _process to stem tokens.
STEMMER = PorterStemmer()

In [3]:
# Relative path of the CSV file that is loaded into corpus_df.
corpus_path = '../corpus/reuters_corpus.csv'

In [4]:
# Column names are supplied explicitly, so pandas treats the first line as data.
# Every field is read as a string, and empty fields stay '' instead of NaN
# because NA detection is switched off.
corpus_df = pd.read_csv(
    corpus_path,
    index_col=False,
    na_filter=False,
    dtype=str,
    names=['doc_id', 'title', 'content', 'topics'],
)

In [5]:
# Flatten the list-like topic text into whitespace-separated topic names:
# brackets and single quotes are removed, commas become spaces.
corpus_df['topics'] = corpus_df['topics'].apply(
    lambda raw: raw.translate({ord('['): None, ord(']'): None, ord("'"): None, ord(','): ' '})
)

In [6]:
# os.cpu_count() returns None when the CPU count cannot be determined; fall
# back to a single chunk instead of failing inside array_split.
n_chunks = cpu_count() or 1

# Split row positions rather than the DataFrame itself, then select each group
# with .iloc. The result is a list of DataFrames, one per chunk (chunks may be
# empty when there are fewer rows than chunks).
chunks = [
    corpus_df.iloc[positions]
    for positions in np.array_split(np.arange(len(corpus_df)), n_chunks)
]

In [7]:
# Built once: stopwords.words() returns a fresh list on every call, and looking
# a token up in a list is linear, so evaluating it per token is very slow.
_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))

# Maps every ASCII punctuation character to a single space.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def _process(s):
    """Normalise one text field into a space-separated string of stemmed tokens.

    Steps, in order: newlines become spaces, '.' is removed, '-' becomes a
    space, the literal "'s" is removed, digits are deleted, the text is
    lower-cased and stripped, remaining punctuation becomes spaces, English
    stop words are dropped, and every remaining token is Porter-stemmed.

    Args:
        s: the raw text (str).

    Returns:
        str: the stemmed tokens joined by single spaces ('' if none remain).
    """
    s = s.replace('\n', ' ')
    s = s.replace('.', '')
    s = s.replace('-', ' ')
    # Case-sensitive on purpose: this runs before lower(), exactly as before.
    s = s.replace("'s", '')
    s = s.translate(_remove_digits)
    s = s.lower().strip().translate(_PUNCT_TO_SPACE)
    # Stop words are filtered on the unstemmed token, then survivors are stemmed.
    return ' '.join(STEMMER.stem(token) for token in s.split() if token not in _STOPWORDS)

@ray.remote
def worker(df_chunk):
    """Ray task that preprocesses one chunk of the corpus.

    Runs _process over the 'content' and 'title' columns, stores
    "<title> <content>" in a new 'merged' column, removes the two source
    columns and returns the same (modified) DataFrame.
    """
    for column in ('content', 'title'):
        df_chunk[column] = df_chunk[column].apply(_process)

    df_chunk['merged'] = df_chunk['title'] + ' ' + df_chunk['content']

    # Only the merged text is needed downstream.
    for column in ('title', 'content'):
        del df_chunk[column]
    return df_chunk

In [8]:
ray.init(num_cpus=cpu_count())
try:
    # Put each chunk in the object store and hand the tasks the references.
    # The list is not called `input`, so the builtin stays usable in later cells.
    chunk_refs = [ray.put(chunk) for chunk in chunks]
    # ray.get blocks until every task has finished; r holds one processed
    # DataFrame per chunk, in chunk order.
    r = ray.get([worker.remote(ref) for ref in chunk_refs])
finally:
    # Always tear the local Ray runtime down, even when a task raised.
    ray.shutdown()

# Single-process equivalent of the Ray job above, kept for reference only:
# corpus_df['content'] = corpus_df['content'].apply(lambda x: _process(x))
# corpus_df['title'] = corpus_df['title'].apply(lambda x: _process(x))

# corpus_df['merged'] = corpus_df['title'] + ' ' + corpus_df['content']
# corpus_df.drop(columns=['title', 'content'], inplace=True)

2020-03-10 17:31:32,943	INFO resource_spec.py:212 -- Starting Ray with 2.49 GiB memory available for workers and up to 1.26 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-10 17:31:33,350	INFO services.py:1078 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


In [9]:
# Stack the per-chunk results from the workers back into one frame, row-wise.
corpus_df = pd.concat(objs=r, axis='index')

In [10]:
# Maps each doc_id to the list of topics recorded for that document.
topics_dict = dict()

In [11]:
# Split every document that carries two or more topics into one row per topic,
# so that each row of corpus_df ends up with at most one topic.
is_multi_topic = corpus_df['topics'].str.split().str.len() >= 2

tmp = []  # one {'doc_id', 'topics', 'merged'} record per (document, topic) pair


def f(row):
    """Record the topics of one document and queue its per-topic rows.

    Stores the row's full topic list under its doc_id in topics_dict (an empty
    list when the document has no topic). When the row has two or more topics,
    one single-topic record per topic is appended to tmp.
    """
    topics = row['topics'].split()
    topics_dict[row['doc_id']] = topics
    if len(topics) >= 2:
        tmp.extend(
            {'doc_id': row['doc_id'], 'topics': topic, 'merged': row['merged']}
            for topic in topics
        )


# An explicit loop: f is called only for its side effects, so there is no
# result worth collecting.
for _, row in corpus_df.iterrows():
    f(row)

# One frame built from plain records replaces a one-row DataFrame per topic.
df_ = pd.DataFrame(tmp, columns=['doc_id', 'topics', 'merged'])

# Keep the rows with zero or one topic and append the per-topic rows.
# The index is renumbered so labels stay unique. When nothing was split there
# is nothing to append (pd.concat on an empty list would raise).
single_topic_rows = corpus_df[~is_multi_topic]
if tmp:
    corpus_df = pd.concat([single_topic_rows, df_], ignore_index=True)
else:
    corpus_df = single_topic_rows.reset_index(drop=True)

In [12]:
# Rows that already carry a topic are the labelled set; rows whose topic
# string is empty still need a prediction.
has_topic = corpus_df['topics'] != ''
training = corpus_df[has_topic]
to_do = corpus_df[~has_topic]

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Token-count features; the vocabulary is learned from the labelled rows only.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(training['merged'])
# Label array; position k holds the topic of row k of X.
y = training['topics'].to_numpy()

# fit() returns the estimator itself, so construction and fitting can be chained.
neigh = KNeighborsClassifier().fit(X, y)

# Unlabelled rows are projected onto the same vocabulary.
X2 = vectorizer.transform(to_do['merged'])

In [14]:
# For every unlabelled row, the positions (into the training rows) of its two
# nearest neighbours; distances are not needed.
neighbour_positions = neigh.kneighbors(X2, n_neighbors=2, return_distance=False)
prediction_df = pd.DataFrame(neighbour_positions, columns=['topic1', 'topic2'], dtype=int)

# Both frames need a 0..n-1 index so they line up row by row; prediction_df
# already has one because it was just built from an array.
to_do = pd.concat([prediction_df, to_do.reset_index(drop=True)], axis=1)

In [15]:
def i2topic(i):
    """Return the entry of the training label array y at position i."""
    label = y[i]
    return label

def f2(row):
    """Store the predicted topics of one unlabelled document in topics_dict.

    row must provide 'doc_id' plus 'topic1' and 'topic2', which are looked up
    through i2topic. The stored list holds the distinct topics in
    topic1-then-topic2 order (one element when both are the same topic).
    """
    predicted = [i2topic(row['topic1']), i2topic(row['topic2'])]
    # dict.fromkeys removes duplicates while keeping insertion order. Going
    # through a set would make the order depend on string hashing, so the
    # saved dictionary could differ from run to run.
    topics_dict[row['doc_id']] = list(dict.fromkeys(predicted))

# f2 is called only for its side effect on topics_dict. The trailing semicolon
# keeps the all-None result Series out of the cell output.
to_do.apply(f2, axis=1);

0        None
1        None
2        None
3        None
4        None
         ... 
10206    None
10207    None
10208    None
10209    None
10210    None
Length: 10211, dtype: object

In [18]:
import pickle
path = '../corpus/topics.dict'
# The handle is deliberately not named `f`: that name already belongs to the
# row helper defined in an earlier cell and would be overwritten here.
with open(path, 'wb') as topics_file:
    pickle.dump(topics_dict, topics_file)

In [19]:
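# Sanity check: reload the dictionary that was pickled above.
# Only unpickle files you created yourself; pickle.load can execute arbitrary code.
# with open(path, 'rb') as f:
#     dct = pickle.load(f)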

{'1': ['cocoa'],
 '2': ['acq'],
 '3': ['money-supply', 'acq'],
 '4': ['acq', 'earn'],
 '5': ['grain', 'wheat', 'corn', 'barley', 'oat', 'sorghum'],
 '6': ['veg-oil',
  'linseed',
  'lin-oil',
  'soy-oil',
  'sun-oil',
  'soybean',
  'oilseed',
  'corn',
  'sunseed',
  'grain',
  'sorghum',
  'wheat'],
 '7': ['acq'],
 '8': ['reserves', 'money-supply'],
 '9': ['earn'],
 '10': ['acq'],
 '11': ['earn'],
 '12': ['earn', 'acq'],
 '13': ['earn'],
 '14': ['earn'],
 '15': ['acq'],
 '16': ['acq', 'earn'],
 '17': ['acq'],
 '18': ['earn'],
 '19': ['wheat', 'grain'],
 '20': ['money-supply', 'interest'],
 '21': ['acq'],
 '22': ['copper'],
 '23': ['earn'],
 '24': ['earn'],
 '25': ['acq'],
 '26': ['interest'],
 '27': ['earn'],
 '28': ['crude'],
 '29': ['housing'],
 '30': ['money-supply'],
 '31': ['money-supply'],
 '32': ['wheat', 'grain'],
 '33': ['acq'],
 '34': ['acq'],
 '35': ['earn'],
 '36': ['earn'],
 '37': ['earn'],
 '38': ['earn'],
 '39': ['acq'],
 '40': ['earn'],
 '41': ['earn'],
 '42': ['coffe