# Keyword Extraction
### extract keywords from MeSH and AGROVOC

dataset: join_language 

In [5]:
#imports
import pandas as pd
import warnings
from glob import glob
import os
import re

import concurrent.futures as cf
from time import perf_counter

from sqlalchemy import create_engine
from sqlalchemy import ARRAY, String

import pickle
from keyword_extraction import DictLU_Extract_Exact

#settings 
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
#load file for each language
files_MeSH = glob('/home/ubuntu/ullrich/my_code/data/pickle/MeSH/*.p')
files_agrovoc = glob('/home/ubuntu/ullrich/my_code/data/pickle/AGROVOC/*.p')

def load_file(file_path):
    [dicts_lower,dicts_upper] = pickle.load(open(file_path, "rb"))
    DEE = DictLU_Extract_Exact(dicts_upper,dicts_lower)
    return DEE

In [7]:
def get_keywords(file_path, row, col_lang):
    #load file for each language
    for file in file_path:
        parts = os.path.split(file)
        parts = re.split(r'_|\.', parts[1])
        if parts[2] == 'en':
            DEE_en = load_file(file)
        elif parts[2] == 'de':
            DEE_de = load_file(file)
        elif parts[2] == 'fr':
            DEE_fr = load_file(file)
    #choose dict for the different languages
    dicts = None
    if col_lang == 'en':
        dicts = DEE_en
    elif col_lang == 'de':
        dicts = DEE_de
    elif col_lang == 'fr':
        dicts = DEE_fr
    #add terms and ID into list
    if dicts is not None:
        terms_id = []
        terms = []
        dicts.full(str(row))
        res = dicts.result
        for k, v in res.items():
            terms_id.extend([str(k)] * v['count'])
            terms.extend([str(v['term'])] * v['count'])
        return terms_id, terms

In [8]:
def process_keyword_extraction(files, name):
    chunksize = 10000
    connect_string = 'postgresql+psycopg2://postgres:5050@localhost:5432/postgres'
    sql_query = 'SELECT * FROM ke_stage.join_language'
    
    #create engine
    engine = create_engine(connect_string)
    connection = engine.connect().execution_options(stream_results=True, max_row_buffer=chunksize)
    # create a process pool with 13 workers
    executor = cf.ProcessPoolExecutor(max_workers=13)

    start = perf_counter()
    for chunk in pd.read_sql(sql_query, connection, chunksize=chunksize):
        df_res = pd.DataFrame()
        for i, row in chunk.iterrows():
            #extract keywords for column title
            future_title = executor.submit(get_keywords, files, row['title'], row['lang_title'])
            result_title = future_title.result()
            #extract keywords for column abstract
            future_abs = executor.submit(get_keywords, files, row['abstract'], row['lang_abs'])
            result_abs = future_abs.result()
            if result_title or result_abs is not None:
                if result_title is None:
                    df_res = df_res.append({'dbrecordid': row['dbrecordid'], name +'_ID_title' : result_title , name + '_title': result_title, name + '_ID_abs' : result_abs[0] , name + '_abs': result_abs[1]}, ignore_index=True)
                elif result_abs is None: 
                    df_res = df_res.append({'dbrecordid': row['dbrecordid'], name +'_ID_title' : result_title[0] , name + '_title': result_title[1], name + '_ID_abs' : result_abs , name + '_abs': result_abs}, ignore_index=True)    
                else:
                    df_res = df_res.append({'dbrecordid': row['dbrecordid'], name +'_ID_title' : result_title[0] , name + '_title': result_title[1] , name + '_ID_abs' : result_abs[0] , name + '_abs': result_abs[1]}, ignore_index=True)       
        df_res.to_sql('corpus_keywords_' + name, engine, schema='ke_stage', chunksize=chunksize, if_exists='append', index=False, dtype={'dbrecordid': String(), name +'_ID_title': ARRAY(String()), name + '_title': ARRAY(String()), name + '_ID_abs': ARRAY(String()), name + '_abs': ARRAY(String())})
        end = perf_counter()
        elapsed_time = end - start
        print('Edit ' + str(len(chunk)) + ' rows in ' + str(end) + ' s. Total: ' + str(elapsed_time) + ' s.')

In [10]:
process_keyword_extraction(files_MeSH, 'MeSH')

Edit 10000 rows in 703145.529395072 s. Total: 6117.478247025982 s.
Edit 10000 rows in 709323.867693141 s. Total: 12295.816545094945 s.
Edit 10000 rows in 715500.915343453 s. Total: 18472.864195406903 s.
Edit 10000 rows in 721750.730994948 s. Total: 24722.679846901912 s.
Edit 10000 rows in 727969.242663411 s. Total: 30941.191515364917 s.
Edit 10000 rows in 734098.431256938 s. Total: 37070.380108891986 s.
Edit 10000 rows in 740439.274141417 s. Total: 43411.22299337096 s.
Edit 10000 rows in 747234.614915376 s. Total: 50206.563767329906 s.
Edit 10000 rows in 753356.086597276 s. Total: 56328.035449229996 s.
Edit 10000 rows in 759378.688998607 s. Total: 62350.63785056095 s.
Edit 10000 rows in 765575.481287913 s. Total: 68547.430139867 s.
Edit 10000 rows in 771655.195219803 s. Total: 74627.14407175698 s.
Edit 10000 rows in 778163.712056842 s. Total: 81135.66090879601 s.
Edit 10000 rows in 784295.086937016 s. Total: 87267.03578896995 s.
Edit 10000 rows in 790594.8465471 s. Total: 93566.7953990

In [9]:
process_keyword_extraction(files_agrovoc, 'AGROVOC')

Edit 10000 rows in 667549.995268458 s. Total: 571.8399559800746 s.
Edit 10000 rows in 668131.623224069 s. Total: 1153.4679115910549 s.
Edit 10000 rows in 668713.796298524 s. Total: 1735.6409860460553 s.
Edit 10000 rows in 669305.003151965 s. Total: 2326.847839487018 s.
Edit 10000 rows in 669889.201015021 s. Total: 2911.0457025429932 s.
Edit 10000 rows in 670459.752755358 s. Total: 3481.59744288004 s.
Edit 10000 rows in 671029.241632581 s. Total: 4051.0863201030297 s.
Edit 10000 rows in 671607.272096965 s. Total: 4629.116784487036 s.
Edit 10000 rows in 672184.760121977 s. Total: 5206.6048094989965 s.
Edit 10000 rows in 672755.94978482 s. Total: 5777.794472342008 s.
Edit 10000 rows in 673323.882291332 s. Total: 6345.726978854043 s.
Edit 10000 rows in 673901.845935642 s. Total: 6923.6906231640605 s.
Edit 10000 rows in 674483.210670053 s. Total: 7505.055357575067 s.
Edit 10000 rows in 675065.154519035 s. Total: 8086.999206557055 s.
Edit 10000 rows in 675645.104332554 s. Total: 8666.9490200