In [9]:
#imports
import pandas as pd
import warnings

from multiprocessing import Process
import concurrent.futures as cf

import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import ARRAY, String

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

import pickle
from keyword_extraction import DictLU_Extract_Exact

In [2]:
#settings
# Silence every FutureWarning for the rest of the session. This filter is
# global, so it also hides deprecation notices raised by third-party
# libraries (pandas included) and not only those triggered by this notebook.
warnings.simplefilter(action='ignore', category=FutureWarning)

## Language Detection

In [None]:
def detect_language(row):
    """Detect the language of a single title/abstract cell.

    Parameters
    ----------
    row : str, sequence or None
        The cell value to classify. A string is classified as-is. Any other
        value is treated as a sequence and only its first element is
        classified.

    Returns
    -------
    str
        The language code returned by ``langdetect.detect``, or ``'unknown'``
        when the value is missing, empty, not text, or when langdetect cannot
        determine a language.
    """
    if not isinstance(row, str):
        try:
            # An empty sequence has nothing to classify. (A placeholder such
            # as "_" contains no letters, so langdetect would raise and the
            # result would be 'unknown' anyway.)
            if len(row) == 0:
                return 'unknown'
            row = row[0]
        except TypeError:
            # None, NaN and other unsized/unindexable values (e.g. SQL NULLs)
            # must not crash the whole batch.
            return 'unknown'

        # The first element itself may be missing or non-textual.
        if not isinstance(row, str):
            return 'unknown'

    try:
        return detect(row)
    except LangDetectException:
        return 'unknown'

In [None]:
def process_lang_detection(max_workers=12):
    """Detect the language of every title and abstract and store the result.

    Reads ``dbrecordid``, ``title`` and ``abstract`` from
    ``ke_stage.ba_corpus_1`` (limited to 10000 rows) in chunks of 1000 rows,
    runs ``detect_language`` on both text columns in a process pool and
    appends one row per record to ``ke_stage.corpus_language`` with the
    columns ``dbrecordid``, ``lang_title`` and ``lang_abs``.

    Parameters
    ----------
    max_workers : int, optional
        Number of worker processes in the pool (default 12).

    Notes
    -----
    The target table is written with ``if_exists='append'``, so calling this
    function twice inserts the same records twice.
    """
    # NOTE(review): the database credentials are hard-coded in this URL;
    # consider reading them from the environment or a config file.
    engine = create_engine('postgresql+psycopg2://postgres:8580@localhost:5432/postgres')
    query = 'SELECT dbrecordid, title, abstract FROM ke_stage.ba_corpus_1 LIMIT 10000'

    try:
        # The context manager shuts the worker processes down when done,
        # including when an exception is raised.
        with cf.ProcessPoolExecutor(max_workers=max_workers) as executor:
            for chunk in pd.read_sql(query, engine, chunksize=1000):
                print('Got df with ' + str(len(chunk)) + ' rows')
                if chunk.empty:
                    continue

                # Hand the whole chunk to the pool before collecting any
                # result so the workers actually run concurrently. map()
                # yields results in input order, which keeps them aligned
                # with dbrecordid.
                title_langs = executor.map(detect_language, chunk['title'].tolist(), chunksize=50)
                abs_langs = executor.map(detect_language, chunk['abstract'].tolist(), chunksize=50)

                # Build the result frame once per chunk instead of growing it
                # row by row.
                df_res = pd.DataFrame({
                    'dbrecordid': chunk['dbrecordid'].tolist(),
                    'lang_title': list(title_langs),
                    'lang_abs': list(abs_langs),
                })
                df_res.to_sql('corpus_language', engine, schema='ke_stage', chunksize=1000, if_exists='append', index=False)
    finally:
        # Release the pooled database connections.
        engine.dispose()

In [None]:
# Run the language-detection pipeline. Results are appended to
# ke_stage.corpus_language (if_exists='append'), so re-running this cell
# inserts the same records again.
process_lang_detection()

## Keyword Extraction

In [3]:
# Locations of the pickled term dictionaries, one file per language
# (German, English, French). All three live in the same directory and differ
# only in the language part of the file name.
path_terms_de, path_terms_en, path_terms_fr = (
    '/home/ubuntu/ullrich/keyword_extraction/pickle/MeSH_dict_' + language + '.p'
    for language in ('german', 'english', 'french')
)

def load_file (path):
    """Load a pickled dictionary pair and wrap it in an extractor.

    Parameters
    ----------
    path : str
        Path to a pickle file containing a two-element sequence that is
        unpacked as ``(dicts_lower, dicts_upper)``.

    Returns
    -------
    DictLU_Extract_Exact
        Extractor constructed as ``DictLU_Extract_Exact(dicts_upper, dicts_lower)``;
        note that the argument order is the reverse of the order in the file.
    """
    # NOTE(review): unpickling runs arbitrary code contained in the file, so
    # this must only be used on trusted dictionary files.
    # Open via a context manager so the file handle is closed deterministically.
    with open(path, "rb") as handle:
        dicts_lower, dicts_upper = pickle.load(handle)
    return DictLU_Extract_Exact(dicts_upper, dicts_lower)

In [4]:
def get_keywords(row, col_lang):
    """Extract dictionary terms from one title/abstract cell.

    Parameters
    ----------
    row : object
        The cell content; it is converted with ``str()`` before extraction.
    col_lang : str
        Language code of the cell. Only ``'en'``, ``'de'`` and ``'fr'`` have a
        dictionary.

    Returns
    -------
    list of str or None
        The ``'term'`` entry of every extraction result, converted to ``str``,
        or ``None`` when ``col_lang`` is not one of the supported codes.
    """
    # Choose the dictionary file for the language; bail out early when there
    # is none.
    if col_lang == 'en':
        terms_path = path_terms_en
    elif col_lang == 'de':
        terms_path = path_terms_de
    elif col_lang == 'fr':
        terms_path = path_terms_fr
    else:
        return None

    # NOTE(review): the pickle is re-read and the extractor rebuilt on every
    # call. Caching it per language would be much cheaper, but is only safe
    # if full() resets .result between calls -- confirm before changing.
    extractor = load_file(terms_path)
    extractor.full(str(row))
    return [str(match['term']) for _key, match in extractor.result.items()]

In [10]:
def process_keyword_extraction(max_workers=12):
    """Extract keywords from every title and abstract and store the result.

    Reads ``ke_stage.join_lang`` (limited to 10000 rows) in chunks of 1000
    rows, runs ``get_keywords`` on the ``title``/``lang_title`` and
    ``abstract``/``lang_abs`` column pairs in a process pool and appends one
    row per record to ``ke_stage.corpus_keywords`` with the columns
    ``dbrecordid``, ``keywords_title`` and ``keywords_abs``.

    Parameters
    ----------
    max_workers : int, optional
        Number of worker processes in the pool (default 12).

    Notes
    -----
    The target table is written with ``if_exists='append'``, so calling this
    function twice inserts the same records twice.
    """
    # NOTE(review): the database credentials are hard-coded in this URL;
    # consider reading them from the environment or a config file.
    engine = create_engine('postgresql+psycopg2://postgres:8580@localhost:5432/postgres')
    query = 'SELECT * FROM ke_stage.join_lang LIMIT 10000'
    column_types = {
        'dbrecordid': String(),
        'keywords_title': ARRAY(String()),
        'keywords_abs': ARRAY(String()),
    }

    try:
        # The context manager shuts the worker processes down when done,
        # including when an exception is raised.
        with cf.ProcessPoolExecutor(max_workers=max_workers) as executor:
            for chunk in pd.read_sql(query, engine, chunksize=1000):
                print('Got df with ' + str(len(chunk)) + ' rows')
                if chunk.empty:
                    continue

                # Hand the whole chunk to the pool before collecting any
                # result so the workers actually run concurrently. map()
                # yields results in input order, which keeps them aligned
                # with dbrecordid.
                title_keywords = executor.map(
                    get_keywords, chunk['title'].tolist(), chunk['lang_title'].tolist())
                abs_keywords = executor.map(
                    get_keywords, chunk['abstract'].tolist(), chunk['lang_abs'].tolist())

                # Build the result frame once per chunk instead of growing it
                # row by row. Each keyword cell is a list of strings, or None
                # when the language has no dictionary.
                records = [
                    {'dbrecordid': record_id, 'keywords_title': kw_title, 'keywords_abs': kw_abs}
                    for record_id, kw_title, kw_abs
                    in zip(chunk['dbrecordid'].tolist(), title_keywords, abs_keywords)
                ]
                df_res = pd.DataFrame(records)
                df_res.to_sql('corpus_keywords', engine, schema='ke_stage', chunksize=1000, if_exists='append', index=False, dtype=column_types)
    finally:
        # Release the pooled database connections.
        engine.dispose()

In [11]:
# Run the keyword-extraction pipeline. Results are appended to
# ke_stage.corpus_keywords (if_exists='append'), so re-running this cell
# inserts the same records again.
process_keyword_extraction()

Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
