In [2]:
#imports
import pandas as pd
import warnings
from glob import glob
import os
import re

from multiprocessing import Process
import concurrent.futures as cf

import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import ARRAY, String

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

import pickle
from keyword_extraction import DictLU_Extract_Exact

In [3]:
# settings
# Install a process-wide filter that hides every FutureWarning, so that
# deprecation notices from the libraries used below do not flood the cell
# output.
warnings.simplefilter(action='ignore', category=FutureWarning)

## Language Detection

In [4]:
#detect language for column title and abstract
def detect_language(row):
    """Return the language code detected for one title/abstract value.

    Parameters
    ----------
    row : str, sequence or None
        The cell value to classify. A string is classified directly. For any
        other sized container only its first element is classified.

    Returns
    -------
    str
        The code returned by ``langdetect.detect``, or ``'unknown'`` when the
        value is missing (``None``/NaN), empty, not text, or when langdetect
        cannot decide.

    Notes
    -----
    The previous version substituted the placeholder ``"_"`` for an empty
    container and let langdetect reject it; returning ``'unknown'`` directly
    gives the same answer without the round trip.
    """
    text = row
    if not isinstance(text, str):
        # SQL NULLs arrive as None (or NaN) and have no len(); treat them,
        # and containers that cannot be indexed with 0, as "no text".
        try:
            text = text[0] if len(text) != 0 else None
        except (TypeError, LookupError):
            text = None
        if not isinstance(text, str):
            return 'unknown'

    try:
        return detect(text)
    except LangDetectException:
        # Raised by langdetect when the text has no usable features.
        return 'unknown'

In [5]:
def process_lang_detection():
    """Detect the language of every title and abstract in the corpus.

    Streams ``dbrecordid``, ``title`` and ``abstract`` from
    ``ke_stage.ba_corpus_1`` in chunks of 100000 rows, classifies both text
    columns with ``detect_language`` in a pool of 12 worker processes and
    appends one row per record (``dbrecordid``, ``lang_title``, ``lang_abs``)
    to ``ke_stage.corpus_language``.

    Returns
    -------
    None
    """
    # NOTE(review): the connection URL, including the password, is hard-coded;
    # consider reading it from the environment instead.
    engine = create_engine('postgresql+psycopg2://postgres:5050@localhost:5432/postgres')
    query = 'SELECT dbrecordid, title, abstract FROM ke_stage.ba_corpus_1'

    try:
        # The context manager shuts the worker processes down when done.
        with cf.ProcessPoolExecutor(max_workers=12) as executor:
            for chunk in pd.read_sql(query, engine, chunksize=100000):
                print('Got df with ' + str(len(chunk)) + ' rows')
                if chunk.empty:
                    continue

                # Executor.map schedules the whole column at once, so the
                # workers actually run in parallel; chunksize batches the
                # items sent to each worker to cut inter-process overhead.
                title_langs = executor.map(detect_language, chunk['title'], chunksize=1000)
                abs_langs = executor.map(detect_language, chunk['abstract'], chunksize=1000)

                # Build the result frame in one go instead of appending row
                # by row (quadratic, and DataFrame.append no longer exists in
                # pandas 2.x).
                df_res = pd.DataFrame({
                    'dbrecordid': chunk['dbrecordid'].values,
                    'lang_title': list(title_langs),
                    'lang_abs': list(abs_langs),
                })
                df_res.to_sql('corpus_language', engine, schema='ke_stage', chunksize=100000, if_exists='append', index=False)
    finally:
        # Release pooled database connections even if a chunk failed.
        engine.dispose()

In [6]:
# Run language detection over ke_stage.ba_corpus_1; the detected languages are
# appended to the table ke_stage.corpus_language.
process_lang_detection()

## Keyword Extraction

In [5]:
# Collect the pickled dictionary files (*.p) of each vocabulary.
# get_keywords reads the language code (en/de/fr) from the third
# `_`/`.`-separated token of each file's base name.
# NOTE(review): these are absolute, machine-specific paths.
files_MeSH = glob('/home/ubuntu/ullrich/my_code/data/pickle/MeSH/*.p')
files_agrovoc = glob('/home/ubuntu/ullrich/my_code/data/pickle/AGROVOC/*.p')

def load_file(file_path):
    """Build a keyword extractor from one pickled dictionary file.

    Parameters
    ----------
    file_path : str
        Path of a pickle holding a two-item sequence
        ``[dicts_lower, dicts_upper]``.

    Returns
    -------
    DictLU_Extract_Exact
        Extractor constructed as ``DictLU_Extract_Exact(dicts_upper, dicts_lower)``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The with-block closes the file handle, which the bare open() never did.
    with open(file_path, "rb") as handle:
        dicts_lower, dicts_upper = pickle.load(handle)
    return DictLU_Extract_Exact(dicts_upper, dicts_lower)

In [6]:
def get_keywords(file_path, row, col_lang):
    """Extract dictionary keywords from one text value.

    Parameters
    ----------
    file_path : iterable of str
        Paths of the pickled dictionaries of one vocabulary. The language of
        each file is the third ``_``/``.``-separated token of its base name.
    row : object
        The text to scan; it is converted with ``str()`` before extraction.
    col_lang : str
        Language code detected for ``row``. Only ``'en'``, ``'de'`` and
        ``'fr'`` are supported.

    Returns
    -------
    tuple of (list of str, list of str) or None
        ``(term_ids, terms)``, each entry repeated once per counted
        occurrence. ``None`` when ``col_lang`` is unsupported or when no
        dictionary file for that language is present (the latter used to
        raise ``UnboundLocalError``).
    """
    if col_lang not in ('en', 'de', 'fr'):
        return None

    # Pick the dictionary of the requested language only, instead of
    # unpickling every language on every call. File names with fewer than
    # three tokens are skipped rather than raising IndexError. If several
    # files match, the last one wins, as before.
    selected = None
    for file in file_path:
        tokens = re.split(r'_|\.', os.path.split(file)[1])
        if len(tokens) > 2 and tokens[2] == col_lang:
            selected = file
    if selected is None:
        return None

    dicts = load_file(selected)
    dicts.full(str(row))

    terms_id = []
    terms = []
    for k, v in dicts.result.items():
        # Repeat each hit according to its occurrence count.
        terms_id.extend([str(k)] * v['count'])
        terms.extend([str(v['term'])] * v['count'])
    return terms_id, terms

In [7]:
def process_keyword_extraction(files, name):
    """Extract vocabulary keywords from titles and abstracts and store them.

    Reads ``ke_stage.join_lang`` in chunks of 1000 rows, runs
    ``get_keywords`` on the title and the abstract of every row in a pool of
    12 worker processes and appends the hits to
    ``ke_stage.corpus_keywords_<name>``.

    Parameters
    ----------
    files : list of str
        Pickled dictionary files of the vocabulary, passed to ``get_keywords``.
    name : str
        Vocabulary name; used as the suffix of the target table and as the
        prefix of the four keyword columns.

    Returns
    -------
    None

    Notes
    -----
    A row is written only when ``get_keywords`` returned a result for both
    the title and the abstract, as before.
    """
    # NOTE(review): the connection URL, including the password, is hard-coded;
    # consider reading it from the environment instead.
    engine = create_engine('postgresql+psycopg2://postgres:5050@localhost:5432/postgres')
    # NOTE(review): LIMIT 10000 looks like a development cap - confirm before
    # running on the full corpus.
    query = 'SELECT * FROM ke_stage.join_lang LIMIT 10000'

    col_id_title = name + '_ID_title'
    col_title = name + '_title'
    col_id_abs = name + '_ID_abs'
    col_abs = name + '_abs'
    sql_types = {
        'dbrecordid': String(),
        col_id_title: ARRAY(String()),
        col_title: ARRAY(String()),
        col_id_abs: ARRAY(String()),
        col_abs: ARRAY(String()),
    }

    try:
        # The context manager shuts the worker processes down when done.
        with cf.ProcessPoolExecutor(max_workers=12) as executor:
            for chunk in pd.read_sql(query, engine, chunksize=1000):
                print('Got df with ' + str(len(chunk)) + ' rows')

                # Submit the whole chunk before collecting any result so the
                # workers run in parallel instead of one task at a time.
                title_futures = [
                    executor.submit(get_keywords, files, text, lang)
                    for text, lang in zip(chunk['title'], chunk['lang_title'])
                ]
                abs_futures = [
                    executor.submit(get_keywords, files, text, lang)
                    for text, lang in zip(chunk['abstract'], chunk['lang_abs'])
                ]

                records = []
                for record_id, fut_title, fut_abs in zip(chunk['dbrecordid'], title_futures, abs_futures):
                    result_title = fut_title.result()
                    result_abs = fut_abs.result()
                    # Explicit form of the old filter, which only worked
                    # through operator precedence.
                    if result_title is None or result_abs is None:
                        continue
                    records.append({
                        'dbrecordid': record_id,
                        col_id_title: result_title[0],
                        col_title: result_title[1],
                        col_id_abs: result_abs[0],
                        col_abs: result_abs[1],
                    })

                # Writing a frame without columns could create a column-less
                # table on first use, so chunks without hits are skipped.
                if not records:
                    continue

                # One DataFrame per chunk instead of row-by-row append
                # (quadratic, and DataFrame.append no longer exists in
                # pandas 2.x).
                df_res = pd.DataFrame(records)
                df_res.to_sql('corpus_keywords_' + name, engine, schema='ke_stage', chunksize=1000, if_exists='append', index=False, dtype=sql_types)
    finally:
        # Release pooled database connections even if a chunk failed.
        engine.dispose()

In [55]:
# Extract MeSH keywords; hits are appended to ke_stage.corpus_keywords_MeSH.
process_keyword_extraction(files_MeSH, 'MeSH')

Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows


In [8]:
# Extract AGROVOC keywords; hits are appended to ke_stage.corpus_keywords_AGROVOC.
process_keyword_extraction(files_agrovoc, 'AGROVOC')

Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
Got df with 1000 rows
