In [1]:
import sys
import gc
import re
import json
import _pickle as p
import pandas as pd
import numpy as np
from importlib import reload
from IPython.display import display, HTML

sys.path.append('..')
sys.path.append('../lib')
import stats as S
S = reload(S)

pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
display(HTML("<style>.container{width: 100%}</style>"))

In [14]:
lemm = S.lemm_h
def freq_dict(text, suff='ph'):
    txt_lemm = lemm(text)
    kwds = S.keywords(txt_lemm)
    word_cnt = len(kwds)
    word_cnt_log_inv = 1 / np.log2(word_cnt)
    kwds_sorted = sorted(kwds.items(), key=lambda x: -x[1])
    kwds_inverted = {i: word_cnt_log_inv/v for i, v in kwds.items()}
    v = kwds_inverted.values()
    with open(f'{suff}-wordimp.json', 'w') as f:
        f.write(json.dumps(kwds_inverted))
    print(f"Stats: words count: {word_cnt}, log_inv = {word_cnt_log_inv}; min = {min(v)}, max = {max(v)}")
    return kwds_inverted

def prepare_dataset(df, ds_basename, threshold=1e-2, max_len=None):
    Targ = 'Targ'
    Lemm = 'Lemm'
    Kw = 'Kw'
    KwMult = 'KwMult'
    KwFilt = 'KwFilt'
    FSum = 'Fsum'
    KwNorm = 'KwNorm'
    txt = "\n".join(df.loc[:, Targ].apply(str))
    kwds_inverted = freq_dict(txt, suff=ds_basename)
    df = df.loc[df.loc[:, Targ].notnull()]
    if max_len is not None:
        df = df.head(max_len)
    df.loc[:, Lemm] = df.loc[:, Targ].apply(lemm)
    df.loc[:, Kw] = df.loc[:, Lemm].apply(lambda x: {i: 1 for i, v in S.keywords(x).items()})
    df.loc[:, KwMult] = df.loc[:, Kw].apply(lambda x: S.multiply2stats(kwds_inverted, x, max_dist=0))
    df.loc[:, KwFilt] = df.loc[:, KwMult].apply(lambda x: {i: v for i, v in x.items() if v > max(x.values())*threshold})
    df.loc[:, FSum] = df.loc[:, KwFilt].apply(lambda x: sum(x.values()))
    df.loc[:, KwNorm] = df.apply(lambda x: {i: v/x[FSum] for i, v in x[KwFilt].items()}, axis=1)
    
    df.drop([Lemm, Kw, KwFilt, FSum], axis=1, inplace=True)
    
    df.to_csv(f'{ds_basename}-kw-thr{threshold:.6f}.csv')
    gc.collect()
    return df

In [15]:
# articles = pd.read_csv('medical_site.csv')
pharma = pd.read_csv('pharma.csv')
pharma.drop(['web-scraper-order', 'web-scraper-start-url', 'Page', 'Page-href', 'PharmaLink', 'AnalogsLinks-href', 'Lots', 'Instruction'], axis=1, inplace=True)
pharma.rename(columns={"PharmacologicAction": "Targ"}, inplace=True)

In [16]:
pattern = re.compile('[^\w\d\s]+') 
Sdict = pd.read_csv('med_terms.csv')
Sdict.drop(['web-scraper-order', 'web-scraper-start-url', 'Pages', 'Pages-href'], axis=1, inplace=True)
Sdict.rename(columns={"Definitions": "Targ"}, inplace=True)

In [17]:
Pdict = pd.read_csv('medical_profs.csv')
Pdict.drop(['web-scraper-order', 'web-scraper-start-url', 'ProfLinks-href'], axis=1, inplace=True)
Pdict.drop([1, 3, 4, 5, 11, 16, 26, 27, 33], axis=0, inplace=True)
Pdict.rename(columns={"DescriptionText": "Targ"}, inplace=True)

In [18]:
pattern = re.compile('[^\w\d\s]+') 
S2dict = pd.read_csv('medical_term2.csv')
S2dict.drop(['web-scraper-order', 'web-scraper-start-url', 'Disease-href'], axis=1, inplace=True)
S2dict.rename(columns={"Description": "Targ"}, inplace=True)

In [None]:
pharma = prepare_dataset(pharma, 'pharma', max_len=None)
Sdict = prepare_dataset(Sdict, 'def', max_len=None)
S2dict = prepare_dataset(S2dict, 'def2', max_len=None)
Pdict = prepare_dataset(Pdict, 'prof', max_len=None)

Stats: words count: 222, log_inv = 0.12829697788094419; min = 6.792152992797087e-06, max = 0.12829697788094419


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Stats: words count: 3733, log_inv = 0.08427354665375736; min = 0.0003956504537735087, max = 0.08427354665375736


In [None]:
display(Ph.head())
display(Sdict.head())
display(Pdict.head())

In [None]:
gc.collect()