In [66]:
# imports
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
import re
import math
import numpy as np
from numpy import nan
import os
from pprint import pprint

import gensim as ge
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

import pyLDAvis
from pyLDAvis import gensim
from gensim import  models
import gensim.corpora as corpora
import pyLDAvis.gensim_models
from gensim.test.utils import datapath
from gensim.models import CoherenceModel
from sklearn.model_selection import train_test_split
import pickle 

pyLDAvis.enable_notebook()

## preprocess train dataset

In [56]:
#define connection to db 
connect_string = 'postgresql+psycopg2://postgres:5050@localhost:5432/postgres'
#define sql queries
sql_query_agro = 'SELECT * FROM publ.corpus_keywords_agro'
sql_query_mesh = 'SELECT * FROM publ.corpus_keywords_mesh'
sql_query_class = 'SELECT dbrecordid, class FROM ke_stage.corpus_small'

#create engine
engine = create_engine(connect_string)
#read data as df
df_agro = pd.read_sql(sql_query_agro, engine)
df_mesh = pd.read_sql(sql_query_mesh, engine)
df_class = pd.read_sql(sql_query_class, engine)

abs war in anderer sprache als title - None werte im title - daher diese ersetzen mit id2

In [3]:
#merge the ids 
def join_id(df):
    value = df['id1'].fillna(df['id2'])
    df['id1'] = value
    df = df.drop(['id2'], axis=1)
    return df

In [4]:
df_agro = join_id(df_agro)
df_mesh = join_id(df_mesh)

In [5]:
#merge both df (agrovoc and mesh)
result = pd.merge(df_agro, df_mesh, on=['id1'], how='outer')
#create list of all keywords for each document
result['keywords'] = result.apply(lambda row: [val for val in row if isinstance(val, list)], axis=1)

temp_list =  []
for row in result['keywords']:
    keywords_list = []
    for l in row:
        keywords_list.extend(l)
    for x in range(len(keywords_list)):
        keywords_list[x] = keywords_list[x].lower()
    temp_list.append(keywords_list)
    
result['keywords_all'] = temp_list

In [78]:
result = result.drop(columns=['keywords'])

In [79]:
result

Unnamed: 0,id1,agro_title,agro_abs,mesh_title,mesh_abs,keywords_all
0,1066854,,"[history, game]","[Life, Life]","[Books, Books, Saskatchewan, Saskatchewan, Sas...","[history, game, life, life, books, books, sask..."
1,1066982,[Caribbean],"[Netherlands (Kingdom of the), Caribbean, Cari...","[Caribbean Region, Caribbean Region]","[Netherlands Antilles, Suriname, Caribbean Reg...","[caribbean, netherlands (kingdom of the), cari..."
2,1067820,[growth],"[China, China]","[Growth, Growth]","[China, China, China, China]","[growth, china, china, growth, growth, china, ..."
3,1067917,,"[history, physicians]",,"[Hemorrhage, Hemorrhage, Drawing, Risk, Risk, ...","[history, physicians, hemorrhage, hemorrhage, ..."
4,1068740,,"[living standards, towns, Syrian Arab Republic...",,"[Ribs, Commerce, Commerce, Ribs, Economics, Sy...","[living standards, towns, syrian arab republic..."
...,...,...,...,...,...,...
478532,AGRICOLACAT11102635,,,,"[Counseling, Counseling]","[counseling, counseling]"
478533,AGRICOLACAT11121845,,,,"[Literacy, Literacy, Weights and Measures, Wei...","[literacy, literacy, weights and measures, wei..."
478534,AGRICOLACAT31152803,,,,"[Poster, Poster]","[poster, poster]"
478535,AGRICOLACAT90934190,,,,"[Conservation of Natural Resources, Conservati...","[conservation of natural resources, conservati..."


## train model LDA

In [7]:
#split into train and test data
df_train, df_test = train_test_split(result, test_size=0.25)

In [10]:
#get list of keywords
keywords_train = df_train['keywords_all'].tolist()

In [11]:
# Create Dictionary
id2word = corpora.Dictionary(keywords_train)
# Create Corpus
keywords_str = keywords_train
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in keywords_str]
# View
print(corpus[:10])

[[(0, 2), (1, 1), (2, 6), (3, 6), (4, 2), (5, 6), (6, 4)], [(7, 3), (8, 3), (9, 2), (10, 2), (11, 2), (12, 2), (13, 1), (14, 4), (15, 2), (16, 2), (17, 6), (18, 2), (19, 4), (20, 6)], [(21, 8), (22, 2), (23, 4), (24, 2), (25, 1), (26, 2), (27, 2), (28, 2), (29, 18), (30, 2), (31, 2), (32, 2), (33, 2), (34, 1), (35, 2), (36, 2), (37, 16), (38, 4), (39, 6)], [(40, 3), (41, 12), (42, 1), (43, 18), (44, 3), (45, 4), (46, 4), (47, 6)], [(8, 2), (46, 2), (48, 2), (49, 2), (50, 2), (51, 2), (52, 4), (53, 2), (54, 3), (55, 2), (56, 3), (57, 16), (58, 6), (59, 2), (60, 2), (61, 2), (62, 2), (63, 1), (64, 2), (65, 22)], [(46, 4), (49, 16), (50, 2), (66, 2), (67, 2), (68, 4), (69, 6), (70, 2), (71, 2), (72, 9), (73, 3), (74, 3), (75, 6), (76, 2), (77, 2), (78, 3)], [(79, 2), (80, 2)], [(8, 3), (49, 2), (81, 8), (82, 2), (83, 3), (84, 2), (85, 2), (86, 2), (87, 2), (88, 2), (89, 1), (90, 4), (91, 3), (92, 1), (93, 2), (94, 2)], [(95, 2), (96, 1), (97, 3), (98, 4), (99, 2), (100, 2)], [(0, 2), (1, 

- Hyperparameter tunen?

In [12]:
# number of topics
num_topics = 5
# Build LDA model
lda_model = ge.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics
                                       )
# Print the keywords in the 5 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.023*"methods" + 0.017*"genes" + 0.011*"mice" + 0.010*"humans" + '
  '0.010*"therapeutics" + 0.009*"research" + 0.009*"cells" + 0.008*"diet" + '
  '0.007*"light" + 0.007*"attention"'),
 (1,
  '0.064*"patients" + 0.021*"methods" + 0.019*"therapeutics" + 0.014*"rats" + '
  '0.009*"mortality" + 0.009*"cells" + 0.007*"risk" + 0.006*"role" + '
  '0.006*"time" + 0.005*"liver"'),
 (2,
  '0.022*"methods" + 0.017*"patients" + 0.017*"health" + 0.012*"therapeutics" '
  '+ 0.012*"water" + 0.009*"time" + 0.009*"research" + 0.007*"disease" + '
  '0.007*"forests" + 0.007*"women"'),
 (3,
  '0.021*"soil" + 0.017*"therapeutics" + 0.016*"plants" + 0.016*"methods" + '
  '0.016*"economics" + 0.014*"patients" + 0.014*"attention" + 0.009*"growth" + '
  '0.007*"china" + 0.007*"time"'),
 (4,
  '0.037*"cells" + 0.014*"growth" + 0.012*"methods" + 0.012*"infection" + '
  '0.011*"genes" + 0.010*"role" + 0.010*"economics" + 0.009*"food" + '
  '0.009*"research" + 0.007*"sprains and strains"')]


- LDA gibt zurück: 10 TOP keywords für jeden Thema (insges. 5)
- Keywords werden gewichtet mit den Scores - zeigt wie wichtig das Wort ist für das Thema

In [14]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=keywords_train, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)
print('\nPerplexity: ', lda_model.log_perplexity(corpus)) # a measure of how good the model is. lower the better.

Coherence Score:  0.35419947296295884

Perplexity:  -7.407404001215185


In [15]:
LDAvis_data_filepath = os.path.join('/home/ubuntu/ullrich/my_code/data/ldavis_prepared_keywords')

if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + 'keywords.html')
LDAvis_prepared

- jede Blase repräsentiert ein Thema
- je größer die Blase. umso mehr taucht das Thema im Korpus auf (desto mehr Dokumente sind im korpus vorhanden)

### intellektuelle Zuordnung 
- Medizin = 2
- Landwirtschaft = 3
- Ernährung = 
- Umweltwissenschaften = 4
- Rest = 

### alte intellektuelle Zuordnung 
-> mehr trainingsdaten - war besser geordnet
- Medizin = 1
- Landwirtschaft = 5
- Ernährung = 4
- Umweltwissenschaften = 3
- Rest = 2

In [18]:
#save LDA model
temp_file = datapath('/home/ubuntu/ullrich/my_code/data/LDA_model/lda_model')
lda_model.save(temp_file)

## implementation testdata

In [9]:
#load LDA model
temp_file = datapath('/home/ubuntu/ullrich/my_code/data/LDA_model/lda_model')
lda = models.ldamodel.LdaModel.load(temp_file)

In [10]:
# get testdata into list
keywords_test = df_test['keywords_all'].tolist()

In [42]:
def to_id_corpus(data):
    # Create Dictionary
    id2word = corpora.Dictionary(data)
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data]
    return corpus, id2word

In [46]:
#predict topic 
def get_topic(liste, lda):
    to_pro = []
    corpus, id2w = to_id_corpus(liste)
    topic = lda.get_document_topics(corpus, minimum_probability=0.5, minimum_phi_value=None,
                                   per_word_topics=False)
    for t in topic:
            to_pro.append(t)
    return to_pro

In [47]:
topics = get_topic(keywords_test, lda)

In [53]:
df_test['topic'] = topics
df_test["topic"] = df_test["topic"].astype("str")
df_test["topic"] = df_test["topic"].replace(to_replace=r'[^\d|\.|\,]', value='', regex=True)
df_test["topic"] = df_test["topic"].replace('', np.nan)
df_test[["topic","certainty"]] =  df_test["topic"].apply(lambda x: pd.Series(str(x).split(",")))

In [54]:
df_test

Unnamed: 0,id1,agro_title,agro_abs,mesh_title,mesh_abs,keywords,keywords_all,combined,topic,certainty
338020,AGRICOLAIND20439684,[genetic markers],"[products, Glycine max, progeny, cartography, ...","[Genetic Markers, Genetic Markers, Wills, Soyb...","[Soybeans, Soybeans, Soybeans, Wills, Soybeans...","[[genetic markers], [products, Glycine max, pr...","[genetic markers, products, glycine max, proge...","genetic markers, products, glycine max, progen...",4,0.52435005
269016,M35399217,[evaluation],"[Momordica charantia, methods, glutathione, di...",,"[Rats, Cholesterol, Cholesterol, Methods, Meth...","[[evaluation], [Momordica charantia, methods, ...","[evaluation, momordica charantia, methods, glu...","evaluation, momordica charantia, methods, glut...",0,0.9852629
60575,AGRICOLAIND605620910,,"[economic distribution, approximation, approxi...",,"[Methods, Methods, Methods, Methods, Probabili...","[[economic distribution, approximation, approx...","[economic distribution, approximation, approxi...","economic distribution, approximation, approxim...",3,0.75499094
107730,M7452196,,"[environment, experimentation, iris (eye), iri...","[Cornea, Xenopus laevis, Xenopus laevis, Cornea]","[Environment, Iris, Iris, Iris, Iris, Role, Ro...","[[environment, experimentation, iris (eye), ir...","[environment, experimentation, iris (eye), iri...","environment, experimentation, iris (eye), iris...",2,0.98198366
107512,M7263312,,[Ghana],"[Burkitt Lymphoma, Burkitt Lymphoma]","[Patients, Patients, Risk, Risk, Ghana, Patien...","[[Ghana], [Burkitt Lymphoma, Burkitt Lymphoma]...","[ghana, burkitt lymphoma, burkitt lymphoma, pa...","ghana, burkitt lymphoma, burkitt lymphoma, pat...",3,0.96282005
...,...,...,...,...,...,...,...,...,...,...
301899,BASE::ftdoajarticles:oai:doaj.org/article:dc57...,,"[equipment, drugs, redmouth disease, statistic...","[Machine Learning, Machine Learning, Risk, Ris...","[Safety, Safety, Workflow, Workflow, Risk, Ris...","[[equipment, drugs, redmouth disease, statisti...","[equipment, drugs, redmouth disease, statistic...","equipment, drugs, redmouth disease, statistica...",,
187492,BASE::ftdoajarticles:oai:doaj.org/article:90c1...,"[buildings, Ghana]","[housing, buildings, buildings, buildings, law...","[Ghana, Ghana, Wheelchairs, Wheelchairs]","[Activities of Daily Living, Activities of Dai...","[[buildings, Ghana], [housing, buildings, buil...","[buildings, ghana, housing, buildings, buildin...","buildings, ghana, housing, buildings, building...",0,0.80783355
477499,M3100915,,,"[Shigella dysenteriae, Shiga Toxin, Shiga Toxi...","[Isoelectric Point, Cells, Cells, Cells, Cells...","[[Shigella dysenteriae, Shiga Toxin, Shiga Tox...","[shigella dysenteriae, shiga toxin, shiga toxi...","shigella dysenteriae, shiga toxin, shiga toxin...",,
277181,TIB729405060,[globalization],"[globalization, globalization]","[Internationality, Internationality]","[Internationality, Internationality, Internati...","[[globalization], [globalization, globalizatio...","[globalization, globalization, globalization, ...","globalization, globalization, globalization, i...",1,0.5591595


In [70]:
df_test = df_test.rename(columns={'id1':'dbrecordid'})
final_df = pd.merge(df_test, df_class, on=['dbrecordid'], how='inner')
final_df = final_df.drop(columns=['keywords'])

In [71]:
final_df

Unnamed: 0,dbrecordid,agro_title,agro_abs,mesh_title,mesh_abs,keywords_all,combined,topic,certainty,class
0,AGRICOLAIND20439684,[genetic markers],"[products, Glycine max, progeny, cartography, ...","[Genetic Markers, Genetic Markers, Wills, Soyb...","[Soybeans, Soybeans, Soybeans, Wills, Soybeans...","[genetic markers, products, glycine max, proge...","genetic markers, products, glycine max, progen...",4,0.52435005,Landwirtschaft
1,M35399217,[evaluation],"[Momordica charantia, methods, glutathione, di...",,"[Rats, Cholesterol, Cholesterol, Methods, Meth...","[evaluation, momordica charantia, methods, glu...","evaluation, momordica charantia, methods, glut...",0,0.9852629,ErnÃ¤hrung
2,AGRICOLAIND605620910,,"[economic distribution, approximation, approxi...",,"[Methods, Methods, Methods, Methods, Probabili...","[economic distribution, approximation, approxi...","economic distribution, approximation, approxim...",3,0.75499094,Rest
3,M7452196,,"[environment, experimentation, iris (eye), iri...","[Cornea, Xenopus laevis, Xenopus laevis, Cornea]","[Environment, Iris, Iris, Iris, Iris, Role, Ro...","[environment, experimentation, iris (eye), iri...","environment, experimentation, iris (eye), iris...",2,0.98198366,Umweltwissenschaften
4,M7263312,,[Ghana],"[Burkitt Lymphoma, Burkitt Lymphoma]","[Patients, Patients, Risk, Risk, Ghana, Patien...","[ghana, burkitt lymphoma, burkitt lymphoma, pa...","ghana, burkitt lymphoma, burkitt lymphoma, pat...",3,0.96282005,Rest
...,...,...,...,...,...,...,...,...,...,...
119630,BASE::ftdoajarticles:oai:doaj.org/article:dc57...,,"[equipment, drugs, redmouth disease, statistic...","[Machine Learning, Machine Learning, Risk, Ris...","[Safety, Safety, Workflow, Workflow, Risk, Ris...","[equipment, drugs, redmouth disease, statistic...","equipment, drugs, redmouth disease, statistica...",,,Rest
119631,BASE::ftdoajarticles:oai:doaj.org/article:90c1...,"[buildings, Ghana]","[housing, buildings, buildings, buildings, law...","[Ghana, Ghana, Wheelchairs, Wheelchairs]","[Activities of Daily Living, Activities of Dai...","[buildings, ghana, housing, buildings, buildin...","buildings, ghana, housing, buildings, building...",0,0.80783355,Umweltwissenschaften
119632,M3100915,,,"[Shigella dysenteriae, Shiga Toxin, Shiga Toxi...","[Isoelectric Point, Cells, Cells, Cells, Cells...","[shigella dysenteriae, shiga toxin, shiga toxi...","shigella dysenteriae, shiga toxin, shiga toxin...",,,ErnÃ¤hrung
119633,TIB729405060,[globalization],"[globalization, globalization]","[Internationality, Internationality]","[Internationality, Internationality, Internati...","[globalization, globalization, globalization, ...","globalization, globalization, globalization, i...",1,0.5591595,Umweltwissenschaften


In [72]:
final_df = final_df.drop(columns=['agro_title', 'agro_abs', 'mesh_title', 'mesh_abs'])

In [74]:
final_df = final_df.drop(columns=['keywords_all'])

In [76]:
final_df.to_csv('/home/ubuntu/ullrich/my_code/data/predicted_LDA.csv', sep=',')

diagramm

In [64]:
def count_class_pop(df):
    counted = pd.DataFrame()
    counted['class'] = df['class'].value_counts()
    counted["population"] = counted['class'].values / len(df)
    counted["pop_perc"] = counted['population'].values * 100
    lowest_c = counted.min()['class']
    return counted , lowest_c

In [65]:
counted, lowest_c = count_class_pop(final_df)

In [67]:
count_series =  final_df.groupby(['class','topic'])['class'].count()
count_series = count_series.to_frame()
count_series.columns = ['count']
count_series = count_series.reset_index()
print("Anzahl der Publiktationen pro Klasse:" ,int(lowest_c))
ax = sns.barplot(data=count_series,x="count",  y="class", orient="h", hue="topic")
ax.set(xlabel="Menge an Zugewiesenen Topics pro Klasse",ylabel="Klasse(Averbis) & Topic(LDA)")
ax.text(x=0.5, y=1.1, s='Klassifizierungsgenauigkeit des LDA Models anhand des Testdatensatzes',
        fontsize=13, weight='bold', 
        ha='center', va='bottom', transform=ax.transAxes)
ax.text(x=0.5, y=1.05, s="bei einer Testdatensatzgröße von " + str(len(df_test))+ " Publikationen und "+ str(num_topics)+' "Topics"',
        fontsize=8, alpha=0.75, ha='center', va='bottom', transform=ax.transAxes)

fig = ax.get_figure()
fig.savefig("grafiken\\Klasse_zu_pub_"+str(len(count_series))+"_topics_"+str(num_topics)+".png",dpi=300, bbox_inches = "tight") 
fig.clf()
count_series.to_csv("lda_score_csv\\topic_population_at_"+str(len(count_series))+"_topics_"+str(num_topics)+".csv")
num_topics = num_topics + 4

size = len(count_series)/2

Anzahl der Publiktationen pro Klasse: 22719


<Figure size 640x480 with 0 Axes>