# Implementierung Daten zum Vergleich

In [1]:
#imports
import pandas as pd
import sqlalchemy
from sqlalchemy import text
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from pandarallel import pandarallel
from tqdm import tqdm
stop_words = set(stopwords.words('english') + stopwords.words('german') + stopwords.words('french') + stopwords.words('spanish'))
nltk.download('wordnet')
from sqlalchemy import create_engine, select
import pprint

pandarallel.initialize(progress_bar=False,nb_workers=20)
tqdm.pandas()

import gensim as ge
import matplotlib.pyplot as plt
import seaborn as sns

import pyLDAvis
from pyLDAvis import gensim
from gensim import  models
import gensim.corpora as corpora
import pyLDAvis.gensim_models
from gensim.test.utils import datapath
from gensim.models import CoherenceModel
from sklearn.model_selection import train_test_split
import pickle
import os
import numpy as np

import itertools
from itertools import permutations
from sklearn import metrics

pyLDAvis.enable_notebook()

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
#get data from db
engine = create_engine('postgresql+psycopg2://postgres:5050@localhost:5432/postgres')
sql_query_class = 'SELECT dbrecordid, class FROM ke_stage.corpus_small'
df_class = pd.read_sql(sql_query_class, engine)

In [3]:
#Remove stopwords
def remove_special_chars(text):
    import re
    return re.sub('(^\{\")|(\"\}$)|(^\{)|(\}$)', '', text)


  return re.sub('(^\{\")|(\"\}$)|(^\{)|(\}$)', '', text)


In [4]:
#lemmatization of the words
def lemma(x):
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    x = x.lower()
    x = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)]
    return x

In [5]:
#preprocess
def gensim_pre(x):
    import gensim
    x = gensim.utils.simple_preprocess(str(x), deacc=True)
    return x

In [6]:
#create dataframe with preprocessed data
dfs=[]
collist= ["abstract","title"]
conn = engine.connect()
result = conn.execution_options(stream_results=True).execute(select(([text("""dbrecordid,
                                                                              abstract,
                                                                              title
                                                                              FROM  ke_stage.corpus_small
                                                                              """)] )))
while chunk:= result.fetchmany(100000): ## only get x rows at a time
  df = pd.DataFrame (chunk)
  for column in df:
    print(column)
    df[column] = df[column].astype(str)
    df[column] = df[column].parallel_apply(remove_special_chars)
    if column in collist:
        print("lemma")
        df[column] = df[column].parallel_apply(lemma)
        print("gensim")
        df[column] = df[column].parallel_apply(gensim_pre)
        df[column] = df[column].progress_apply(lambda x: ','.join([word for word in x if word not in (stop_words)]))
  dfs.append(df)
df_res = pd.concat(dfs, ignore_index=True)


dbrecordid
abstract
lemma
gensim


100%|████████████████████████████████| 100000/100000 [00:01<00:00, 66916.52it/s]


title
lemma
gensim


100%|███████████████████████████████| 100000/100000 [00:00<00:00, 423703.02it/s]


dbrecordid
abstract
lemma
gensim


100%|████████████████████████████████| 100000/100000 [00:01<00:00, 64096.64it/s]


title
lemma
gensim


100%|███████████████████████████████| 100000/100000 [00:00<00:00, 400346.29it/s]


dbrecordid
abstract
lemma
gensim


100%|████████████████████████████████| 100000/100000 [00:01<00:00, 57641.05it/s]


title
lemma
gensim


100%|███████████████████████████████| 100000/100000 [00:00<00:00, 400612.81it/s]


dbrecordid
abstract
lemma
gensim


100%|████████████████████████████████| 100000/100000 [00:01<00:00, 52860.04it/s]


title
lemma
gensim


100%|███████████████████████████████| 100000/100000 [00:00<00:00, 373492.34it/s]


dbrecordid
abstract
lemma
gensim


100%|████████████████████████████████| 100000/100000 [00:01<00:00, 61421.19it/s]


title
lemma
gensim


100%|███████████████████████████████| 100000/100000 [00:00<00:00, 391542.04it/s]


In [7]:
#merge data with averbis class
result = pd.merge(df_res, df_class, on=['dbrecordid'], how='inner')

In [8]:
#remove rows with class 'Rest'
result = result[result['class'] != 'Rest']

In [9]:
#get combined tokens from every column
result['combined'] = result[result.columns[1:2]].parallel_apply(lambda x: ','.join(x.astype(str)) ,axis=1)
result = result.drop(['title',
              'abstract'],axis =1 )
result = result[result["combined"].str.len() > 3]

In [10]:
result

Unnamed: 0,dbrecordid,class,combined
0,M14687872,Medizin,"background,commonly,used,medication,attention,..."
1,M28444815,Medizin,"aim,evaluation,efficacy,endoscopic,method,diag..."
2,M29405844,Medizin,"study,aimed,investigate,effect,bone,morphogene..."
3,M30710046,Medizin,"myeloid,cell,critical,orchestrating,regulated,..."
4,M31960582,Medizin,"introduction,brief,negative,symptom,scale,bnss..."
...,...,...,...
400267,M35717432,ErnÃ¤hrung,"meta,analysis,aimed,compare,effect,bariatric,s..."
400268,M32416796,ErnÃ¤hrung,"study,evaluated,growth,performance,immunity,je..."
400269,AGRICOLAIND605815273,ErnÃ¤hrung,"novel,experimental,data,physicochemical,proper..."
400271,BASE::ftdoajarticles:oai:doaj.org/article:06d5...,ErnÃ¤hrung,"background,postoperative,sore,throat,post,well..."


## train model LDA for different datasizes

In [11]:
#split text into tokens
def to_data(df):
    data=[]
    for row in tqdm(df['combined'].values):
        row = row.split(",")
        data.append(row)
    return data

In [12]:
def count_class_pop(df):
    counted = df['class'].value_counts()
    counted = counted.to_frame()
    counted["population"] = counted['class'].values / len(df)
    counted["pop_perc"] = counted['population'].values * 100
    lowest_c = counted.min()['class']
    return counted , lowest_c

In [13]:
def to_id_corpus(data):
    # Create Dictionary
    id2word = corpora.Dictionary(data)
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data]
    return corpus, id2word

In [14]:
#predict topic 
def get_topic(liste, lda):
    to_pro = []
    corpus, id2w = to_id_corpus(liste)
    topic = lda.get_document_topics(corpus, minimum_probability=0.5, minimum_phi_value=None,
                                   per_word_topics=False)
    for t in topic:
            to_pro.append(t)
    return to_pro

In [15]:
#define all possible combinations of the classes
classes = ['Medizin', 'Landwirtschaft', 'Umweltwissenschaften', 'ErnÃ¤hrung']
topics = [0,1,2,3] 

unique_combinations = []
permut = itertools.permutations(classes, len(topics))

for comb in permut:
    zipped = zip(comb, topics)
    unique_combinations.append(list(zipped))


In [16]:
#replace items in predicted list
def replace_items(pred_list, true_list):
    liste = list(pred_list)
    for i in range(len(liste)):
        for tupel in true_list:
            if liste[i] == str(tupel[1]):
                liste[i] = tupel[0]
    return liste

In [17]:
#create f1-score for every combination
def f1_score_(combinations, pred_list, true_list):
    result = []
    for combi in combinations:
        res = {}
        pred = replace_items(pred_list,combi)
        f1 = metrics.f1_score(true_list, pred, average='weighted')
        res.update({'Combi': combi,'f1_score': f1})
        result.append(res)
    return result

In [18]:
#get highest value of dictionary
def highest_val(lst, key, key2):
    highest_value = None
    for dict in lst:
        score = dict[key]
        combi = dict[key2]  
        if highest_value is None or score > highest_value:
            highest_value = score
            pred_combi = combi
    return highest_value, pred_combi

In [20]:
size = 50000
size2 = 200
while size > 100:
    df_temp = {} 
    res =[]
    df_med = result.loc[result['class'] == "Medizin"].head(int(size))
    df_land = result.loc[result['class'] =='Landwirtschaft'].head(int(size))         
    df_umwelt = result.loc[result['class'] =='Umweltwissenschaften'].head(int(size))  
    df_ern = result.loc[result['class'] =='ErnÃ¤hrung'].head(int(size))            
    df = pd.concat([df_med, df_land,df_umwelt,df_ern])
    #counted, lowest_c = count_class_pop(df)
    #split into train and test data
    df_train, df_test = train_test_split(df, test_size=0.25)
    #get list of keywords
    keywords_train = to_data(df_train)

    # Create Dictionary
    id2word = corpora.Dictionary(keywords_train)
    # Create Corpus
    keywords_str = keywords_train
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in keywords_str]

    # number of topics
    num_topics = 4
    # Build LDA model
    lda_model = ge.models.LdaMulticore(corpus=corpus,
                                        id2word=id2word,
                                        num_topics=num_topics
                                        )
    # Print the keywords in the 5 topics
    #pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]

    # get testdata into list
    keywords_test = to_data(df_test)
    #predict topics
    topics = get_topic(keywords_test, lda_model)
    df_test['topic'] = topics
    df_test["topic"] = df_test["topic"].astype("str")
    df_test["topic"] = df_test["topic"].replace(to_replace=r'[^\d|\.|\,]', value='', regex=True)
    df_test["topic"] = df_test["topic"].replace('', np.nan)
    df_test[["topic","certainty"]] =  df_test["topic"].apply(lambda x: pd.Series(str(x).split(",")))

    #get lists of pred and true values
    pred_test = df_test['topic'].values.tolist()
    true_test = df_test['class'].values.tolist()

    #get dictionary for each combination and every score
    res_dict = f1_score_(unique_combinations, pred_test, true_test)
    f1_all = [x['f1_score'] for x in res_dict] 
    f1, combi = highest_val(res_dict,'f1_score','Combi')

    # Compute Coherence Score
    #coherence_model_lda = CoherenceModel(model=lda_model, texts=keywords_train, dictionary=id2word, coherence='c_v')
    #coherence_lda = coherence_model_lda.get_coherence()
    # perplexity: a measure of how good the model is. lower the better.

    #perplexity = lda_model.log_perplexity(corpus)

    df_temp.update({'size': size, 'f1_all': f1_all, 'highest_f1_score': f1, 'combi': combi})
    res.append(df_temp)
    res = pd.DataFrame(res)
    res.to_csv('/vol/data/LDA_benchmark/LDA_scores_' + str(size) + '.csv')
    #res.to_sql('evaluation_LDA', engine, schema='ke_stage', if_exists='append', index=False)
    size = size - size2
    #print(size)

100%|████████████████████████████████| 150000/150000 [00:02<00:00, 59365.56it/s]
100%|██████████████████████████████████| 50000/50000 [00:01<00:00, 30406.08it/s]
100%|████████████████████████████████| 149400/149400 [00:02<00:00, 58546.25it/s]
100%|█████████████████████████████████| 49800/49800 [00:00<00:00, 102735.63it/s]
100%|███████████████████████████████| 148800/148800 [00:01<00:00, 108968.77it/s]
100%|█████████████████████████████████| 49600/49600 [00:00<00:00, 104015.72it/s]
100%|███████████████████████████████| 148200/148200 [00:01<00:00, 106359.44it/s]
100%|█████████████████████████████████| 49400/49400 [00:00<00:00, 102512.37it/s]
100%|███████████████████████████████| 147600/147600 [00:01<00:00, 108972.51it/s]
100%|██████████████████████████████████| 49200/49200 [00:00<00:00, 96847.32it/s]
100%|███████████████████████████████| 147000/147000 [00:01<00:00, 105595.22it/s]
100%|██████████████████████████████████| 49000/49000 [00:00<00:00, 98296.27it/s]
100%|███████████████████████

In [None]:
#res = pd.DataFrame(res)
#res.to_csv('/home/ubuntu/ullrich/BA_text_classification/data/LDA_scores_' + str(size2))

In [None]:
#save LDA model
temp_file = datapath('/home/ubuntu/ullrich/data/LDA_model/lda_model_4classes')
lda_model.save(temp_file)

In [None]:
#load LDA model
temp_file = datapath('/home/ubuntu/ullrich/my_code/data/LDA_model/lda_model_määx')
lda = models.ldamodel.LdaModel.load(temp_file)

In [24]:
df_test.to_csv('/home/ubuntu/ullrich/my_code/data/predicted_LDA_määäx.csv', sep=',')

## analysis

In [16]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=keywords_train, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)
print('\nPerplexity: ', lda_model.log_perplexity(corpus)) # a measure of how good the model is. lower the better.

Coherence Score:  0.36468614537058963

Perplexity:  -9.106080092428977


In [17]:
LDAvis_data_filepath = os.path.join('/home/ubuntu/ullrich/my_code/data/ldavis_prepared_keywords_max')

if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + 'keywords.html')
LDAvis_prepared

Coherence Score:  0.36468614537058963

Perplexity:  -9.106080092428977

(0.2674848618667972,
 [('Medizin', 0.0),
  ('Landwirtschaft', 1.0),
  ('Umweltwissenschaften', 2.0),
  ('Ernährung', 3.0)])

Coherence Score:  0.33641594383898077

Perplexity:  -7.416605622733863