In [12]:
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
import pandas as pd
import numpy as np
import spacy
from collections import Counter

In [13]:
# Read both source books (German and French) fully into memory.
# NOTE: despite the `_dir` suffix these are file paths, not directories.
ch_dir ='swiss_history_books/Der_Geschichten_schweizerischer_Eidgenos.txt'
fr_dir = 'swiss_history_books/Histoire_de_la_Confédération_Suisse-1.txt'

# The encoding is given explicitly: the texts contain non-ASCII characters
# (long s 'ſ', umlauts, French accents), and without it `open` falls back to
# the platform locale, which is not UTF-8 on every system.
with open(ch_dir, encoding='utf-8') as f:
    schweizer_geschichte = f.read()

with open(fr_dir, encoding='utf-8') as f:
    confederation_suisse = f.read()


In [14]:
# preprocessing
# Coarse part-of-speech tags whose tokens are kept by default.
pos_to_keep = ['ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB']

def lematization_pos_tagging(tagging_language_funtion, text, pos_tags=None):
    """Return the lemmas of the content-bearing, non-stop-word tokens of `text`.

    Parameters
    ----------
    tagging_language_funtion : callable
        Called as ``tagging_language_funtion(text)``; must return an iterable
        of tokens exposing ``lemma_``, ``pos_`` and ``is_stop`` attributes.
    text : str
        The raw text to analyse.
    pos_tags : iterable of str, optional
        Part-of-speech tags to keep. Defaults to the module-level
        ``pos_to_keep`` list when omitted.

    Returns
    -------
    list of str
        Lemmas in document order, one per token whose ``pos_`` is in the
        selected tags and whose ``is_stop`` flag is falsy.
    """
    # Build the lookup once so the membership test is O(1) per token.
    wanted_pos = frozenset(pos_to_keep if pos_tags is None else pos_tags)
    doc = tagging_language_funtion(text)
    return [
        token.lemma_
        for token in doc
        if token.pos_ in wanted_pos and not token.is_stop
    ]

# Load the spaCy language pipelines: German (medium) and French (small).
nlp_de, nlp_fr = (
    spacy.load(model_name)
    for model_name in ('de_core_news_md', 'fr_core_news_sm')
)

In [15]:
# Lemmatize the German and the French book and count lemma frequencies.

# German: the historical long s ('ſ') is mapped to a plain 's' before tagging.
schweizer_geschichte = schweizer_geschichte.replace('ſ', 's')
schweizer_geschichte_lematized = lematization_pos_tagging(
    nlp_de,
    schweizer_geschichte,
)
schweizer_tf = Counter(schweizer_geschichte_lematized)

# French: no character normalization is applied.
confederation_suisse_lemmatized = lematization_pos_tagging(
    nlp_fr,
    confederation_suisse,
)
confederation_tf = Counter(confederation_suisse_lemmatized)



In [55]:
# Rank the lemmas of each book by raw term frequency, most frequent first.
# Note: the *_top_100 lists hold the complete ranking; only the print
# statements below cut it down to the first 100 entries.
# `most_common()` without an argument returns every (lemma, count) pair sorted
# by descending count, ties staying in first-seen order.
schweizer_tf_top_100 = schweizer_tf.most_common()
confederation_tf_top_100 = confederation_tf.most_common()

for ranked_terms in (schweizer_tf_top_100, confederation_tf_top_100):
    print(ranked_terms[:100])




[('Regierung', 476), ('Kanton', 337), ('Schweiz', 291), ('Frankreich', 221), ('Truppe', 203), ('Bern', 198), ('Partei', 183), ('lassen', 175), ('Consul', 170), ('ward', 169), ('französisch', 167), ('Land', 163), ('Mann', 157), ('Verfassung', 147), ('fein', 139), ('alt', 138), ('Minister', 136), ('helvetischen', 134), ('Volk', 123), ('geben', 121), ('finden', 119), ('de', 118), ('Verninac', 113), ('Stadt', 107), ('erhalten', 103), ('Ver', 101), ('Helvetien', 94), ('Mitglied', 94), ('halten', 93), ('Min', 92), ('Paris', 91), ('Behörde', 90), ('bringen', 89), ('Wallis', 88), ('Abgeordnete', 85), ('Art', 84), ('Republik', 83), ('stellen', 83), ('Bürger', 83), ('Interesse', 82), ('bleiben', 81), ('Senat', 81), ('verlangen', 79), ('Seite', 79), ('erklären', 79), ('August', 75), ('Consuls', 73), ('Staat', 73), ('Vollz', 72), ('scheinen', 72), ('ziehen', 72), ('helvetische', 72), ('1.', 71), ('Reinhard', 71), ('Beschluß', 71), ('General', 70), ('Mai', 70), ('Gewalt', 70), ('öffentlich', 70), (

In [11]:
# Comparison corpus (German): read TEI files from the reference corpus,
# lemmatize each one and store the result as a plain-text file.
from tei_reader import TeiReader
import os
from xml.dom import minidom
import xml.etree.ElementTree as ET

reader = TeiReader()

dir_de_ref = 'swiss_history_books/reference_corpus_german/dta_kernkorpus_1700-1799_2020-07-20/'
dir_de_lemmatized = 'swiss_history_books/reference_corpus_german/dta_kernkorpus_1700-1799_2020-07-20_lemmatized/'

# Make sure the output folder exists so the write below cannot fail on a
# fresh checkout.
os.makedirs(dir_de_lemmatized, exist_ok=True)

text_corpus_de = []
# `os.listdir` returns entries in arbitrary order; sorting makes the choice of
# the first 50 files reproducible between runs and machines.
for files in sorted(os.listdir(dir_de_ref))[:50]:
    # Each file is parsed once by the TEI reader (the former additional
    # `minidom.parse` result was never used and only doubled the parsing work).
    corpora = reader.read_file(os.path.join(dir_de_ref, files))

    ref_text = corpora.text.replace('ſ', 's')#replace ſ with s
    # Drop single-character tokens and collapse all whitespace to single spaces.
    ref_text_cleaned = ' '.join([el for el in ref_text.split() if len(el) > 1])

    # Texts are cut at 1,000,000 characters before tagging
    # (presumably to stay within spaCy's default `max_length` - TODO confirm).
    ref_text_lematized = lematization_pos_tagging(nlp_de, ref_text_cleaned[:1000000])
    text_corpus_de.append(ref_text_lematized)

    # Replace the file extension with ".txt" for the lemmatized output.
    out_name = os.path.splitext(files)[0] + ".txt"
    with open(os.path.join(dir_de_lemmatized, out_name), "w", encoding="utf-8") as text_file:
        text_file.write(' '.join(ref_text_lematized))
    
    
    



In [5]:
# Old corpus version: a handful of Project Gutenberg e-texts serve as the
# non-Swiss reference (label 0); the Swiss history book is appended last
# with label 1.

german_hist_books_enum = [39669, 33090, 31294, 3060, 3061, 3062] # , 50870, 47081

# One [label, text] row per Gutenberg book, with header/footer boilerplate
# stripped, followed by the Swiss book.
german_hist_book = [
    [0, strip_headers(load_etext(book_id)).strip()]
    for book_id in german_hist_books_enum
]
german_hist_book += [[1, schweizer_geschichte]]

df = pd.DataFrame(german_hist_book, columns=['swiss', 'text'])


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF over all documents in `df` and rank the vocabulary by its weight
# in the Swiss document.
v = TfidfVectorizer()
x = v.fit_transform(df['text'])

df_tfidf = pd.DataFrame(x.toarray())

# Locate the Swiss document through its label instead of a hard-coded row
# number, so the cell keeps working when the reference book list changes.
swiss_row = int(df.index[df['swiss'] == 1][0])
swiss_tfidf = df_tfidf.loc[[swiss_row]]
# Transpose to one row per vocabulary index, highest TF-IDF weight first.
swiss_tfidf = swiss_tfidf.T.sort_values(by=swiss_row, ascending=False)

print(swiss_tfidf)
swiss_indexes = swiss_tfidf.index

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on old installations that do not provide it yet.
if hasattr(v, 'get_feature_names_out'):
    feature_names = v.get_feature_names_out()
else:
    feature_names = v.get_feature_names()

term_list = [feature_names[i] for i in swiss_indexes]
print(term_list[:100])




              6
12889  0.488124
12605  0.447411
53547  0.321533
64165  0.224139
12519  0.196297
...         ...
26615  0.000000
26616  0.000000
26617  0.000000
26618  0.000000
32267  0.000000

[64535 rows x 1 columns]
['die', 'der', 'und', 'ſich', 'den', 'zu', 'in', 'ſie', 'von', 'des', 'daß', 'für', 'ſo', 'das', 'er', 'iſt', 'an', 'mit', 'dem', 'über', 'eine', 'auf', 'dieſe', 'nicht', 'ein', 'regierung', 'erſten', 'schweiz', 'ſeine', 'durch', 'als', 'dieſer', 'es', 'man', 'um', 'war', 'ver', 'conſul', 'ſein', 'aus', 'im', 'nach', 'ſeiner', 'welche', 'franzöſiſchen', 'erſte', 'helvetiſchen', 'einer', 'ihre', 'bern', 'verfaſſung', 'ſelbſt', 'miniſter', 'ſind', 'einen', 'ich', 'nur', 'bei', 'aber', 'am', 'vom', 'verninac', 'unter', 'gegen', 'de', 'sie', 'ſei', 'be', 'wie', 'dieſes', 'ſeinen', 'min', 'truppen', 'werden', 'helvetien', 'hatte', 'frankreich', 'ge', 'dieſen', 'ihr', 'wallis', 'ten', 'wenn', 'dieſem', 'le', 'noch', '1802', '1801', 'haben', 'un', 'kantone', 'oder', 'einem', 'zw

In [None]:
# Print the complete vocabulary learned by the TF-IDF vectorizer.
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# when available. Converting to a list keeps the full, untruncated output.
print(list(
    v.get_feature_names_out()
    if hasattr(v, 'get_feature_names_out')
    else v.get_feature_names()
))