In [37]:
import wordsegment
from bs4 import BeautifulSoup
import requests
import pickle
from tqdm import tqdm
import pandas as pd
from googletrans import Translator

In [26]:
import re
from collections import Counter

wordsegment.load()

def tokenize(text):
    pattern = re.compile('[a-zA-Z]+')
    return (match.group(0) for match in pattern.finditer(text))

def pairs(iterable):
    iterator = iter(iterable)
    values = [next(iterator)]
    for value in iterator:
        values.append(value)
        yield ' '.join(values)
        del values[0]

In [27]:
def get_uni_bi_grams(text):
    wordsegment.UNIGRAMS.clear()
    wordsegment.UNIGRAMS.update(Counter(tokenize(text)))

    wordsegment.BIGRAMS.clear()
    wordsegment.BIGRAMS.update(Counter(pairs(tokenize(text))))
    
    unigrams = wordsegment.UNIGRAMS
    uni = list(wordsegment.UNIGRAMS.items())
    uni.sort(key=lambda x: x[1], reverse=True)

    bigrams = wordsegment.BIGRAMS
    big = list(wordsegment.BIGRAMS.items())
    big.sort(key=lambda x: x[1], reverse=True)
    
    return {'dictt': {'uni': unigrams, 'bi': bigrams},
            'listt': {'uni': uni, 'bi': big}}

English Books

In [18]:
english_book_links = ['https://www.gutenberg.org/ebooks/1342.txt.utf-8',
         'http://www.gutenberg.org/files/1342/1342.txt',
         'http://www.gutenberg.org/files/158/158.txt',
         'http://www.gutenberg.org/cache/epub/161/pg161.txt',
         'http://www.gutenberg.org/files/121/121-0.txt',
         'http://www.gutenberg.org/files/141/141-0.txt',
         'http://www.gutenberg.org/cache/epub/946/pg946.txt',
         'http://www.gutenberg.org/files/1212/1212-0.txt']

In [19]:
eng_text = ''
for book_link in tqdm(english_book_links):
    response = requests.get(book_link)
    eng_text = eng_text + response.text

100%|████████████████████████████████████████████| 8/8 [00:03<00:00,  3.23it/s]


In [23]:
eng_res = get_uni_bi_grams(eng_text)

In [28]:
eng_uni = pd.DataFrame(eng_res['listt']['uni'])
eng_bi = pd.DataFrame(eng_res['listt']['bi'])

In [32]:
eng_uni.head()

Unnamed: 0,0,1
0,the,28695
1,to,27585
2,of,24749
3,and,24422
4,I,14758


In [30]:
eng_bi.head()

Unnamed: 0,0,1
0,of the,3405
1,to be,2931
2,in the,2637
3,I am,1938
4,of her,1702


German Books

In [9]:
%%time

german_book_links_file_name = 'german_book_links.pkl'
try:
    with open(german_book_links_file_name, 'rb') as infile:
        german_book_links = pickle.load(infile)
except:
    german_link = ['http://www.gutenberg.org/ebooks/subject/18100', 
                   'http://www.gutenberg.org/ebooks/subject/18100?start_index=26',
                   'http://www.gutenberg.org/ebooks/subject/18100?start_index=51']
    

    german_book_links = []
    count = 0
    for page_link in german_link:
        response = requests.get(page_link)
        soup = BeautifulSoup(response.content, 'html.parser')
        book_links = soup.find_all('li', {'class': 'booklink'})
        print(len(book_links))
        for book_link in book_links:
            real_book_link = 'http://www.gutenberg.org' + book_link.a['href']
            response = requests.get(real_book_link)
            soup = BeautifulSoup(response.content, 'html.parser')
            utf_a = soup.find('a', type=lambda x: x and 'text/plain' in x)

            utf_link = utf_a['href']
            
            german_book_links.append('http:' + utf_link)
            count = count + 1
            print(str(count) + ' book links added.')
    
    
    with open(german_book_links_file_name, 'wb') as outfile:
        pickle.dump(german_book_links, outfile)

Wall time: 1 ms


In [10]:
german_book_links[:3]

['http://www.gutenberg.org/ebooks/29376.txt.utf-8',
 'http://www.gutenberg.org/ebooks/2403.txt.utf-8',
 'http://www.gutenberg.org/files/53628/53628-0.txt']

In [33]:
ger_text = ''
for book_link in tqdm(german_book_links):
    response = requests.get(book_link)
    ger_text = ger_text + response.text

100%|██████████████████████████████████████████| 60/60 [00:16<00:00,  2.84it/s]


In [34]:
ger_res = get_uni_bi_grams(ger_text)

In [45]:
ger_uni = pd.DataFrame(ger_res['listt']['uni'], columns=['word', 'freq'])
ger_bi = pd.DataFrame(ger_res['listt']['bi'], columns=['pair', 'freq'])

In [46]:
ger_uni.head()

Unnamed: 0,word,freq
0,und,107374
1,die,77401
2,der,74911
3,in,48830
4,er,44699


In [47]:
ger_bi.head()

Unnamed: 0,pair,freq
0,f r,8468
1,in der,7088
2,Project Gutenberg,4937
3,und die,4212
4,in die,4197


Translation between different languages

In [38]:
# initiate a translator
translator = Translator()


In [39]:
x1 = translator.translate('안녕하세요.')
x2 = translator.translate('안녕하세요.', dest='ja')
x3 = translator.translate('veritas lux mea', src='la')

In [50]:
ger_50 = translator.translate(list(ger_uni['word'])[:50], dest='en')

In [52]:
[i.text for i in ger_50]

['and',
 'the',
 'the',
 'in',
 'is',
 'to',
 'the',
 'the',
 'you',
 'Not',
 'yourself',
 'With',
 'I',
 'the',
 'gives',
 'on',
 'from',
 'on',
 'is',
 'so',
 'war',
 'you',
 'an',
 'is',
 'but',
 'as',
 'also',
 'when',
 'a',
 'f',
 'still',
 'r',
 'of the',
 'him',
 'would have',
 'just',
 'ber',
 'one',
 'h',
 'im',
 'him',
 'w',
 'the',
 'me',
 'was',
 'out',
 'one',
 'their',
 'his',
 'to']