In [128]:
import wordsegment
from bs4 import BeautifulSoup
import requests
import pickle
from tqdm import tqdm, tnrange, tqdm_notebook
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

from googletrans import Translator

In [12]:
import re
from collections import Counter

# Call wordsegment's loader once up front. get_uni_bi_grams() later clears and
# refills wordsegment.UNIGRAMS / wordsegment.BIGRAMS, which are presumably
# created/populated by this call -- TODO confirm against the wordsegment docs.
wordsegment.load()

# Runs of letters in any script: "\w" minus digits and the underscore.
# The previous ASCII-only class ([a-zA-Z]) split German words at umlauts and
# sharp s (e.g. "für" -> "f", "r"), which distorted the German frequency counts.
_WORD_PATTERN = re.compile(r'[^\W\d_]+')


def tokenize(text):
    """Lazily yield the words contained in ``text``.

    A word is a maximal run of alphabetic characters (Unicode-aware, so
    "über" and "daß" stay intact). Digits, underscores, punctuation and
    whitespace act as separators. Case is preserved.

    Parameters
    ----------
    text : str
        The text to split.

    Returns
    -------
    generator of str
        The words in order of appearance; empty for empty input.
    """
    return (match.group(0) for match in _WORD_PATTERN.finditer(text))

def pairs(iterable):
    """Yield every pair of adjacent items joined by a single space.

    ``pairs(['a', 'b', 'c'])`` yields ``'a b'`` and then ``'b c'``.

    Parameters
    ----------
    iterable : iterable of str
        The items to pair up; consumed lazily, exactly once.

    Yields
    ------
    str
        ``"<previous> <current>"`` for each adjacent pair. Nothing is yielded
        when the input has fewer than two items.
    """
    iterator = iter(iterable)
    try:
        previous = next(iterator)
    except StopIteration:
        # Empty input: a StopIteration escaping a generator body is turned
        # into RuntimeError (PEP 479), so finish cleanly instead.
        return
    for current in iterator:
        yield ' '.join((previous, current))
        previous = current

In [13]:
def get_uni_bi_grams(text):
    """Count unigram and bigram frequencies of ``text``.

    Side effect (kept from the original behaviour): wordsegment.UNIGRAMS and
    wordsegment.BIGRAMS are cleared and refilled with the counts of ``text``.

    Parameters
    ----------
    text : str
        The corpus to analyse; it is tokenized with ``tokenize``.

    Returns
    -------
    dict
        ``{'dictt': {'uni': ..., 'bi': ...}, 'listt': {'uni': ..., 'bi': ...}}``
        where the ``dictt`` entries map token / "w1 w2" pair to its count and
        the ``listt`` entries are ``(key, count)`` tuples sorted by descending
        count.
    """
    unigram_counts = Counter(tokenize(text))
    bigram_counts = Counter(pairs(tokenize(text)))

    wordsegment.UNIGRAMS.clear()
    wordsegment.UNIGRAMS.update(unigram_counts)

    wordsegment.BIGRAMS.clear()
    wordsegment.BIGRAMS.update(bigram_counts)

    uni = sorted(unigram_counts.items(), key=lambda item: item[1], reverse=True)
    big = sorted(bigram_counts.items(), key=lambda item: item[1], reverse=True)

    # Return independent copies rather than the shared wordsegment globals:
    # those are cleared on the next call, which used to silently overwrite the
    # 'dictt' part of every previously returned result.
    return {'dictt': {'uni': dict(unigram_counts), 'bi': dict(bigram_counts)},
            'listt': {'uni': uni, 'bi': big}}

## 1. English Books

In [15]:
%%time

# Name of the cache file (inside ./Data) holding the scraped English book URLs.
english_book_links_file_name = 'english_book_links.pkl'
english_book_links_path = os.path.join(os.curdir, 'Data', english_book_links_file_name)

try:
    # Reuse the cached list when available so the scrape below is skipped.
    # NOTE: pickle.load can execute arbitrary code -- only load cache files
    # that were written by this notebook.
    with open(english_book_links_path, 'rb') as infile:
        english_book_links = pickle.load(infile)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: scrape the Project Gutenberg subject pages.
    # (A bare ``except:`` would also have swallowed KeyboardInterrupt and
    # genuine programming errors.)
    english_link = ['http://www.gutenberg.org/ebooks/subject/7072',
                    'http://www.gutenberg.org/ebooks/subject/7072?start_index=26',
                    'http://www.gutenberg.org/ebooks/subject/3203',
                    'http://www.gutenberg.org/ebooks/subject/3203?start_index=26']

    english_book_links = []
    for page_link in tqdm_notebook(english_link, desc='1st loop'):
        response = requests.get(page_link)
        soup = BeautifulSoup(response.content, 'html.parser')
        book_links = soup.find_all('li', {'class': 'booklink'})
        for book_link in tqdm_notebook(book_links, desc='detail', leave=False):
            real_book_link = 'http://www.gutenberg.org' + book_link.a['href']
            response = requests.get(real_book_link)
            soup = BeautifulSoup(response.content, 'html.parser')
            # First anchor whose ``type`` attribute mentions text/plain.
            utf_a = soup.find('a', type=lambda x: x and 'text/plain' in x)
            if utf_a is None:
                # No plain-text download on this book page: skip it instead of
                # aborting the whole scrape with a TypeError.
                continue
            # Assumes the href is protocol-relative ('//host/path'), as the
            # recorded output suggests -- TODO confirm for the current site.
            english_book_links.append('http:' + utf_a['href'])

    # Make sure the cache directory exists so the scrape result is not lost.
    os.makedirs(os.path.dirname(english_book_links_path), exist_ok=True)
    with open(english_book_links_path, 'wb') as outfile:
        pickle.dump(english_book_links, outfile)

HBox(children=(IntProgress(value=0, description='1st loop', max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, description='detail', max=25), HTML(value='')))

HBox(children=(IntProgress(value=0, description='detail', max=11), HTML(value='')))

HBox(children=(IntProgress(value=0, description='detail', max=25), HTML(value='')))

HBox(children=(IntProgress(value=0, description='detail', max=25), HTML(value='')))


Wall time: 1min 42s


In [18]:
# Spot-check the first two collected English download URLs.
english_book_links[:2]

['http://www.gutenberg.org/ebooks/53416.txt.utf-8',
 'http://www.gutenberg.org/ebooks/34089.txt.utf-8']

In [19]:
%%time

# Name of the cache file (inside ./Data) holding the concatenated English corpus.
english_text_file_name = 'english_text.pkl'
english_text_path = os.path.join(os.curdir, 'Data', english_text_file_name)

try:
    # Reuse the cached corpus when available.
    # NOTE: pickle.load can execute arbitrary code -- only load cache files
    # that were written by this notebook.
    with open(english_text_path, 'rb') as infile:
        english_text = pickle.load(infile)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: download every book and concatenate the
    # texts. Collecting the parts and joining once avoids rebuilding an
    # ever-growing string on each iteration.
    english_book_texts = []
    for book_link in tqdm(english_book_links):
        response = requests.get(book_link)
        english_book_texts.append(response.text)
    english_text = ''.join(english_book_texts)

    # Make sure the cache directory exists so the download is not lost.
    os.makedirs(os.path.dirname(english_text_path), exist_ok=True)
    with open(english_text_path, 'wb') as outfile:
        pickle.dump(english_text, outfile)

100%|██████████████████████████████████████████| 86/86 [00:34<00:00,  1.99it/s]


Wall time: 35.4 s


In [21]:
%%time

# Count unigram and bigram frequencies over the whole English corpus
# (see get_uni_bi_grams for the layout of the returned dict).
eng_res = get_uni_bi_grams(english_text)

Wall time: 16.7 s


In [35]:
# Turn the sorted (key, count) lists of the English result into data frames.
eng_listt = eng_res['listt']
eng_uni = pd.DataFrame(data=eng_listt['uni'], columns=['word', 'freq'])
eng_bi = pd.DataFrame(data=eng_listt['bi'], columns=['pair', 'freq'])

## 2. German Books

In [25]:
%%time

# Name of the cache file (inside ./Data) holding the scraped German book URLs.
german_book_links_file_name = 'german_book_links.pkl'
german_book_links_path = os.path.join(os.curdir, 'Data', german_book_links_file_name)

try:
    # Reuse the cached list when available so the scrape below is skipped.
    # NOTE: pickle.load can execute arbitrary code -- only load cache files
    # that were written by this notebook.
    with open(german_book_links_path, 'rb') as infile:
        german_book_links = pickle.load(infile)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: scrape the Project Gutenberg subject pages.
    # (A bare ``except:`` would also have swallowed KeyboardInterrupt and
    # genuine programming errors.)
    german_link = ['http://www.gutenberg.org/ebooks/subject/18100',
                   'http://www.gutenberg.org/ebooks/subject/18100?start_index=26',
                   'http://www.gutenberg.org/ebooks/subject/18100?start_index=51']

    german_book_links = []
    for page_link in tqdm_notebook(german_link, desc='1st loop'):
        response = requests.get(page_link)
        soup = BeautifulSoup(response.content, 'html.parser')
        book_links = soup.find_all('li', {'class': 'booklink'})
        for book_link in tqdm_notebook(book_links, desc='detail', leave=False):
            real_book_link = 'http://www.gutenberg.org' + book_link.a['href']
            response = requests.get(real_book_link)
            soup = BeautifulSoup(response.content, 'html.parser')
            # First anchor whose ``type`` attribute mentions text/plain.
            utf_a = soup.find('a', type=lambda x: x and 'text/plain' in x)
            if utf_a is None:
                # No plain-text download on this book page: skip it instead of
                # aborting the whole scrape with a TypeError.
                continue
            # Assumes the href is protocol-relative ('//host/path'), as the
            # recorded output suggests -- TODO confirm for the current site.
            german_book_links.append('http:' + utf_a['href'])

    # Make sure the cache directory exists so the scrape result is not lost.
    os.makedirs(os.path.dirname(german_book_links_path), exist_ok=True)
    with open(german_book_links_path, 'wb') as outfile:
        pickle.dump(german_book_links, outfile)

Wall time: 1e+03 µs


In [26]:
# Spot-check the first three collected German download URLs.
german_book_links[:3]

['http://www.gutenberg.org/ebooks/29376.txt.utf-8',
 'http://www.gutenberg.org/ebooks/2403.txt.utf-8',
 'http://www.gutenberg.org/files/53628/53628-0.txt']

In [27]:
%%time

# Name of the cache file (inside ./Data) holding the concatenated German corpus.
german_text_file_name = 'german_text.pkl'
german_text_path = os.path.join(os.curdir, 'Data', german_text_file_name)

try:
    # Reuse the cached corpus when available.
    # NOTE: pickle.load can execute arbitrary code -- only load cache files
    # that were written by this notebook.
    with open(german_text_path, 'rb') as infile:
        german_text = pickle.load(infile)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: download every book and concatenate the
    # texts. Collecting the parts and joining once avoids rebuilding an
    # ever-growing string on each iteration.
    german_book_texts = []
    for book_link in tqdm(german_book_links):
        response = requests.get(book_link)
        german_book_texts.append(response.text)
    german_text = ''.join(german_book_texts)

    # Make sure the cache directory exists so the download is not lost.
    os.makedirs(os.path.dirname(german_text_path), exist_ok=True)
    with open(german_text_path, 'wb') as outfile:
        pickle.dump(german_text, outfile)


100%|██████████████████████████████████████████| 60/60 [00:14<00:00,  3.92it/s]


Wall time: 14.5 s


In [28]:
%%time

# Count unigram and bigram frequencies over the whole German corpus
# (see get_uni_bi_grams for the layout of the returned dict).
ger_res = get_uni_bi_grams(german_text)

Wall time: 8.05 s


In [29]:
# Turn the sorted (key, count) lists of the German result into data frames.
ger_listt = ger_res['listt']
ger_uni = pd.DataFrame(data=ger_listt['uni'], columns=['word', 'freq'])
ger_bi = pd.DataFrame(data=ger_listt['bi'], columns=['pair', 'freq'])

## 3. Change data frame to lower case

In [65]:
# Lower-case the key column of each frequency table in place
# ('word' for unigram tables, 'pair' for bigram tables).
for frame, column in ((eng_uni, 'word'),
                      (eng_bi, 'pair'),
                      (ger_uni, 'word'),
                      (ger_bi, 'pair')):
    frame[column] = frame[column].str.lower()

### Some words appear in both lower and upper case

For example, 'And' and 'and' should be counted as the same word; however, after lower-casing they still occupy separate rows in the dataframe, so their frequencies are summed per word below.

In [98]:
# Merge rows that share the same (lower-cased) key by summing their counts.
# reset_index() turns the group key back into a regular column.
eng_uni_temp = eng_uni.groupby(['word'])['freq'].sum().reset_index()
eng_bi_temp = eng_bi.groupby(['pair'])['freq'].sum().reset_index()

ger_uni_temp = ger_uni.groupby(['word'])['freq'].sum().reset_index()
ger_bi_temp = ger_bi.groupby(['pair'])['freq'].sum().reset_index()

In [102]:
# Order every aggregated table by descending frequency, in place.
# Note that the row labels are not renumbered by this sort.
for frame in (eng_uni_temp, eng_bi_temp, ger_uni_temp, ger_bi_temp):
    frame.sort_values(by=['freq'], inplace=True, ascending=False)

In [124]:
# Total number of counted unigrams / bigrams per language; used later as the
# denominators when normalising frequencies.
(total_freq_eng_uni,
 total_freq_eng_bi,
 total_freq_ger_uni,
 total_freq_ger_bi) = (
    table['freq'].sum()
    for table in (eng_uni_temp, eng_bi_temp, ger_uni_temp, ger_bi_temp)
)

## 4. Translation between different languages

In [129]:
# Create the googletrans Translator that the cells below use to translate the
# most frequent German words into English.
translator = Translator()

In [154]:
# Number of top-frequency words per language to translate and compare.
count_to_ana = 100

In [155]:
# Translate the first `count_to_ana` German words (German -> English) and keep
# the lower-cased translations, one per input word.
# NOTE(review): the recorded run of this cell failed with JSONDecodeError, and
# every following cell depends on `ger_translated_text` -- confirm the
# translation call works before relying on the results below.
ger_list = ger_uni_temp['word'].tolist()[:count_to_ana]
ger_translated = translator.translate(ger_list, src='de', dest='en')
ger_translated_text = [translation.text.lower() for translation in ger_translated]


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Show the German words and their English translations, one list per line.
print(ger_list[:100], ger_translated_text[:100], sep='\n')

In [None]:
# One row per translated German word: the English translation together with
# the frequency of the German original it came from.
ger_top_freqs = list(ger_uni_temp['freq'])[:count_to_ana]
ger_df = pd.DataFrame(ger_translated_text, columns=['word']).assign(
    freq_ger_to_eng=pd.Series(ger_top_freqs)
)

In [None]:
# (rows, columns) of the translated table before duplicates are merged.
ger_df.shape

In [None]:
# Several German words can translate to the same English word: add up their
# frequencies per English word and restore 'word' as a regular column.
ger_df_grouped = ger_df.groupby(['word']).sum().reset_index()

In [None]:
# (rows, columns) after merging identical English translations.
ger_df_grouped.shape

In [None]:
def _comma_join(words):
    """Join the German originals of one English word into 'a, b, c'."""
    return ', '.join(words)


# Pair every English translation with the German word it came from ...
ger_originals = list(ger_uni_temp['word'])[:count_to_ana]
ger_df_ori = pd.DataFrame(ger_translated_text, columns=['word'])
ger_df_ori['original german'] = pd.Series(ger_originals)

# ... then collect, per English word, all of its German originals.
ger_df_ori_grouped = (
    ger_df_ori.groupby(['word'])['original german']
    .apply(_comma_join)
    .reset_index()
)

In [None]:
# (rows, columns) of the English-word -> German-originals lookup table.
ger_df_ori_grouped.shape

In [None]:
# Keep the `count_to_ana` most frequent English words.
# Relies on eng_uni_temp already being sorted by descending 'freq'.
# The previous `.loc[0:count_to_ana]` was a *label* slice: after the sort the
# row labels are no longer in positional order, so it selected the rows lying
# between labels 0 and `count_to_ana` (possibly none) rather than the top rows,
# and label slices include the end label as well. head() is positional.
eng_df = eng_uni_temp[['word', 'freq']].head(count_to_ana)

In [None]:
# Inner-join the translated German frequencies with the English frequencies
# on the shared English word.
joined_1_df = pd.merge(
    ger_df_grouped,
    eng_df,
    on='word',
    suffixes=('_ger', '_eng'),
)
joined_1_df.head()

In [None]:
# Attach, for every English word, the German original(s) it was translated from.
joined_2_df = pd.merge(
    joined_1_df,
    ger_df_ori_grouped,
    on='word',
    suffixes=('_ger', '_eng'),
)
joined_2_df.head()

In [None]:
# Order the joined table by descending 'freq' (the column merged in from
# eng_df), modifying joined_2_df in place.
joined_2_df.sort_values(by=['freq'], inplace=True, ascending=False)

In [None]:
# Express both counts as a share of their language's total unigram count, then
# compare them: ratio = German share / English share.
eng_share = joined_2_df['freq'] / total_freq_eng_uni
ger_share = joined_2_df['freq_ger_to_eng'] / total_freq_ger_uni

joined_2_df['freq_normalized'] = eng_share
joined_2_df['freq_ger_normalized'] = ger_share
joined_2_df['ratio'] = ger_share / eng_share

In [None]:
# Renumber the rows from 0 in their current order. Because reset_index() is
# called without drop=True, the previous row labels are kept as an extra
# column (named 'index' when the index has no name). joined_2_df itself is
# not modified.
joined_2_df_sorted = joined_2_df.reset_index()
# Display the first 20 rows.
joined_2_df_sorted.head(20)

In [None]:
# Persist the final comparison table as a pickle inside ./Data.
save_joined_file = 'result.pkl'
# Fixed: the path previously referenced the misspelled name `save_joined_filee`,
# which raised NameError before anything was written.
with open(os.path.join(os.curdir, 'Data', save_joined_file), 'wb') as outfile:
    pickle.dump(joined_2_df_sorted, outfile)