In [None]:
# default_exp data

# data

> Helper functions for data exploration and datasets

In [None]:
#hide
from nbdev.showdoc import *
from bs4 import BeautifulSoup
from argostranslate import package, translate
import matplotlib.pyplot as plt
import pandas as pd
from prettytable import PrettyTable
import seaborn as sns
from tqdm import tqdm

In [None]:
#export
default_translator_path = 'translate-en_de-1_0.argosmodel'
def generate_translator(path=default_translator_path):
    '''
    Install an Argos Translate model package and return a translation object.

    Input:
        path[str]: path to file from https://www.argosopentech.com/argospm/index/

    Output:
        The translation from the first installed language to the second one,
        in the order reported by `translate.get_installed_languages()`.
    '''
    package.install_from_path(path)
    installed_languages = translate.get_installed_languages()
    # NOTE(review): the language pair is selected by list position, not by
    # language code. Presumably this yields English -> German when only the
    # default en_de model is installed -- confirm if other models are present.
    source_language, target_language = installed_languages[0], installed_languages[1]
    return source_language.get_translation(target_language)

By default, it creates an English-to-German translator.

In [None]:
#export
def strip_html(text):
    '''
    Parse `text` as HTML and return only its text content.
    '''
    return BeautifulSoup(text, "html.parser").get_text()

In [None]:
#export
class OriTraTranslation():
    '''
    Indexable view that pairs each original text with its translation.

    Translation happens on access (in `__getitem__`), so only the items that
    are actually requested get translated.
    '''
    def __init__(self, ori_text, path=default_translator_path):
        '''
        Input:
            ori_text: indexable collection of original texts
                (e.g. a list or a pandas Series)
            path[str]: path to file from https://www.argosopentech.com/argospm/index
        '''
        self.ori_text = ori_text
        self.translator = generate_translator(path)

    def __len__(self): return len(self.ori_text)

    def __getitem__(self, idx):
        '''
        Input:
            idx[int]: position of the text to translate

        Output:
            dict: { 'ori_text': original text, 'translated_text': translated text }
        '''
        # A pandas Series indexed with [] looks up by label, which fails for
        # positions 0..len-1 when the Series has a non-default index (e.g. a
        # filtered column). Use positional access so idx agrees with __len__.
        if isinstance(self.ori_text, pd.Series):
            text = self.ori_text.iloc[idx]
        else:
            text = self.ori_text[idx]
        return {
            'ori_text': text,
            'translated_text': self.translator.translate(text)
        }

**Output:**<br> 
    dict: { 'ori_text': original text, 'translated_text': translated text }
    

This is useful when you want to check a few translations without translating all of the text.

```
df = pd.DataFrame({'text': ['Hello world!', 'Welcome to year 2022.', 'Hello Alex, where did you born?']})
ori_translate_text = OriTraTranslation(df['text'], default_translator_path)
print(ori_translate_text[0])
print(ori_translate_text[1])
```
```
Output: 
{'ori_text': 'Hello world!', 'translated_text': 'Hallo Welt!'}
{'ori_text': 'Welcome to year 2022.', 'translated_text': 'Willkommen im Jahr 2022.'}
```

In [None]:
#export 
def create_ori_trans_dataframe(text_in, path=default_translator_path):
    '''
    Translate every text in `text_in` and collect the results in a DataFrame.

    Input:
        text_in: indexable, sized collection of original texts
        path[str]: path to the translation model file, passed on to
            `OriTraTranslation`

    Output: Pandas dataframe: {'ori_text': input text, 'trans_text': translated text}
    '''
    ori_sentences = []
    trans_sentences = []

    en_de_translation = OriTraTranslation(text_in, path)

    for idx in tqdm(range(len(text_in)), desc='Translating text : '):
        # Each lookup translates one text on demand.
        pair = en_de_translation[idx]
        ori_sentences.append(pair['ori_text'])
        trans_sentences.append(pair['translated_text'])

    # Build the frame in one step; dict order fixes the column order.
    return pd.DataFrame({'ori_text': ori_sentences, 'trans_text': trans_sentences})

Translates the whole corpus (all texts) and returns a pandas DataFrame with the original and translated text.

In [None]:
#export
def print_dataframe_table(df, n=3, schuffle=True):
    '''
    Build a PrettyTable showing up to n rows of df.

    Input:
        df: pandas DataFrame to display
        n[int]: number of rows to show; capped at the number of rows in df
        schuffle[bool]: if True, rows are shuffled first so the n rows shown
            are a random sample; if False, the first n rows are shown

    Output:
        PrettyTable with one column per DataFrame column
    '''
    if schuffle: df = df.sample(frac=1).reset_index(drop=True)
    keys = df.keys().to_list()
    ptable = PrettyTable()
    ptable.field_names = keys
    # Never ask for more rows than the frame holds.
    n_rows = max(0, min(n, len(df)))
    # Positional access (.iloc) so frames with a non-default index work
    # when schuffle=False; label lookup would raise KeyError there.
    plist = [[df[key].iloc[pos] for key in keys] for pos in range(n_rows)]
    ptable.add_rows(plist)

    return ptable

This function is useful when we want to compare long texts (pandas truncates long text when displaying a DataFrame).