# Translating DataFrames

In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import deepl

## DeepL API

In [29]:
# def read_yaml(filepath):
#     with open(filepath, 'r') as f: 
#         return yaml.safe_load(f)

# config =  read_yaml('../config.yaml')

In [50]:
import os

# The DeepL auth key is a secret and must not be stored in the notebook, so it
# is read from the DEEPL_AUTH_KEY environment variable.
# NOTE: a key was previously committed in this cell; treat it as leaked and
# revoke it in the DeepL account.
auth_key = os.environ.get('DEEPL_AUTH_KEY')
if not auth_key:
    raise RuntimeError('Set the DEEPL_AUTH_KEY environment variable to your DeepL API key.')
translator = deepl.Translator(auth_key)

# Testing: 
result = translator.translate_text("Hello, world!", target_lang="FR")
print(result.text)

## Translation

In [None]:
def translate(col):
    '''
    Translates the values of a column to American English with DeepL.

    Input: Column (any iterable of values) to be translated
    Output: Returns a list with one entry per input value, in input order.
            Non-empty strings are replaced by their translated text as a
            plain str. Every other value (e.g. NaN, None, '') is returned
            unchanged.
    '''
    # Source text -> translated text, so a value that occurs several times in
    # the column is only sent to the API once.
    cache = {}
    trans = []
    for val in col:
        # Missing cells and empty strings are passed through untouched.
        # NOTE(review): the DeepL client is expected to reject such input --
        # confirm against the deepl-python documentation.
        if not isinstance(val, str) or not val:
            trans.append(val)
            continue
        if val not in cache:
            # translate_text returns a result object; only its .text is kept
            # so the output can be compared and stored like any other string.
            cache[val] = translator.translate_text(val, target_lang="EN-US").text
        trans.append(cache[val])
    return trans
        

#### 19grams

In [52]:
# Loading the 19grams data from the data_clean folder
grams_csv = '../data_clean/19grams.csv'
grams = pd.read_csv(grams_csv)

In [70]:
# Harmonizing column names (in place, so `grams` itself carries the new names):
harmonized_names = {'coffee_variety': 'variety', 'review': 'flavour'}
grams.rename(columns=harmonized_names, inplace=True)

# Translating columns; each English version is stored in a new column of grams
for source_col, english_col in (('flavour', 'flavour_en'), ('coffee_country', 'origin_en')):
    grams[english_col] = translate(grams[source_col])

# Picking the columns relevant for the model and giving them their final names
# (mapping: column in grams -> column in df)
model_columns = {'coffee_title': 'coffee', 'origin_en': 'origin', 'flavour_en': 'flavour'}
df = grams[list(model_columns)]
df.columns = list(model_columns.values())

#df.to_csv('../data_flav_optimized_en/19grams.csv', index=True)

#### coffee friend

In [72]:
# Loading the coffee friend data from the data_clean folder
coffee_friend_csv = '../data_clean/coffee_friend.csv'
file = pd.read_csv(coffee_friend_csv)

In [84]:
# Translating columns and adding translated columns to df
file['flavour_en'] = translate(file.review_text)
file['coffee'] = translate(file.coffee_title)

# Picking columns relevant for the Model and renaming them in the same step.
# rename() is used in its non-inplace form: the column selection is derived
# from `file`, and modifying such a selection in place can make pandas emit a
# SettingWithCopyWarning.
df = file[['coffee', 'coffee_country', 'coffee_region', 'variety', 'production', 'roast', 'flavour_en']].rename(
    columns={'coffee_country': 'origin',
             'coffee_region': 'region',
             'production': 'process',
             'flavour_en': 'flavour'})


In [85]:
#df.to_csv('../data_flav_optimized_en/coffee_friends.csv', index=True)

#### Kaffeezentrale

In [87]:
# Loading the kaffeezentrale data from the data_clean folder
kaffeezentrale_csv = '../data_clean/kaffeezentrale.csv'
file = pd.read_csv(kaffeezentrale_csv)

In [94]:
# Translating the review text and the coffee title; each translation is
# stored in a new column of `file`
for english_col, source_col in (('flavour_en', 'review_text'), ('coffee', 'coffee_title')):
    file[english_col] = translate(file[source_col])

# Picking the columns relevant for the model ...
selected_columns = ['coffee', 'flavour_en']
df = file[selected_columns]

# ... and giving them their final names
final_names = ['coffee', 'flavour']
df.columns = final_names

In [95]:
#df.to_csv('../data_flav_optimized_en/kaffeezentrale.csv', index=True)

#### Kaffeothek

In [325]:
file = pd.read_csv('../data_clean/kaffeothek.csv')

# Manually correcting a translation error up front: the roast level 'Hell' is
# replaced by 'leicht' before translating (its translation is fixed up below).
file.loc[file.roast_level == 'Hell', 'roast_level'] = 'leicht' 

# Translating columns and adding translated columns to df
file['origin'] = translate(file.coffee_origin_country)
file['flavour'] = translate(file.review_text)
file['roast'] = translate(file.roast_level)

# Manually correcting translation error. The comparison is made on the string
# form of each value so that it also matches when translate() hands back DeepL
# result objects instead of plain strings.
# NOTE(review): deepl's TextResult appears to define __str__ but no __eq__, in
# which case a direct `== 'easy'` comparison never matches -- confirm.
file.loc[file.roast.astype(str) == 'easy', 'roast'] = 'light'

# Picking columns relevant for the Model. An explicit copy is taken because
# later cells add and drop columns on `df`.
df = file[['coffee_title', 'origin', 'coffee_variety', 'roast', 'flavour']].copy()

In [326]:
## PROC was scraped later in the data cleaning process;
## therefore the 2 dataframes (df, proc) need to be harmonized and concatenated:

In [329]:
proc = pd.read_csv('../data_raw/othek_process.csv')

# Dropping the rows of the new df that are not represented in the original df:
# first two rows by index label ...
for row_label in (2, 42):
    proc.drop(index=row_label, inplace=True)

# ... then every row of two coffees, looked up by name
for coffee_name in ('Suchan, Santos', 'Stoll, Honduras, Bio'):
    matching_rows = proc.index[proc.coffee == coffee_name]
    proc.drop(index=matching_rows, inplace=True)

# Cleaning the coffee names: line breaks and surrounding whitespace are removed
proc['coffee'] = [name.replace('\n', '').strip() for name in proc['coffee']]

# Checking if same Coffees are represented in both dfs (pairs are shown side by side):
list(zip(proc['coffee'].tolist(), df['coffee_title'].tolist()))

In [331]:
# Cleaning the PROCESS column: missing values become 'unknown', then line
# breaks and surrounding whitespace are removed.
proc.process= proc.process.fillna('unknown')
proc.process = [i.replace('\n', '').strip() for i in proc.process]

# Renumbering the rows of proc after the row drops. The column assignment
# below matches rows by index label.
# NOTE(review): this assumes df still carries the default 0..n-1 index it got
# from read_csv -- confirm.
proc = proc.reset_index()

# Replacing the coffee_title column of df with proc.coffee because it carries
# more information. A new frame is built instead of adding/dropping columns in
# place: df is a column selection of `file`, and in-place changes on such a
# selection can make pandas emit a SettingWithCopyWarning.
df = df.drop(columns='coffee_title').assign(coffee=proc.coffee)

In [332]:
# Frequency of each distinct REGION value
proc['region'].value_counts()

\nwashed                                                                     12
\nnatural                                                                     2
Finca Hartmann                                                                2
Enciso                                                                        1
\nfermentiert                                                                 1
Fazienda Passeio, Adolfo Henrique Vieira Ferreira                             1
Fazenda Pedra Preta                                                           1
La Serrania                                                                   1
Aman Adinew                                                                   1
Justin Musabyiama                                                             1
Cooperative San Carlos                                                        1
Finca Lovaina                                                                 1
Bedessa Washing Station                 

In [336]:
# Extracting all PROCESS terms from REGION:
# (some rows carry a processing method in the region/reg columns)
# 'washed, natural' was misspelled 'washed, naturla' before, so that value
# was never recognised.
ml = ['natural, washed', 'fermentiert, natural', 'washed, natural', 'fermentiert', 'natural', 'washed']

# Including the terms in PROCESS column. REGION is handled first, then REG, so
# a term found in REG wins when both columns hold one.
for col in ('region', 'reg'):
    proc[col] = proc[col].fillna('unknown')
    proc[col] = [val.replace('\n', '').strip().lower() for val in proc[col]]
    is_process_term = proc[col].isin(ml)
    # A single label-based assignment replaces the per-row
    # proc['process'][idx] = ... form: chained indexing may write to a
    # temporary copy and leave proc unchanged.
    proc.loc[is_process_term, 'process'] = proc.loc[is_process_term, col]

# Reduction PROCESS to washed, natural, anaerob, unknown
# (exact-match mapping; values that are not listed stay as they are)
process_groups = {
    'natural, washed': 'natural',
    'washed, natural': 'washed',
    'semi-washed': 'washed',
    'anaerobic fermentiert': 'anaerob',
    'washed, entkoffeiniert': 'washed',
    'fermentiert, natural': 'anaerob',
    'fermentiert': 'anaerob',
    'Sugar Cane Decaf': 'unknown',
}
proc['process'] = proc['process'].replace(process_groups)

# Stripping values:
proc['process'] = [str(i).strip() for i in proc.process]

# Creating final df. An explicit copy is taken so that adding the PROCESS
# column does not operate on a selection of df.
df_new = df[['coffee', 'origin', 'coffee_variety', 'roast', 'flavour']].copy()
df_new['process'] = proc.process
# rename() returns a new frame; the result has to be assigned, otherwise the
# column keeps its old name.
df_new = df_new.rename(columns={'coffee_variety': 'variety'})

In [338]:
# Manually translating a left out value (row with index label 0).
# .loc writes directly into df_new; the chained form df_new.origin[0] = ...
# may assign to a temporary copy and leave df_new unchanged.
df_new.loc[0, 'origin'] = 'Indonesia, Brazil, India'

df_new.head()

In [323]:
#df_new.to_csv('../data_flav_optimized_en/kaffeothek_inkl.csv', index=True)