In [29]:
### Agumentation
#### From the Profiling it is evident that a lot of words are not present in the italian vocabolary
#### For this reason, and because the data is highly unbalanced, agumentation is needed to increase italian datapoints and
#### increase the words in its vocabolary

##### LOGIC
##### For each english text in the dataset: 
##### we translate the phrase and see its words
##### if the words are all present in the italian vocabolary, we do not insert it in the dataset
##### else, we insert the new phrase

##### The dataset holds 698 italian text and 9639 not italian


##### We will use GoogleTranlator api

from deep_translator import GoogleTranslator
import pandas as pd

In [57]:
dataset=pd.read_csv('..\\Dataset\\Lang_det_parsed.csv')   #the parsed dataset
words=pd.read_csv('..\\Dataset\\words.csv')     #csv holdeing all words in the dataset


In [58]:
#we retrive english text for now
english_text=dataset[dataset['Language']=='English']['Text']
english_text.shape


(1385,)

In [59]:
italian_words=set(words[words['where']==2]['Word'])
italian_words

{'&',
 "'",
 '-',
 '-.',
 '.',
 '.la',
 '[...]',
 ']',
 'a',
 'abstract',
 'academy',
 'acceso',
 'accurate',
 'ad',
 'agosto',
 'ai',
 'al',
 'alan',
 'algoritmo',
 'alla',
 'alle',
 'alta',
 'alto',
 'america',
 'amo',
 'an',
 'and',
 'andrea',
 'android.',
 'anno',
 'ars',
 'arthur',
 'assume',
 'atroce',
 'atroce.',
 'aumento',
 'austria',
 'automaticamente',
 'award',
 'awards',
 'bancario',
 'barack',
 'basa',
 'base',
 'bayesiana',
 'bbc',
 'bel',
 'ben',
 'biblioteca',
 'bibliotecario',
 'biografia',
 'blue',
 'bomis',
 'bradley',
 'brand',
 'bravo',
 'breiman',
 'britannica',
 'britannica.',
 'brockhaus',
 'buon',
 'businessweek',
 'by-sa',
 "c't",
 'c.',
 'california',
 'california.',
 'campana',
 'campo',
 'canto',
 'cara',
 'carne',
 'carta',
 'casa',
 'casa.',
 'casi',
 'caso',
 'catch-',
 'catch-.',
 'categoria',
 'categorie',
 'causa',
 'cd',
 'ce',
 'cell',
 'cena',
 'centrale',
 'centro',
 'ceo',
 'certamente',
 'certo',
 'check',
 'cielo',
 'ciffolilli',
 'cima',
 'ci

In [60]:
#For each phrase in eng, we translate it and then see if there is a new word using bloom filter
#pip install bloomfilter-py
from bloomfilter import BloomFilter
bloom_filter = BloomFilter(expected_insertions=len(italian_words)*2, err_rate=0.01)




In [63]:
#translation method

def translate_text(text, tranlator):
    out=tranlator.translate(text)
    return out


In [64]:
#filtering method
#if the word is not in the bloom filter,
#we return true and add it to the bloom filter
def handle_text(word,bloom_filter):
    if word not in bloom_filter:
        bloom_filter.put(word)
    return True


In [65]:
#splitting the english text
def split_text(text):
    return text.split(' ')

In [66]:
english_text=list(english_text)
tranlator=GoogleTranslator(source='en',target='it')
new_text=[]
for text in english_text:
    translated=translate_text(text,tranlator)
    words=split_text(translated)
    for w in words:
        if handle_text(w,bloom_filter):
            new_text.append(translated)
            break

print(len(new_text))


1385


In [73]:
#inserting the new italian text in the dataset
to_insert=[]
for text in new_text:
    number_of_words=len(text.split(' '))
    number_of_spaces=number_of_words-1
    label=1
    lang='Italian'
    entry=[text,lang,label,number_of_words,number_of_spaces]
    to_insert.append(entry)

columns=dataset.columns.to_list()[1:]
columns


['Text', 'Language', 'Class', 'Number of words', 'Number of spaces']

In [74]:
new_dataset=pd.DataFrame(data=to_insert,columns=columns)
new_dataset=pd.concat([dataset,new_dataset],ignore_index=True,axis=0)

In [77]:
#after agumetation the italian entrys are the following
italian_entry_numb=new_dataset[new_dataset.Class == 1].shape[0]
italian_entry_numb



2083

In [79]:
italian_entry=new_dataset[new_dataset.Class==1]
italian_entry


Unnamed: 0.1,Unnamed: 0,Text,Language,Class,Number of words,Number of spaces
7114,7114.0,nature è una delle più antiche ed importanti r...,Italian,1,27,26
7115,7115.0,viene pubblicata fin dal novembre .,Italian,1,7,6
7116,7116.0,nonostante la maggior parte delle riviste del ...,Italian,1,41,40
7117,7117.0,molti sono gli avanzamenti e le scoperte prove...,Italian,1,82,81
7118,7118.0,il fattore di impatto impact factor di questa ...,Italian,1,16,15
...,...,...,...,...,...,...
11717,,qual è stato il tuo errore vieni ti diamo da m...,Italian,1,19,18
11718,,"narcisa ha cambiato i suoi modi, all'inizio ha...",Italian,1,40,39
11719,,com'è il narcisismo ora Marian ha detto a entr...,Italian,1,22,21
11720,,ha lei immagino che non vorrebbe più pane d'or...,Italian,1,13,12


In [80]:
#saving the new dataset
new_dataset.to_csv('..\\Dataset\\Dataset_agumented1.csv')