In [122]:
### Agumentation
#### From the profiling of the dataset it is evident that a lot of words are not present in the italian vocabolary
#### For this reason, and because the data is highly unbalanced, agumentation is needed to increase italian datapoints and
####  words in its vocabolary to reduce eventual out-of-vocabolary inputs

##### LOGIC
##### For each [langueage selected] text in the dataset: 
#####   we translate the phrase and check its words
#####   if the words are all present in the italian vocabolary:
#####       we do not insert it in the dataset 
#####       (this is because inserting the phrase will not generate new knowlage)
#####       (We could say that the informative content of the new phrase is 0)
#####   else, 
#####   if there is at least one new word
####        we insert the new phrase

##### The dataset holds 698 italian text and 9639 not italian
##### We performe agumentation in the following way:
##### 1. Agumentation with english phrases from Lang_det_parsed.csv -> Dataset_agumented1.csv
##### 2. Agumentation with french phrases from Dataset_agumented1.csv-> Dataset_agumented2.csv
##### 3. Agumentation with spanish phrases from Dataset_agumented2.csv-> Dataset_agumented3.csv








##### We will use GoogleTranlator api (sorry =) )

from deep_translator import GoogleTranslator
import pandas as pd

In [123]:
dataset=pd.read_csv('..\\..\\Dataset\\Lang_det_parsed.csv')   #the parsed dataset
words=pd.read_csv('..\\..\\Dataset\\words.csv')     #csv holdeing all words in the dataset


#### English agumentation

In [124]:
#we retrive english phrases
english_text=dataset[dataset['Language']=='English']['Text']
english_text.shape


(1385,)

In [125]:
#we retrive all words that are present in both italian vocabolary and not italian vocabolary
#these words are encoded with 2 in the words.csv file
italian_words=set(words[words['where']==2]['Word'])
italian_words

{'a',
 'abstract',
 'academy',
 'acceso',
 'accurate',
 'ad',
 'agosto',
 'ai',
 'al',
 'alan',
 'algoritmo',
 'all',
 'alla',
 'alle',
 'alta',
 'alto',
 'america',
 'amo',
 'an',
 'and',
 'andrea',
 'android',
 'anno',
 'ars',
 'arte',
 'arthur',
 'association',
 'assume',
 'atroce',
 'aumento',
 'austria',
 'automaticamente',
 'award',
 'awards',
 'backpropagation',
 'bancario',
 'barack',
 'basa',
 'base',
 'bayesiana',
 'bbc',
 'bel',
 'bell',
 'ben',
 'biblioteca',
 'bibliotecario',
 'biografia',
 'blue',
 'bomis',
 'bradley',
 'brand',
 'bravo',
 'breiman',
 'britannica',
 'brockhaus',
 'buon',
 'businessweek',
 'bysa',
 'c',
 'california',
 'campana',
 'campo',
 'canada',
 'canto',
 'cara',
 'carne',
 'carta',
 'casa',
 'casi',
 'caso',
 'catch',
 'categoria',
 'categorie',
 'causa',
 'cd',
 'ce',
 'cell',
 'cena',
 'centrale',
 'centro',
 'ceo',
 'certamente',
 'certo',
 'check',
 'cielo',
 'ciffolilli',
 'cima',
 'cinema',
 'circa',
 'cita',
 'classe',
 'clic',
 'click',
 'cl

In [126]:
#For each phrase in eng, we translate it and then see if there is a new word using bloom filter
#pip install bloomfilter-py
from bloomfilter import BloomFilter
bloom_filter = BloomFilter(expected_insertions=len(italian_words)*2, err_rate=0.01)




In [127]:
#translation method

def translate_text(text, tranlator):
    out=tranlator.translate(text)
    return out


In [128]:
#filtering method
#if the word is not in the bloom filter,
#we return true and add it to the bloom filter
def handle_text(word,bloom_filter):
    if word not in bloom_filter:
        bloom_filter.put(word)
    return True


In [129]:
#splitting the english text into the single words
def split_text(text):
    return text.split(' ')

In [130]:
import re
import numpy as np
import string
def parse_text(text):
        if text!=np.nan:
                text=re.sub(r"[!#$%&\()*+,-./:;<=>?@[\\]^_`{|}~]",'',text)  #punctuations
                text = '' .join((z for z in text if not z.isdigit()))
                
                text=re.sub(r'"','',text)
                #special handling of the ' char
                text=re.sub(r"'",' ',text)
                text=re.sub(r'[[]]','',text)
                text=text.lower()
                text=text.strip()
                #second processing of the text
                text=text.translate(str.maketrans('','',string.punctuation))
                return text
        return '0'

In [131]:
#LOGIC
#we iterate all the english text  (A) and translate them in italian (B)
#we check if the the words in (B) there is a word not present in the italian vocabolary
# if B does not contain new words:
#       B is not inserted as a new phrase of the dataset
# else, if B contains at least a new word, then we insert B as a new italian phrase in the dataset
# and update the italian vocabolary

english_text=list(english_text)     # listing all the english phrases
tranlator=GoogleTranslator(source='en',target='it') #initializing the Google translator obj
new_text=[]     #list of all italian new phrases 
for text in english_text:
    translated=translate_text(text,tranlator)   #we translate the phrase (B)
    translated=parse_text(translated)
    words=split_text(translated)        #we split B 
    for w in words:
        if handle_text(w,bloom_filter): #we check if the current word (w) is new italian word or not

            new_text.append(translated)     #if w is a new word (not present in the italian vocabolary) we append it to the list of new
                                            #italian phrases
            break

print(len(new_text))    #we check the number of new phrases computed


1385


In [132]:
#creating a new Dataframe containing the new italian phrases computed
to_insert=[]
for text in new_text:
    number_of_words=len(text.split(' '))
    number_of_spaces=number_of_words-1
    label=1
    lang='Italian'
    entry=[text,lang,label,number_of_words,number_of_spaces]
    to_insert.append(entry)

columns=dataset.columns.to_list()[2:]
columns


['Text', 'Language', 'Class', 'Number of words', 'Number of spaces']

In [133]:
#we concatenate the old dataframe (not agumented) and the new dataframe (computed in the prev. cell)
new_dataset=pd.DataFrame(data=to_insert,columns=columns)
new_dataset=pd.concat([dataset,new_dataset],ignore_index=True,axis=0)

In [134]:
#we check the total of italian phrases present in the dataset used for training and evaluation
italian_entry_numb=new_dataset[new_dataset.Class == 1].shape[0]
italian_entry_numb



2083

In [135]:
italian_entry=new_dataset[new_dataset.Class==1]
italian_entry


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Text,Language,Class,Number of words,Number of spaces
7114,7114.0,7114.0,nature è una delle più antiche ed importanti r...,Italian,1,28,27
7115,7115.0,7115.0,viene pubblicata fin dal novembre,Italian,1,7,6
7116,7116.0,7116.0,nonostante la maggior parte delle riviste del ...,Italian,1,41,40
7117,7117.0,7117.0,molti sono gli avanzamenti e le scoperte prove...,Italian,1,83,82
7118,7118.0,7118.0,il fattore di impatto impact factor di questa ...,Italian,1,17,16
...,...,...,...,...,...,...,...
11717,,,qual è stato il tuo errore ti diamo da mangiar...,Italian,1,18,17
11718,,,narcisa ha cambiato i suoi modi ha lottato all...,Italian,1,41,40
11719,,,come il narcisismo ora marian ha detto a entra...,Italian,1,23,22
11720,,,ha lei immagino che non vorrebbe più pane d or...,Italian,1,14,13


### Checking stats after agumentation with english phrases

In [136]:
#Analyzing the cardinality of italian and not italian datapoints present in the dataset
italian_entry=new_dataset[new_dataset.Class == 1].shape[0]
other=new_dataset.shape[0]-italian_entry
total=new_dataset.shape[0]
print('total',total)
print('Non-italian', other)
print('Italian',italian_entry)

print('Total:',total, '100%')

print('Not italian:',other,int((other/total)*100),'%')
print('Italian:',italian_entry,int((italian_entry/total)*100),'%')

total 11722
Non-italian 9639
Italian 2083
Total: 11722 100%
Not italian: 9639 82 %
Italian: 2083 17 %


In [137]:
#saving the new dataset
new_dataset.to_csv('..\\..\\Dataset\\Dataset_agumented1.csv')

### French agumentation


In [138]:

french_text=dataset[dataset['Language']=='French']['Text']
french_text.shape

(1014,)

In [139]:
french_text=list(french_text)     # listing all the english phrases
tranlator=GoogleTranslator(source='fr',target='it') #initializing the Google translator obj
new_text=[]     #list of all italian new phrases 
for text in french_text:
    translated=translate_text(text,tranlator)   #we translate the phrase (B)
    translated=parse_text(translated)
    words=split_text(translated)        #we split B 
    for w in words:
        if handle_text(w,bloom_filter): #we check if the current word (w) is new italian word or not

            new_text.append(translated)     #if w is a new word (not present in the italian vocabolary) we append it to the list of new
                                            #italian phrases
            break

print(len(new_text))    #we check the number of new phrases computed


1014


In [140]:
#creating a new Dataframe containing the new italian phrases computed
to_insert=[]
for text in new_text:
    number_of_words=len(text.split(' '))
    number_of_spaces=number_of_words-1
    label=1
    lang='Italian'
    entry=[text,lang,label,number_of_words,number_of_spaces]
    to_insert.append(entry)

columns=dataset.columns.to_list()[2:]
columns

['Text', 'Language', 'Class', 'Number of words', 'Number of spaces']

In [141]:
#we concatenate the old dataframe (not agumented) and the new dataframe (computed in the prev. cell)
new_dataset_fr=pd.DataFrame(data=to_insert,columns=columns)
new_dataset=pd.concat([new_dataset_fr,new_dataset],ignore_index=True,axis=0)



In [142]:
#Analyzing the cardinality of italian and not italian datapoints present in the dataset
italian_entry=new_dataset[new_dataset.Class == 1].shape[0]
other=new_dataset.shape[0]-italian_entry
total=new_dataset.shape[0]
print('total',total)
print('Non-italian', other)
print('Italian',italian_entry)

print('Total:',total, '100%')

print('Not italian:',other,int((other/total)*100),'%')
print('Italian:',italian_entry,int((italian_entry/total)*100),'%')

total 12736
Non-italian 9639
Italian 3097
Total: 12736 100%
Not italian: 9639 75 %
Italian: 3097 24 %


In [143]:
new_dataset.to_csv('..\\..\\Dataset\\Dataset_agumented2.csv')

### Spanish agumentation


In [144]:

spanish_text=dataset[dataset['Language']=='Spanish']['Text']
spanish_text.shape

(819,)

In [145]:
spanish_text=list(spanish_text)     # listing all the spaish phrases
tranlator=GoogleTranslator(source='es',target='it') #initializing the Google translator obj
new_text=[]     #list of all italian new phrases 
for text in spanish_text:
    if text!='':
        try:
            translated=translate_text(text,tranlator)   #we translate the phrase (B)
            translated=parse_text(translated)
            words=split_text(translated)        #we split B 
            for w in words:
                if handle_text(w,bloom_filter): #we check if the current word (w) is new italian word or not

                    new_text.append(translated)     #if w is a new word (not present in the italian vocabolary) we append it to the list of new
                                                    #italian phrases
                    break
        except:
            break

print(len(new_text))    #we check the number of new phrases computed


277


In [146]:
print('Tot spanish phrase:',len(spanish_text))
print('Tot new italian phrases:',len(new_text))

Tot spanish phrase: 819
Tot new italian phrases: 277


In [147]:
#creating a new Dataframe containing the new italian phrases computed
to_insert=[]
for text in new_text:
    number_of_words=len(text.split(' '))
    number_of_spaces=number_of_words-1
    label=1
    lang='Italian'
    entry=[text,lang,label,number_of_words,number_of_spaces]
    to_insert.append(entry)

columns=dataset.columns.to_list()[2:]
columns

['Text', 'Language', 'Class', 'Number of words', 'Number of spaces']

In [148]:
#we concatenate the old dataframe (not agumented) and the new dataframe (computed in the prev. cell)
new_dataset_sp=pd.DataFrame(data=to_insert,columns=columns)
new_dataset=pd.concat([new_dataset_sp,new_dataset],ignore_index=True,axis=0)

In [149]:
#Analyzing the cardinality of italian and not italian datapoints present in the dataset
italian_entry=new_dataset[new_dataset.Class == 1].shape[0]
other=new_dataset.shape[0]-italian_entry
total=new_dataset.shape[0]
print('total',total)
print('Non-italian', other)
print('Italian',italian_entry)

print('Total:',total, '100%')

print('Not italian:',other,int((other/total)*100),'%')
print('Italian:',italian_entry,int((italian_entry/total)*100),'%')

total 13013
Non-italian 9639
Italian 3374
Total: 13013 100%
Not italian: 9639 74 %
Italian: 3374 25 %


In [150]:
new_dataset.to_csv('..\\..\\Dataset\\Dataset_agumented3.csv')

### NOTES
#### The process of agumentation con be done also using other languages (german, ...). This will increase the number of italian phrase reducing the effect of training with unbalanced data.

#### As the stats show 
#### English agumentation 6%->17% (+1385)
#### French aguementation 17%->24% (+1014)
#### Spanish agumentation 24%->25% (+277)
#### The agumentation process has increased the number of italian phrases from 6% to 25%
#### Better then before.
#### This process could be extented to other languages such as Portoguese, German, Russian
### Other options
#### Rather then translating the existing not italian phrases, we could use transforms to compute for each italina phrases a semantically similar phrase



