

## Step 1: Reading the files from individual folders as part of data extraction

In [1]:
import glob
import csv
import pandas as pd
import re

def dataExtraction(path):
    """Collect the space-separated tokens of every file matching ``path``.

    Parameters
    ----------
    path : str
        Glob pattern selecting the text files to read, e.g. ``'dir/*.txt'``.
        A leading ``~`` is expanded to the user's home directory first,
        because ``glob.glob`` does not perform that expansion itself.

    Returns
    -------
    list of str
        Tokens from all matched files (files visited in sorted name order),
        with the markup tokens ``ID``, ``NAME``, ``SPEAKER``, ``CHAPTER``,
        ``LANGUAGE`` and ``<P>`` left out. Empty when no file matches.
    """
    import os  # function-scope import: only needed for '~' expansion

    # Tokens that are corpus markup rather than language content.
    markup = {'ID', 'NAME', 'SPEAKER', 'CHAPTER', 'LANGUAGE', '<P>'}

    text_add = []
    # Sorting makes the token order independent of the directory listing
    # order, which glob does not guarantee.
    for fileName in sorted(glob.glob(os.path.expanduser(path))):
        # The context manager closes each file as soon as it has been read.
        with open(fileName, encoding='utf8') as handle:
            contents = handle.read()
        # Split on single spaces only, as before: newlines and tabs stay
        # inside tokens and are normalised later by the noise-removal step.
        text_add.extend(
            word for word in contents.split(' ') if word not in markup
        )
    return text_add

## Step 2: Removing noise from Training data

This includes removing punctuation and digits from the extracted text.

In [149]:
def removeNoise(data, column=0):
    """Strip markup, punctuation and digits from one text column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``column`` holds strings. Note that this column is
        overwritten in place on the frame that is passed in.
    column : hashable, default 0
        Label of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        The rows whose cleaned text is non-empty, with duplicate rows and
        rows containing missing values removed.
    """
    # Applied in this order; every match is replaced by a single space.
    # The markup words are matched anywhere, including inside longer words.
    patterns = (
        r'\s',
        'ID', 'NAME', 'SPEAKER', 'CHAPTER', 'LANGUAGE', '<P>',
        '/',
        # Same character set as the former '[=<>":-;.,\(\)]': there ':-;'
        # was a range covering only ':' and ';', so '-' is NOT stripped.
        r'[=<>":;.,()]',
        r'[0-9]',
    )

    cleaned = data[column]
    for pattern in patterns:
        # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would leave the noise in place.
        cleaned = cleaned.str.replace(pattern, ' ', regex=True)
    data[column] = cleaned.str.strip()

    data = data[data[column] != '']
    data = data.drop_duplicates()
    return data.dropna()


## Step 3: Removing noise from Test data

We treat the test data exactly as we treated the training data, removing characters such as punctuation and digits that add noise to the test data.

In [3]:
def removeNoiseTest(data, column=1):
    """Clean the text column of the test frame the same way as training data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``column`` holds the text to clean. Note that this
        column is overwritten in place on the frame that is passed in.
    column : hashable, default 1
        Label of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        The rows whose cleaned text is non-empty, with duplicate rows and
        rows containing missing values removed.
    """
    # Applied in this order; every match is replaced by a single space.
    # The markup words are matched anywhere, including inside longer words.
    patterns = (
        r'\s',
        'ID', 'NAME', 'SPEAKER', 'CHAPTER', 'LANGUAGE', '<P>',
        '/',
        # Same character set as the former '[=<>":-;.,\(\)]': there ':-;'
        # was a range covering only ':' and ';', so '-' is NOT stripped.
        r'[=<>":;.,()]',
        r'[0-9]',
    )

    cleaned = data[column]
    for pattern in patterns:
        # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would leave the noise in place.
        cleaned = cleaned.str.replace(pattern, ' ', regex=True)
    data[column] = cleaned.str.strip()

    data = data[data[column] != '']
    data = data.drop_duplicates()
    return data.dropna()

## Language Detection

List of all the languages whose detection is supported:
- 'bg': Bulgarian
- 'cs': Czech
- 'da': Danish
- 'de': German
- 'el': Greek, Modern 
- 'en': English
- 'es': Spanish
- 'et': Estonian
- 'fi': Finnish
- 'fr': French
- 'hu': Hungarian
- 'it': Italian
- 'lt': Lithuanian
- 'lv': Latvian
- 'nl': Dutch
- 'pl': Polish
- 'pt': Portuguese
- 'ro': Romanian
- 'sk': Slovak
- 'sl': Slovenian
- 'sv': Swedish

There are therefore 21 classes (language labels) that our classifier needs to be able to identify correctly.

In [4]:
# Language codes the classifier predicts, one per supported language,
# in alphabetical order.
labels = [
    'bg', 'cs', 'da', 'de', 'el', 'en', 'es',
    'et', 'fi', 'fr', 'hu', 'it', 'lt', 'lv',
    'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sv',
]

## Step 4: Function used to join all words held in a dataframe for a given language into a single space-separated string

In [8]:
def listToText(Series):
    """Return the entries of column 0 joined into one space-separated string.

    ``Series`` is indexed with ``[0]``; in this notebook it is a one-column
    DataFrame of words for a single language.
    """
    words = Series[0].tolist()
    return ' '.join(words)

### Implementing Step 1 and Step 2 on raw text files to obtain Training data

In [9]:
path_bg= '~/startup_ml/europarl/txt/bg/*.txt'
a1= dataExtraction(path_bg)

In [10]:
b1= pd.DataFrame(a1)

In [147]:
data_bg= removeNoise(b1)
data_bg.head(1)

Unnamed: 0,0
1,Състав


In [12]:
path_cs= '~/startup_ml/europarl/txt/cs/*.txt'
a2= dataExtraction(path_cs)

In [13]:
b2= pd.DataFrame(a2)

In [14]:
data_cs= removeNoise(b2)
data_cs.head(1)

Unnamed: 0,0
1,Schválení


In [15]:
path_da= '~/startup_ml/europarl/txt/da/*.txt'
a3= dataExtraction(path_da)

In [16]:
b3= pd.DataFrame(a3)

In [17]:
data_da= removeNoise(b3)
data_da.head(1)

Unnamed: 0,0
1,Genoptagelse


In [18]:
path_de= '~/startup_ml/europarl/txt/de/*.txt'
a4= dataExtraction(path_de)

In [19]:
b4= pd.DataFrame(a4)

In [20]:
data_de= removeNoise(b4)
data_de.head(1)

Unnamed: 0,0
1,Wiederaufnahme


In [21]:
path_el= '~/startup_ml/europarl/txt/el/*.txt'
a5= dataExtraction(path_el)

In [22]:
b5= pd.DataFrame(a5)

In [23]:
data_el= removeNoise(b5)
data_el.head(1)

Unnamed: 0,0
1,Επαvάληψη


In [24]:
path_en= '~/startup_ml/europarl/txt/en/*.txt'
a6= dataExtraction(path_en)

In [25]:
b6= pd.DataFrame(a6)

In [26]:
data_en= removeNoise(b6)
data_en.head(1)

Unnamed: 0,0
1,Resumption


In [27]:
path_es= '~/startup_ml/europarl/txt/es/*.txt'
a7= dataExtraction(path_es)

In [28]:
b7= pd.DataFrame(a7)

In [29]:
data_es= removeNoise(b7)
data_es.head(1)

Unnamed: 0,0
1,Reanudación


In [30]:
path_et= '~/startup_ml/europarl/txt/et/*.txt'
a8= dataExtraction(path_et)

In [31]:
b8= pd.DataFrame(a8)

In [32]:
data_et= removeNoise(b8)
data_et.head(1)

Unnamed: 0,0
1,Eelmise


In [33]:
path_fi= '~/startup_ml/europarl/txt/fi/*.txt'
a9= dataExtraction(path_fi)

In [34]:
b9= pd.DataFrame(a9)

In [35]:
data_fi= removeNoise(b9)
data_fi.head(1)

Unnamed: 0,0
1,Istuntokauden


In [36]:
path_fr= '~/startup_ml/europarl/txt/fr/*.txt'
a10= dataExtraction(path_fr)

In [37]:
b10= pd.DataFrame(a10)

In [38]:
data_fr= removeNoise(b10)
data_fr.head(1)

Unnamed: 0,0
1,Reprise


In [39]:
path_hu= '~/startup_ml/europarl/txt/hu/*.txt'
a11= dataExtraction(path_hu)

In [40]:
b11= pd.DataFrame(a11)

In [41]:
data_hu= removeNoise(b11)
data_hu.head(1)

Unnamed: 0,0
1,Az


In [42]:
path_it= '~/startup_ml/europarl/txt/it/*.txt'
a12= dataExtraction(path_it)

In [43]:
b12= pd.DataFrame(a12)

In [44]:
data_it= removeNoise(b12)
data_it.head(1)

Unnamed: 0,0
1,Ripresa


In [45]:
path_lt= '~/startup_ml/europarl/txt/lt/*.txt'
a13= dataExtraction(path_lt)

In [46]:
b13= pd.DataFrame(a13)

In [47]:
data_lt= removeNoise(b13)
data_lt.head(1)

Unnamed: 0,0
1,Ankstesnio


In [48]:
path_lv= '~/startup_ml/europarl/txt/lv/*.txt'
a14= dataExtraction(path_lv)

In [49]:
b14= pd.DataFrame(a14)

In [50]:
data_lv= removeNoise(b14)
data_lv.head(1)

Unnamed: 0,0
1,Sēdē


In [51]:
path_nl= '~/startup_ml/europarl/txt/nl/*.txt'
a15= dataExtraction(path_nl)

In [52]:
b15= pd.DataFrame(a15)

In [53]:
data_nl= removeNoise(b15)
data_nl.head(1)

Unnamed: 0,0
1,Hervatting


In [54]:
path_pl= '~/startup_ml/europarl/txt/pl/*.txt'
a171= dataExtraction(path_pl)

In [55]:
b171= pd.DataFrame(a171)

In [56]:
data_pl= removeNoise(b171)
data_pl.head(1)

Unnamed: 0,0
1,Zatwierdzenie


In [57]:
path_pt= '~/startup_ml/europarl/txt/pt/*.txt'
a17= dataExtraction(path_pt)

In [58]:
b17= pd.DataFrame(a17)

In [59]:
data_pt= removeNoise(b17)
data_pt.head(1)

Unnamed: 0,0
1,Reinício


In [60]:
path_ro= '~/startup_ml/europarl/txt/ro/*.txt'
a18= dataExtraction(path_ro)

In [61]:
b18= pd.DataFrame(a18)

In [62]:
data_ro= removeNoise(b18)
data_ro.head(1)

Unnamed: 0,0
1,Componenţa


In [63]:
path_sk= '~/startup_ml/europarl/txt/sk/*.txt'
a19= dataExtraction(path_sk)

In [64]:
b19= pd.DataFrame(a19)

In [65]:
data_sk= removeNoise(b19)
data_sk.head(1)

Unnamed: 0,0
1,Schválenie


In [66]:
path_sl= '~/startup_ml/europarl/txt/sl/*.txt'
a20= dataExtraction(path_sl)

In [67]:
b20= pd.DataFrame(a20)

In [68]:
data_sl= removeNoise(b20)
data_sl.head(1)

Unnamed: 0,0
1,Sprejetje


In [69]:
path_sv= '~/startup_ml/europarl/txt/sv/*.txt'
a21= dataExtraction(path_sv)

In [70]:
b21= pd.DataFrame(a21)

In [71]:
data_sv= removeNoise(b21)
data_sv.head(1)

Unnamed: 0,0
1,Återupptagande


### Implementing Step 4

In [77]:
data_bg_str= listToText(data_bg)

In [78]:
data_cs_str= listToText(data_cs)

In [79]:
data_da_str= listToText(data_da)

In [80]:
data_de_str= listToText(data_de)

In [81]:
data_el_str= listToText(data_el)

In [82]:
data_en_str= listToText(data_en)

In [83]:
data_es_str= listToText(data_es)

In [84]:
data_et_str= listToText(data_et)

In [85]:
data_fi_str= listToText(data_fi)

In [86]:
data_fr_str= listToText(data_fr)

In [87]:
data_hu_str= listToText(data_hu)

In [88]:
data_it_str= listToText(data_it)

In [89]:
data_lt_str= listToText(data_lt)

In [90]:
data_lv_str= listToText(data_lv)

In [91]:
data_nl_str= listToText(data_nl)

In [92]:
data_pl_str= listToText(data_pl)

In [93]:
data_pt_str= listToText(data_pt)

In [94]:
data_ro_str= listToText(data_ro)

In [95]:
data_sk_str= listToText(data_sk)

In [96]:
data_sl_str= listToText(data_sl)

In [97]:
data_sv_str= listToText(data_sv)

In [98]:
# One joined training string per language, in the same order as `labels`.
t = [
    data_bg_str, data_cs_str, data_da_str, data_de_str, data_el_str,
    data_en_str, data_es_str, data_et_str, data_fi_str, data_fr_str,
    data_hu_str, data_it_str, data_lt_str, data_lv_str, data_nl_str,
    data_pl_str, data_pt_str, data_ro_str, data_sk_str, data_sl_str,
    data_sv_str,
]

## Step 5: Converting the list 't', which holds one joined string per language, into a single dataframe.

This dataframe "trainingData" is the Training data that we train our model on.


In [99]:
# One row per language; the joined text ends up in column 0.
trainingData = pd.DataFrame(data=t)

In [182]:
trainingData

Unnamed: 0,0
0,Състав на Парламента вж протоколи Одобряване п...
1,Schválení zápisu z předchozího zasedání viz zá...
2,Genoptagelse af sessionen Formanden Jeg erkl...
3,Wiederaufnahme der Sitzungsperiode Die Präside...
4,Επαvάληψη της συvσδoυ Πρόεδρος Κηρύσσω την ε...
5,Resumption of the session President I declar...
6,Reanudación del período de sesiones La Preside...
7,Eelmise istungi protokolli kinnitamine vaata P...
8,Istuntokauden uudelleenavaaminen Puhemies Juli...
9,Reprise de la session La Présidente Je décla...


## Step 6: Building the model and training it with the training data

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

### Note:
#### Character frequency analysis is undertaken using a logistic regression model. Character unigrams and bigrams (`analyzer='char'`, `ngram_range=(1,2)`) are used as features, not word pairs. We use a pipeline to implement this model and we use all CPU cores (`n_jobs=-1`) to build it. L2 regularization is used to limit overfitting of the model to the training data. The inverse of regularization strength "C" is left at its default of 1.0. This helps the model generalize to new, unseen data.




In [150]:

# Character uni-/bi-gram counts -> term frequencies without IDF weighting
# -> logistic regression using all available CPU cores.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='char', ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('lrg', LogisticRegression(n_jobs=-1)),
])


In [151]:
# Fit on one training document per language; `labels` supplies the targets.
basicmodel = text_clf.fit(trainingData[0], y=labels)

### Implementing Step 3, preprocessing the Test data 

In [152]:
# Tab-separated test set without a header row, so columns are labelled 0 and 1.
datatest = pd.read_csv(
    '~/startup_ml/europarl-test/europarl.csv',
    header=None,
    sep='\t',
)

In [153]:
# Drop parenthesised asides before the shared noise removal.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every "(...)" span in the text.
# The raw string avoids the invalid '\(' escape sequence.
datatest[1] = datatest[1].str.replace(r'\(.*?\)', '', regex=True)
testData = removeNoiseTest(datatest)

In [154]:
testData.head(5)

Unnamed: 0,0,1
0,bg,Европа не трябва да стартира нов конкурен...
1,bg,Най-голямата несправедливост на сегашната обща...
2,bg,Г-жо председател г-н член на Комисията по пр...
3,bg,Г-н председател бих искал да започна с комент...
4,bg,Г-н председател въпросът за правата на човека...


## Step 7: Using the trained model to make predictions on the Test data

In [155]:
# Predict a language code for every cleaned test string (column 1).
prediction = basicmodel.predict(X=testData[1])
prediction

array(['bg', 'bg', 'bg', ..., 'sv', 'hu', 'pt'], 
      dtype='<U2')

## Analysis of result

The crosstab below shows us the false positives and false negatives, which give us some insight into the similarity between languages. "P" stands for Predicted values and "A" stands for Actual values in the crosstab. 
- 502 strings in Slovak were misclassified as Czech. This points to the two languages being highly similar. This makes sense, since the Czech Republic and Slovakia have a shared history that contributes to similarities between the two languages spoken in this region. 

Similarly the following prominent trends emerged:
- 170 Spanish strings were misclassified as Portuguese
- 73 Italian strings were misclassified as Romanian
- 57 Swedish strings were misclassified as Danish
- 53 Italian strings were misclassified as Portuguese
- 43 Danish strings were misclassified as Dutch

In [177]:
# Confusion table: rows ("A") are actual languages, columns ("P") are
# predicted languages; margins=True adds the "All" totals.
CT = pd.crosstab(
    index=testData[0],
    columns=prediction,
    rownames=["A"],
    colnames=["P"],
    margins=True,
)
CT

P,bg,cs,da,de,el,en,es,et,fi,fr,...,lt,lv,nl,pl,pt,ro,sk,sl,sv,All
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bg,997,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,997
cs,0,970,0,0,0,0,1,2,1,0,...,3,0,1,3,1,1,1,2,1,993
da,0,2,892,8,0,6,0,1,2,1,...,0,2,43,0,1,3,0,0,24,994
de,0,0,10,951,0,7,0,2,1,0,...,3,1,15,0,0,0,1,0,1,993
el,0,0,0,0,988,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,988
en,0,14,2,22,0,890,3,6,3,4,...,2,1,27,5,8,2,0,0,1,998
es,0,4,2,15,0,3,716,1,1,35,...,4,0,18,0,170,15,0,1,0,996
et,0,4,3,8,0,0,2,823,93,0,...,30,0,14,0,0,1,0,10,2,993
fi,0,1,0,2,0,0,0,4,983,0,...,1,0,2,0,1,0,0,0,1,995
fr,0,2,4,10,0,3,6,3,1,929,...,4,0,17,0,1,13,1,1,0,999


In [184]:
CT['hu']

A
bg        0
cs        6
da        9
de        1
el        0
en        4
es        1
et        2
fi        0
fr        2
hu      988
it        0
lt        1
lv        0
nl        3
pl        0
pt        0
ro        0
sk        3
sl        2
sv       22
All    1044
Name: hu, dtype: int64

## Analysis of Model

In [160]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

# Per-language precision/recall/F1, followed by the overall accuracy.
actual = testData[0]
print(classification_report(actual, prediction))
print(accuracy_score(actual, prediction))

             precision    recall  f1-score   support

         bg       1.00      1.00      1.00       997
         cs       0.62      0.98      0.76       993
         da       0.90      0.90      0.90       994
         de       0.89      0.96      0.92       993
         el       1.00      1.00      1.00       988
         en       0.96      0.89      0.92       998
         es       0.93      0.72      0.81       996
         et       0.95      0.83      0.88       993
         fi       0.88      0.99      0.93       995
         fr       0.94      0.93      0.93       999
         hu       0.95      0.99      0.97       998
         it       0.96      0.79      0.87       996
         lt       0.91      0.96      0.94       995
         lv       0.98      0.98      0.98       978
         nl       0.83      0.94      0.88       999
         pl       0.98      0.99      0.98       997
         pt       0.79      0.94      0.86       996
         ro       0.88      0.96      0.92   

### Prediction accuracy of 90.344% was achieved on test data using a model trained with logistic regression 

# Inference

The European Parliament corpus is sizable, at about 5 GB. This would seem to be a data-at-scale problem requiring Big Data analysis. However, by means of sampling we can run the language detection classifier on a regular PC using the SciPy stack. Around 26.5 MB of text files were randomly selected for each language, and the analysis was carried out on this subsample (~554 MB) of the 5 GB corpus. We are still able to achieve a model accuracy of 90.344%, with the model generalizing to new, unseen data. 