# 1 Loading the Data

Language Detection using the European Parliament Proceedings Parallel Corpus. European Parliament Proceedings Parallel Corpus is a text dataset used for evaluating language detection engines. The 1.5GB corpus includes 21 languages spoken in EU. This project aims to build a machine learning model trained on this dataset to predict new unseen data.

The Training data can be downloaded [here](https://www.statmt.org/europarl/#:~:text=starting%20with%20%22%3C%22)

In [143]:
# Import pandas packages
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics

In [106]:
# Import pandas packages
import pandas as pd
def readData(language):
    data_language = pd.read_csv("data\\"+language+".txt", sep="\r")
    data_language['language']=language
    #pick 20 000 rows rendom for every language
    return data_language.sample(n=2000)


In [107]:
data_english=readData("english")
data_spanish=readData("spanish")
data_french=readData("french")
data_italian=readData("italian")
data_german=readData("german")

In [108]:
#concat all the languages into one dataframe
data = pd.concat([data_english,data_spanish,data_french,data_italian,data_german],ignore_index=True)
#shuffle the data and reset the idex
data = data.sample(frac=1,random_state=5).reset_index(drop=True)
data.head(10)

Unnamed: 0,sentence,language
0,"Dietro a questo ""nuovo governo"" , mi sembra si...",italian
1,We would also draw attention to the human side...,english
2,"Señor Presidente, ayer oímos mucho a los orado...",spanish
3,Me estoy refiriendo a la continua tragedia de ...,spanish
4,"Este Comité Militar, como pueden ustedes imagi...",spanish
5,"Fra l'altro, non ci lasciano indifferenti talu...",italian
6,"Darum unterstütze ich, dass es jetzt endlich e...",german
7,Nous espérons que l' on attribuera aux points ...,french
8,"Dado que su autor no está presente, la pregunt...",spanish
9,The EU organs and institutions can exercise fa...,english


# 2 Data pre-processing
Removing noise from data : 
- remove punctuations : [!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]
- remove digits 
- remove uppercase

In [109]:
def removeNoise(data):
    data["sentence"]=data["sentence"].str.replace('[0-9]', ' ',regex=True)
    data["sentence"]=data["sentence"].str.replace('['+string.punctuation+']', ' ',regex=True)
    data["sentence"]=data["sentence"].str.lower()
    return data
removeNoise(data.head(10).copy())
    

Unnamed: 0,sentence,language
0,dietro a questo nuovo governo mi sembra si...,italian
1,we would also draw attention to the human side...,english
2,señor presidente ayer oímos mucho a los orado...,spanish
3,me estoy refiriendo a la continua tragedia de ...,spanish
4,este comité militar como pueden ustedes imagi...,spanish
5,fra l altro non ci lasciano indifferenti talu...,italian
6,darum unterstütze ich dass es jetzt endlich e...,german
7,nous espérons que l on attribuera aux points ...,french
8,dado que su autor no está presente la pregunt...,spanish
9,the eu organs and institutions can exercise fa...,english


# 3 splitting Data into Train and Test sets 

In [110]:
X,y =data["sentence"],data["language"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)


# 4 Vetorizer (ID TF) 

In [111]:
vectorizer = TfidfVectorizer(ngram_range=(1,3),analyzer='word')
model= Pipeline([('vectorizer',vectorizer),('clf',LogisticRegression())])

# 4 Model fitting

In [112]:
model.fit(X_train,y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer(ngram_range=(1, 3))),
                ('clf', LogisticRegression())])

# 5 Result

In [147]:
y_predicted= model.predict(X_test)
accuracyScore = metrics.accuracy_score(y_test,y_predicted)
accurac

0.996

In [148]:
matrix = metrics.confusion_matrix(y_test,y_predicted)
matrix

array([[389,   0,   1,   0,   0],
       [  0, 408,   0,   2,   0],
       [  0,   0, 385,   0,   0],
       [  0,   0,   0, 394,   0],
       [  0,   0,   0,   5, 416]], dtype=int64)

# 6 SAVE and LOAD the model

In [149]:
import pickle
saveModel = open('myModel.pckl','wb')
pickle.dump(model,saveModel)
saveModel.close()


In [4]:
    import string
    import pickle
    import pandas as pd
    #load saved model
    saveModel =open('myModel.pckl','rb')
    model = pickle.load(saveModel)
    saveModel.close()

# 7 Graphical interface 

In [None]:
import tkinter as tk



def languageDetector(inputText):
    import string
    import pickle
    global model
    global resultatFrame
    #load saved model
    saveModel =open('myModel.pckl','rb')
    model = pickle.load(saveModel)
    saveModel.close()
    
    #remove noise
    inputText =inputText.replace('[0-9]', ' ')
    inputText=inputText.replace('['+string.punctuation+']', ' ')
    inputText=inputText.lower()
    
    #predection
    predectedLanguage = model.predict([inputText])
    probability = model.predict_proba([inputText])
    allProb= pd.DataFrame(probability, columns=model.classes_)
    
    #dispaly resultat
    
    for child in resultatFrame.winfo_children():
        child.destroy()

    
    predResult_label = tk.Label(resultatFrame,fg="#12183d",bg='#f5f5f5',font=("Arial", 15),
                           padx=50,pady=50,
                           text="this text is written in "+predectedLanguage)
    predResult_label["text"] = "Total tries: 0"
    predResult_label.pack()

    resultatFrame.pack()
    
    
    

#windows
app = tk.Tk()
app.title("Language Detector")
app.minsize(1020, 700)
width_value = app.winfo_screenwidth()
height_value = app.winfo_screenheight()
app.geometry(str(width_value) + "x" + str(height_value))
app.configure(bg='#f5f5f5')


#frames
middleFrame = tk.Frame(app, background='#f5f5f5', width=700, height=height_value)

title_label = tk.Label(middleFrame,fg="#12183d",bg='#f5f5f5',font=("Arial", 15),padx=50,pady=50, text="Welcome to our Language Detector \n Our model support 5 languages (en,es,fr,it,de)")
title_label.pack()

# Create text widget and specify size.
textArea = tk.Text(middleFrame, height = 10, width = 70)
textArea.pack()

# Create button for detect text.
btnDetect = tk.Button(middleFrame, text = "Detect",bg="#6378ff",fg="white",font=("Arial", 15),pady=5,width = 50 ,command= lambda: languageDetector(textArea.get("1.0",'end-1c')))
btnDetect.pack(pady=10)
#resultat frame
resultatFrame =tk.Frame(middleFrame)








middleFrame.pack()


app.mainloop()















In [9]:
proba=model.predict_proba(["il va dormir"])
pd.DataFrame(proba, columns=model.classes_)

Unnamed: 0,english,french,german,italian,spanish
0,0.06714,0.187097,0.08356,0.590676,0.071527
