In [7]:
import pandas as pd
import numpy as np
import re
import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline


In [15]:
data = pd.read_csv('../data/language_detection.csv')

In [5]:
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


### Separating variable from target

In [16]:
X = data['Text']
y = data['Language']

### Encoding the target

In [17]:
le = LabelEncoder()

y = le.fit_transform(y)

In [19]:
y

array([3, 3, 3, ..., 9, 9, 9])

In [18]:
# These are the classes encoded from the target

le.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

### Cleaning the Data

In this step the text is submitted to a process of cleaning to remove some undesirable character and lowering the text

In [20]:
data_cleaned = []

for text in X:
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', ' ', text)
    text = re.sub(r'[[]]', ' ', text)
    text = text.lower()
    data_cleaned.append(text)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [23]:
# Creating a bag of words using countvectorizer

cv = CountVectorizer()
cv.fit(X_train)

x_train = cv.transform(X_train).toarray()
x_test = cv.transform(X_test).toarray()

In [24]:
model = MultinomialNB()
model.fit(x_train, y_train)

In [25]:
y_pred = model.predict(x_test)
y_pred

array([12,  8,  3, ...,  2,  6,  4])

In [26]:
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

In [27]:
print("Accuracy is :",ac)

Accuracy is : 0.9825918762088974


### Building a Pipeline for the model using sklearn pipeline

In [28]:
pipe = Pipeline(
    [
        ('vectorizer', cv),
        ('multinomialNB', model)
    ]
)

pipe.fit(X_train, y_train)

In [29]:
y_pred2 = pipe.predict(X_test)
ac2 = accuracy_score(y_test, y_pred2)

print("Accuracy is:", ac2)

Accuracy is: 0.9825918762088974


In [34]:
with open('../models/trained_pipeline-0.1.0.pkl','wb') as f:
    pickle.dump(pipe, f)

### Testing the model

In [32]:
list = ["Hello, how are you?", "Ciao, come stai?"]

for text in list:


    y = pipe.predict([text])
    print(le.classes_[y[0]], y)

English [3]
Italian [8]
