In [120]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import tensorflow as tf
import transformers as trans
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('../data/Language_Detection.csv')

In [3]:
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [4]:
data.shape

(10337, 2)

In [5]:
data.Language.unique()

array(['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada'], dtype=object)

In [6]:
for ln in data['Language'].unique():
    print(ln,data[data['Language'] == ln].shape[0])

English 1385
Malayalam 594
Hindi 63
Tamil 469
Portugeese 739
French 1014
Dutch 546
Spanish 819
Greek 365
Russian 692
Danish 428
Italian 698
Turkish 474
Sweedish 676
Arabic 536
German 470
Kannada 369


In [7]:
le = LabelEncoder()

In [8]:
data['target'] = le.fit_transform(data['Language'])

In [9]:
X = data['Text']
y = data['target']

In [10]:
tfidf = TfidfVectorizer()

In [11]:
X_new = tfidf.fit_transform(X)

In [176]:
X_train,X_test,y_train,y_test = train_test_split(X_new,y,test_size=0.2,random_state=42)

In [188]:
nb = MultinomialNB()

In [189]:
nb.fit(X_train,y_train)

In [190]:
pred = nb.predict(X_test)

In [191]:
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        97
           1       0.86      0.98      0.92        64
           2       0.94      1.00      0.97       104
           3       1.00      0.77      0.87       376
           4       0.99      0.94      0.97       230
           5       0.94      1.00      0.97        87
           6       0.88      1.00      0.94        60
           7       0.40      1.00      0.57         4
           8       0.97      1.00      0.98       140
           9       0.95      1.00      0.98        63
          10       0.98      1.00      0.99       118
          11       0.95      1.00      0.98       137
          12       0.92      1.00      0.96       125
          13       0.97      0.97      0.97       160
          14       0.98      0.96      0.97       136
          15       0.99      1.00      0.99        86
          16       0.77      1.00      0.87        81

    accuracy              

### Prediction

In [192]:
tamil_text = ["اكتب في العربية"]
try_text = tfidf.transform(tamil_text).toarray()

In [193]:
p = nb.predict(try_text)

In [194]:
le.inverse_transform(p)

array(['Arabic'], dtype=object)

In [195]:
with open("classes.txt",'w') as file:
    for ln in le.classes_:
        file.write(f"{ln}\n")

### RNN - LSTM and GRU

In [121]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [122]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000,oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [123]:
train_seq =  tokenizer.texts_to_sequences(X_train)
test_seq = tokenizer.texts_to_sequences(X_test)

In [124]:
train_pad = tf.keras.utils.pad_sequences(train_seq,padding="post",truncating="post",maxlen=40)
test_pad = tf.keras.utils.pad_sequences(test_seq,padding="post",truncating="post",maxlen=40)

In [208]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(5000,32))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(128,activation="relu"))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(len(le.classes_),activation="softmax"))

In [209]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, None, 32)          160000    
                                                                 
 bidirectional_4 (Bidirectio  (None, None, 128)        49664     
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense_10 (Dense)            (None, 128)               16512     
                                                                 
 dropout_8 (Dropout)         (None, 128)               0         
                                                                 
 dense_11 (Dense)            (None, 17)               

In [210]:
model.compile("adam","categorical_crossentropy",["acc"])

In [211]:
y_train_hot = tf.keras.utils.to_categorical(y_train)
y_test_hot = tf.keras.utils.to_categorical(y_test)

In [213]:
history = model.fit(train_pad,y_train_hot,epochs=10,validation_data=(test_pad,y_test_hot))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [214]:
pred = model.predict(test_pad)



In [215]:
y_pred = np.argmax(pred,axis=1)
y_true = np.argmax(y_test_hot,axis=1)

In [216]:
print(classification_report(y_pred,y_true))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95        96
           1       0.89      0.90      0.90        72
           2       0.91      0.89      0.90       114
           3       0.97      0.98      0.98       288
           4       0.91      0.99      0.95       202
           5       0.92      0.97      0.95        89
           6       0.93      0.98      0.95        64
           7       1.00      0.83      0.91        12
           8       0.88      0.94      0.91       135
           9       0.91      0.97      0.94        62
          10       0.98      0.64      0.77       184
          11       0.92      0.99      0.96       134
          12       0.93      0.95      0.94       133
          13       0.94      0.89      0.92       169
          14       0.94      0.98      0.96       128
          15       0.91      0.89      0.90        89
          16       0.80      0.87      0.83        97

    accuracy              

In [169]:
new_word = ["اكتب في العربي"]
new_seq = tokenizer.texts_to_sequences(new_word)
new_pad = tf.keras.utils.pad_sequences(new_seq,padding="post",truncating="post",maxlen=40)

In [170]:
p = model.predict(new_pad)



In [171]:
new_p = np.argmax(p,axis=1)

In [172]:
new_p

array([0])

In [173]:
for i in new_p:
    print(le.classes_[i])

Arabic


In [226]:
model.save("LangLSTM.h5")

In [175]:
import pickle

In [198]:
pickle.dump(nb, open("LangDetct.pkl", 'wb'))

In [204]:
check_pm = pickle.load(open("LangDetct.pkl",'rb'))
check_pm.predict(try_text)

array([0])