In [118]:
import string
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import feature_extraction,pipeline,linear_model,metrics
import pickle

In [79]:
# Load the language-detection dataset from a CSV file in the working directory.
data = pd.read_csv("Language_Detection.csv")

In [10]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [80]:
# Normalise the column labels to lower case so later cells can refer to
# `text` and `language`.
data.columns = data.columns.str.lower()

In [27]:
# Number of samples per language (shows how imbalanced the classes are).
data["language"].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: language, dtype: int64

## Data Cleaning

In [30]:
# Summarise column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10337 entries, 0 to 10336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      10337 non-null  object
 1   language  10337 non-null  object
dtypes: object(2)
memory usage: 161.6+ KB


Text data usually does not need much cleaning; mostly we need to do data pre-processing:
1. Tokenization
2. Stop words Removal
3. Lower case conversion
4. Removing numeric/digits
5. Removing Punctuations/Special Characters
6. Removing characters (for foreign languages)
7. Normalization
8. Stemming & Lemmatization

## Pre Processing

In [34]:
# Show every ASCII punctuation character, each followed by a single space.
print(" ".join(string.punctuation), end=" ")

# Translation table for str.translate: maps each punctuation code point to
# None, which deletes that character.
translate_table = str.maketrans("", "", string.punctuation)

! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ 

In [77]:
def text_preprocess(data):
    """Add a cleaned ``text_processed`` column to ``data``.

    Each value of ``data['text']`` is lower-cased, has every run of digits
    removed, and is stripped of ASCII punctuation (``string.punctuation``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, with ``text_processed``
        added (or overwritten). The input is modified in place.

    Notes
    -----
    Entries that are not strings (for example missing values) are mapped to
    an empty string instead of raising.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    digits = re.compile(r"\d+")
    punctuation_table = str.maketrans("", "", string.punctuation)

    def _clean(text):
        if not isinstance(text, str):
            return ""
        return digits.sub("", text.lower()).translate(punctuation_table)

    # Assign the whole column at once. The previous element-wise chained
    # assignment (data[col][i] = ...) writes to a temporary object under
    # pandas copy-on-write and leaves the frame unchanged.
    data["text_processed"] = data["text"].map(_clean)
    return data


In [81]:
# Add the cleaned `text_processed` column. Note that text_preprocess modifies
# `data` in place and returns that same frame.
processed_data = text_preprocess(data)

In [82]:
# Display the frame to compare the raw `text` with `text_processed`.
processed_data

Unnamed: 0,text,language,text_processed
0,"Nature, in the broadest sense, is the natural...",English,nature in the broadest sense is the natural p...
1,"""Nature"" can refer to the phenomena of the phy...",English,nature can refer to the phenomena of the physi...
2,"The study of nature is a large, if not the onl...",English,the study of nature is a large if not the only...
3,"Although humans are part of nature, human acti...",English,although humans are part of nature human activ...
4,[1] The word nature is borrowed from the Old F...,English,the word nature is borrowed from the old fren...
...,...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada,ಹೇಗೆ ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎಲ...
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...


In [102]:
# Select features and labels by column name rather than by position, so the
# split still picks the right columns if the frame's column order changes.
X = processed_data['text_processed']
y = processed_data['language']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(8269,)
(8269,)
(2068,)
(2068,)


## Vectorization and Model Pipeline

In [99]:
# Character-level TF-IDF features built from 1- to 3-character n-grams.
vectorization = feature_extraction.text.TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
)

In [107]:
# Chain the TF-IDF vectorizer and a logistic-regression classifier so that
# raw strings can be passed straight to fit/predict.
pipe_lr = pipeline.Pipeline(
    steps=[
        ('vectorization', vectorization),
        ('clf', linear_model.LogisticRegression()),
    ]
)

## Model Fitting

In [108]:
# Fit the vectorizer and the classifier on the training split.
pipe_lr.fit(X_train,y_train)

Pipeline(steps=[('vectorization',
                 TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
                ('clf', LogisticRegression())])

## Model Prediction

In [113]:
# Predict a language label for every held-out test sentence.
y_pred= pipe_lr.predict(X_test)

## Model Evaluation

In [115]:
# Fraction of test sentences whose predicted language equals the true label.
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc)

0.9893617021276596


## Model Saving

In [119]:
# Serialise the fitted pipeline (vectorizer + classifier) to disk.
with open('LRModel.pckl', mode='wb') as file:
    pickle.dump(pipe_lr, file)


## Model Loading 

In [120]:
# Restore the pipeline from disk. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from an untrusted file.
with open('LRModel.pckl', mode='rb') as model:
    lr_model = pickle.load(model)

## Test Model

In [134]:
input_text_french = ['Ne me quitte pas by Jacques Brel']
# Apply the same cleaning that was used on the training data, then predict
# with the model reloaded from disk so the saved artefact is what gets tested.
input_frame_french = text_preprocess(pd.DataFrame({'text': input_text_french}))
prediction_language = lr_model.predict(input_frame_french['text_processed'])
print(prediction_language)

['French']


In [136]:
input_text_arabic = ['سأقدم لك أفضل  عبارة إنجليزية يمكنك استخدامها في حياتك اليومية']
# Apply the same cleaning that was used on the training data, then predict
# with the model reloaded from disk so the saved artefact is what gets tested.
input_frame_arabic = text_preprocess(pd.DataFrame({'text': input_text_arabic}))
prediction_language = lr_model.predict(input_frame_arabic['text_processed'])
print(prediction_language)

['Arabic']
