<a href="https://colab.research.google.com/github/mido-Jr/Language-Detection-Using-NLP/blob/main/Language_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Download Dataset From Kaggle API

In [1]:
# %%script echo skipping
def kaggle_API():
  if getattr(kaggle_API, 'has_run', False):
        return
  kaggle_API.has_run = True
  from google.colab import files
  files.upload() # Browse for the kaggle.json file that you downloaded

  # Make directory named kaggle, copy kaggle.json file there, and change the permissions of the file.
  ! mkdir ~/.kaggle
  ! cp kaggle.json ~/.kaggle/
  ! chmod 600 ~/.kaggle/kaggle.json

  # You can check if everything's okay by running this command.
  #! kaggle datasets list

  # Download and unzip #Dataset  into '/usr/local' using API command usning --unzip
  ! kaggle datasets download -d basilb2s/language-detection --unzip


In [2]:
kaggle_API()

Saving kaggle.json to kaggle.json
Downloading language-detection.zip to /content
  0% 0.00/542k [00:00<?, ?B/s]
100% 542k/542k [00:00<00:00, 35.5MB/s]


Dataset `language-detection` contains text details for 17 different languages

## Importing libaries

In [3]:
import numpy as np
import pandas as pd
import re # From the standard library



In [4]:
# Preview the dataset
data = pd.read_csv('/content/Language Detection.csv')
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [5]:
# let`s count the value count for each language
data['Language'].unique()

array(['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada'], dtype=object)

In [6]:
data['Language'].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

## Separating features


In [7]:
# Separating the feature and target
X = data['Text']
y = data['Language']



## Label Encoding

In [8]:
# the target variable is categorical , we should have convert it
# to numerical form 

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)


## Text Preprocessing

In [9]:
# you should clean your dataset , and remove all unwanted symbols
# also this will affect the quality of your model
data_list = []
# loop over the Text
for text in X:

  # removing the symbols and numbers
  text = re.sub(r'[!@#$(),n"%^*?:;~`0-9]','',text)
  text = re.sub(r'[[]]', ' ', text)
  text = text.lower()
  data_list.append(text)

  if __name__ == '__main__':


## CountVectorizer words

In [10]:
# convert input feature to numerical form 
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(data_list).toarray()
print(X.shape)
len(data_list)

(10337, 38665)


10337

## train test split

In [11]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size = .2 , random_state = 41)

## Model training and Prediction

In [12]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB()

In [13]:
y_pred = model.predict(X_test)

In [14]:
# Model Evaluation
from sklearn.metrics import accuracy_score , confusion_matrix
ac = accuracy_score(y_test , y_pred)


In [15]:
print(f'Accuracy = {ac:.2f}')

Accuracy = 0.98


In [16]:
# Test model prediction using text in different languages
def prediction(text):
  x = cv.transform([text]).toarray()
  lang = model.predict(x)
  lang = le.inverse_transform(lang)
  print('The Language is in', lang[0])

In [19]:
# last test
prediction("توفر  معرفية قائمة على المجتمع لمحترفي التحليلات وعلوم البيانات")


The Language is in Arabic


In [20]:
prediction("это портал знаний на базе сообщества для профессионалов в области аналитики и данных.")

The Language is in Russian
