In [74]:
import pandas as pd

import langid
from langdetect import detect
import pycld2 as cld2
import fasttext
from googletrans import Translator

# 1. Dataset

In [78]:
# Toy corpus: each row pairs one customer comment with its expected language label.
_labelled_rows = [
    ('the service is very slow', 'English'),
    ('bahut bura lag raha hai', 'Hindi'),
    ('staff ne help nahi kiya', 'Hindi'),
    ('accha service mila', 'Hindi'),
    ('ghanta bhi help nahi kiya', 'Hindi'),
    ('all the time late rehte hain', 'Hindi'),
    ('disappointing experience', 'English'),
    ('hoping for improvement', 'English'),
]

comments = pd.DataFrame(_labelled_rows, columns=['comments', 'language'])
comments

Unnamed: 0,comments,language
0,the service is very slow,English
1,bahut bura lag raha hai,Hindi
2,staff ne help nahi kiya,Hindi
3,accha service mila,Hindi
4,ghanta bhi help nahi kiya,Hindi
5,all the time late rehte hain,Hindi
6,disappointing experience,English
7,hoping for improvement,English


# 2. Testing five different libraries for language detection

## 2.1 langid

In [80]:
def detect_langid(text):
    """Return the language code that langid predicts for ``text``.

    Only the first element of the result of ``langid.classify`` is returned;
    the remaining element is discarded.
    """
    lang_code, _ = langid.classify(text)
    return lang_code

# Run langid over every comment and keep its prediction in a new 'langid' column.
comments['langid'] = comments['comments'].map(detect_langid)
comments

Unnamed: 0,comments,language,langid
0,the service is very slow,English,en
1,bahut bura lag raha hai,Hindi,ms
2,staff ne help nahi kiya,Hindi,eu
3,accha service mila,Hindi,es
4,ghanta bhi help nahi kiya,Hindi,eu
5,all the time late rehte hain,Hindi,en
6,disappointing experience,English,en
7,hoping for improvement,English,en


## 2.2 langdetect

In [83]:
def detect_langdetect(text):
    """Return the language code that langdetect predicts for ``text``.

    Returns the literal string 'error' when detection raises, so that a
    single problematic comment does not abort a whole ``.apply`` run.
    """
    # NOTE(review): the langdetect README describes its results as
    # non-deterministic on short or ambiguous text unless
    # DetectorFactory.seed is fixed -- consider seeding for reproducible output.
    try:
        return detect(text)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate and the cell
        # can be interrupted.
        return 'error'

# Run langdetect over every comment and keep its prediction in a new 'langdetect' column.
comments['langdetect'] = comments['comments'].map(detect_langdetect)
comments

Unnamed: 0,comments,language,langid,langdetect
0,the service is very slow,English,en,en
1,bahut bura lag raha hai,Hindi,ms,id
2,staff ne help nahi kiya,Hindi,eu,tr
3,accha service mila,Hindi,es,it
4,ghanta bhi help nahi kiya,Hindi,eu,tl
5,all the time late rehte hain,Hindi,en,en
6,disappointing experience,English,en,fr
7,hoping for improvement,English,en,en


## 2.3 pycld2

In [86]:
def detect_pycld2(text):
    """Return the language code of pycld2's top-ranked guess for ``text``.

    ``cld2.detect`` returns a 3-tuple; only its third element (``details``)
    is used, and the value at index 1 of its first entry is returned.
    Returns the literal string 'error' when detection raises.
    """
    try:
        _, _, details = cld2.detect(text)
        return details[0][1]
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate and the cell
        # can be interrupted.
        return 'error'

# Run pycld2 over every comment and keep its prediction in a new 'pycld2' column.
comments['pycld2'] = comments['comments'].map(detect_pycld2)
comments

Unnamed: 0,comments,language,langid,langdetect,pycld2
0,the service is very slow,English,en,en,en
1,bahut bura lag raha hai,Hindi,ms,id,un
2,staff ne help nahi kiya,Hindi,eu,tr,un
3,accha service mila,Hindi,es,it,un
4,ghanta bhi help nahi kiya,Hindi,eu,tl,un
5,all the time late rehte hain,Hindi,en,en,en
6,disappointing experience,English,en,fr,en
7,hoping for improvement,English,en,en,en


## 2.4 fasttext

In [89]:
# pip install fasttext-wheel

# download the Pretrained Language Identification Model
# lid.176.bin model file (trained on 176 languages)
# link: https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

In [91]:
# Location of the pretrained lid.176.bin language-identification model.
# NOTE(review): this is an absolute, machine-specific path -- anyone else
# running the notebook has to point it at their own copy of the file.
model_path=r'C:\Users\Neha Rana\OneDrive\Documents\lid.176.bin'
fasttext_model=fasttext.load_model(model_path)

def detect_fasttext(text):
    """Return the top language code predicted by the fastText model for ``text``.

    The '__label__' prefix that fastText puts on its labels is stripped, so
    the result looks like the codes returned by the other detectors.
    """
    # fastText's predict() handles one line at a time and raises ValueError
    # when the input contains a newline character; flatten multi-line
    # comments into a single line so they do not abort the whole run.
    single_line = text.replace('\n', ' ')
    pred = fasttext_model.predict(single_line)
    return pred[0][0].replace('__label__', '')

# Run the fastText model over every comment and keep its prediction in a new 'fasttext' column.
comments['fasttext'] = comments['comments'].map(detect_fasttext)
comments

Unnamed: 0,comments,language,langid,langdetect,pycld2,fasttext
0,the service is very slow,English,en,en,en,en
1,bahut bura lag raha hai,Hindi,ms,id,un,tr
2,staff ne help nahi kiya,Hindi,eu,tr,un,en
3,accha service mila,Hindi,es,it,un,en
4,ghanta bhi help nahi kiya,Hindi,eu,tl,un,en
5,all the time late rehte hain,Hindi,en,en,en,en
6,disappointing experience,English,en,fr,en,en
7,hoping for improvement,English,en,en,en,en


## 2.5 googletrans

In [94]:
# Single googletrans client, shared by the detection and translation helpers.
translator=Translator()

def detect_googletrans(text):
    """Return the language code that googletrans detects for ``text``.

    Returns the literal string 'error' when the call raises, so that a
    single failing comment does not abort a whole ``.apply`` run.
    """
    try:
        return translator.detect(text).lang
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate and the cell
        # can be interrupted.
        return 'error'

# Run googletrans detection over every comment and keep its prediction in a new 'googletrans' column.
comments['googletrans'] = comments['comments'].map(detect_googletrans)
comments

Unnamed: 0,comments,language,langid,langdetect,pycld2,fasttext,googletrans
0,the service is very slow,English,en,en,en,en,en
1,bahut bura lag raha hai,Hindi,ms,id,un,tr,hi
2,staff ne help nahi kiya,Hindi,eu,tr,un,en,hi
3,accha service mila,Hindi,es,it,un,en,hi
4,ghanta bhi help nahi kiya,Hindi,eu,tl,un,en,hi
5,all the time late rehte hain,Hindi,en,en,en,en,hi
6,disappointing experience,English,en,fr,en,en,en
7,hoping for improvement,English,en,en,en,en,en


In [95]:
# 100% accuracy in language detection by googletrans!

# googletrans is essentially a tool that connects us to Google Translate’s web service,
# which is powered by a DL-based Neural Machine Translation (NMT) model

# we must ensure access to a stable network connection for it to work effectively

# 3. Testing googletrans for language translation

In [97]:
# Keep only the rows that googletrans did not label as English, and discard
# the prediction columns produced by the other four detectors.
comments_non_en = (
    comments
    .loc[comments['googletrans'] != 'en']
    .drop(columns=['langid', 'langdetect', 'pycld2', 'fasttext'])
)
comments_non_en

Unnamed: 0,comments,language,googletrans
1,bahut bura lag raha hai,Hindi,hi
2,staff ne help nahi kiya,Hindi,hi
3,accha service mila,Hindi,hi
4,ghanta bhi help nahi kiya,Hindi,hi
5,all the time late rehte hain,Hindi,hi


In [101]:
def translate_to_english(text):
    """Translate ``text`` into English with googletrans.

    The source language is left as 'auto'. If the call raises, the
    exception message is returned prefixed with 'Error: ' instead of
    propagating.
    """
    try:
        result = translator.translate(text, src='auto', dest='en')
        return result.text
    except Exception as err:
        return f"Error: {err!s}"

# Translate every non-English comment and keep the result in a new 'translated_comments' column.
comments_non_en['translated_comments'] = comments_non_en['comments'].map(translate_to_english)
comments_non_en

Unnamed: 0,comments,language,googletrans,translated_comments
1,bahut bura lag raha hai,Hindi,hi,feeling so bad
2,staff ne help nahi kiya,Hindi,hi,Staff did not help
3,accha service mila,Hindi,hi,Got good service
4,ghanta bhi help nahi kiya,Hindi,hi,Hour not help
5,all the time late rehte hain,Hindi,hi,Keep taking all the time


In [103]:
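# we observe only 60% accuracy in translation (3 of 5 sentences): besides the sentence discussed below,
# “all the time late rehte hain” (“they are always late”) was also mistranslated, as “Keep taking all the time”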

# the sentence “ghanta bhi help nahi kiya” was incorrectly translated to “hour not help”
# this occurred because “ghanta”, which in this context is a sarcastic expression commonly used in Hindi, was interpreted literally as “hour”

# sarcasm, idioms, and slang perhaps remain the weak spots even for DL-based models,
# especially when the expressions lack clear analogues in English or do not frequently appear in training corpora