In [None]:
# Mount Google Drive into the Colab filesystem at /content/MyDrive.
from google.colab import drive
drive.mount("/content/MyDrive")


Drive already mounted at /content/MyDrive; to attempt to forcibly remount, call drive.mount("/content/MyDrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
from sklearn.preprocessing import normalize

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the dataset (text samples with their language label) from the mounted Drive.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a saved
# index — consider read_csv(..., index_col=0) after confirming against the CSV.
df=pd.read_csv("/content/MyDrive/MyDrive/dataset.csv")
df.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [None]:
# Distinct language labels, followed by a few rows whose text mixes two scripts.
langs=set(df['language'])
print(langs)
print("-------------------------------------------------------------")
for sample_label, sample_idx in (
    ('Swedish & English:', 1),
    ('Thai & English:', 2),
    ('Chinese & English:', 21998),
):
    print(sample_label, df.loc[sample_idx, 'Text'])

{'Indonesian', 'Russian', 'Romanian', 'Latin', 'Japanese', 'Spanish', 'Swedish', 'Turkish', 'Urdu', 'Thai', 'Pushto', 'Persian', 'Hindi', 'Tamil', 'Korean', 'Arabic', 'Dutch', 'Estonian', 'Portugese', 'Chinese', 'English', 'French'}
-------------------------------------------------------------
Swedish & English: sebes joseph pereira thomas  på eng the jesuits and the sino-russian treaty of nerchinsk  the diary of thomas pereira bibliotheca instituti historici s i --   rome libris 
Thai & English: ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เริ่มตั้งแต่ถนนสนามไชยถึงแม่น้ำเจ้าพระยาที่ถนนตก กรุงเทพมหานคร เป็นถนนรุ่นแรกที่ใช้เทคนิคการสร้างแบบตะวันตก ปัจจุบันผ่านพื้นที่เขตพระนคร เขตป้อมปราบศัตรูพ่าย เขตสัมพันธวงศ์ เขตบางรัก เขตสาทร และเขตบางคอแหลม
Chinese & English: 年月，當時還只有歲的她在美國出道，以mai-k名義推出首張英文《baby i like》，由美國的獨立廠牌bip·record發行，以外國輸入盤的形式在日本發售，旋即被抢购一空。其後於月日發行以倉木麻衣名義發行的首張日文單曲《love day after tomorrow》，正式於日本出道。這張單曲初動銷量只得約萬張，可是其後每週銷量一直上升，並於年月正式突破百萬銷量，合计万张。成為年最耀眼的新人歌手。


In [None]:
# Inputs are the raw texts, targets are the language labels.
X, y = df['Text'], df['language']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Size of each of the four pieces.
for split_part in (x_train, x_test, y_train, y_test):
    print(len(split_part))

17600
4400
17600
4400


In [None]:
# Count single characters (character 1-grams); the vocabulary is fitted on the training texts.
unigramVectorizer = CountVectorizer(analyzer='char', ngram_range=(1,1))
x_unigram_train_raw = unigramVectorizer.fit_transform(x_train)
# The test texts are encoded with the vocabulary fitted above (no re-fitting).
x_unigram_test_raw = unigramVectorizer.transform(x_test)


# Characters that make up the fitted vocabulary, in column order.
unigramFeatures = unigramVectorizer.get_feature_names_out()

print('Number of unigrams in training set:', len(unigramFeatures))

Number of unigrams in training set: 6816


In [None]:
# Peek at the fitted character vocabulary.
unigramFeatures

array([' ', '"', '$', ..., '𢍰', '𢙯', '𧣾'], dtype=object)

In [None]:
def train_lang_dict(X_raw_counts, y_train):
    """Build a normalised n-gram frequency profile for every language.

    Parameters
    ----------
    X_raw_counts : 2-D array-like
        Dense count matrix; row ``i`` holds the n-gram counts of sample ``i``.
    y_train : sequence
        Language label of each row. Labels are read by position, so a pandas
        Series with a non-default index (for example one that comes straight
        out of ``train_test_split``) works as well as an array or a list.

    Returns
    -------
    dict
        Maps each language to a 1-D float array: the summed counts of all its
        rows divided by their grand total. A language whose counts are all
        zero maps to an all-zero array instead of NaNs.
    """
    # Positional view of the labels; indexing a Series with ``[i]`` would be
    # label-based and break on a shuffled index.
    labels = np.asarray(y_train)

    lang_dict = {}
    for i, lang in enumerate(labels):
        # np.array copies the row, so the in-place += below never touches the input.
        v = np.array(X_raw_counts[i])
        if lang not in lang_dict:
            lang_dict[lang] = v
        else:
            lang_dict[lang] += v

    # Turn the summed counts into relative frequencies.
    for lang, totals in lang_dict.items():
        grand_total = np.sum(totals)
        if grand_total:
            lang_dict[lang] = totals / grand_total
        else:
            # Avoid 0/0; an empty profile stays all zeros.
            lang_dict[lang] = totals.astype(float)

    return lang_dict

# Per-language relative character frequencies on the training set
# (.toarray() densifies the count matrix before it is passed in).
language_dict_unigram = train_lang_dict(x_unigram_train_raw.toarray(), y_train.values)

def getRelevantCharsPerLanguage(features, language_dict, significance=1e-5):
    """List, for every language, the features whose relative frequency is significant.

    Parameters
    ----------
    features : sequence
        Feature names; ``features[i]`` names position ``i`` of every profile.
    language_dict : dict
        Maps a language to its frequency profile (one value per feature).
    significance : float, default 1e-5
        A feature is kept for a language only if its frequency is strictly
        greater than this threshold.

    Returns
    -------
    dict
        Maps each language found in ``language_dict`` to the list of its
        significant features, in feature order.
    """
    relevantCharsPerLanguage = {}
    # Iterate the profiles that were passed in rather than a notebook-level
    # global, so the function works for any dictionary it is given.
    for lang, v in language_dict.items():
        relevantCharsPerLanguage[lang] = [
            features[i] for i, share in enumerate(v) if share > significance
        ]
    return relevantCharsPerLanguage

relevantCharsPerLanguage = getRelevantCharsPerLanguage(unigramFeatures, language_dict_unigram)

# Number of characters above the significance threshold, per language.
for lang in langs:
    print(lang, len(relevantCharsPerLanguage[lang]))

Indonesian 75
Russian 85
Romanian 109
Latin 132
Japanese 2054
Spanish 66
Swedish 77
Turkish 121
Urdu 142
Thai 148
Pushto 188
Persian 98
Hindi 117
Tamil 113
Korean 1407
Arabic 97
Dutch 58
Estonian 96
Portugese 64
Chinese 3249
English 60
French 70


As the overview above shows, the following languages use a large number of distinct symbols:

Chinese: 3,249
Japanese: 2,054
Korean: 1,407
That is, these languages can be identified easily using uni-grams alone.

But:

All other languages use far fewer symbols.
Most of them also share many symbols with one another.

So here we will use bi-grams.

In [None]:
# Count character 2-grams on the training texts to see how large the bigram vocabulary is.
bigramVectorizer = CountVectorizer(analyzer='char', ngram_range=(2,2))
x_bigram_raw = bigramVectorizer.fit_transform(x_train)
bigramFeatures = bigramVectorizer.get_feature_names_out()
print('Number of bigrams', len(bigramFeatures))

Number of bigrams 147219


### Mixture of Uni-Gram & Bi-Grams
When we restrict ourselves to a limited number of features, it is important that we still capture details for each language. Since Chinese uses more than 3,000 different symbols, even its most frequently used uni-grams may be less frequent than the top 1,000 bi-grams of the other languages, and would therefore be dropped.

#### Mixture Uni- & Bi-Grams (using the top 1%)
So we keep the uni- and bi-grams that occur in at least 1% of the training texts.

In [None]:
# Character 1- and 2-grams, restricted to those present in at least 1% of the training texts.
top1PrecentMixtureVectorizer = CountVectorizer(analyzer='char', ngram_range=(1,2), min_df=1e-2)
x_top1Percent_train_raw = top1PrecentMixtureVectorizer.fit_transform(x_train)
x_top1Percent_test_raw = top1PrecentMixtureVectorizer.transform(x_test)

# Per-language relative n-gram frequencies on the (densified) training counts.
language_dict_top1Percent = train_lang_dict(x_top1Percent_train_raw.toarray(), y_train.values)

top1PercentFeatures = top1PrecentMixtureVectorizer.get_feature_names_out()
print('Length of features', len(top1PercentFeatures))
print('')

# How many of the retained n-grams are significant for each language
relevantChars_Top1Percent = getRelevantCharsPerLanguage(
    top1PercentFeatures,
    language_dict_top1Percent,
    significance=1e-5,
)
for lang, grams in relevantChars_Top1Percent.items():
    print("{}: {}".format(lang, len(grams)))

Length of features 3079

Indonesian: 496
Russian: 536
Romanian: 532
Latin: 557
Japanese: 897
Spanish: 528
Swedish: 546
Turkish: 597
Urdu: 838
Thai: 719
Pushto: 980
Persian: 664
Hindi: 572
Tamil: 517
Korean: 726
Arabic: 680
Dutch: 527
Estonian: 624
Portugese: 542
Chinese: 716
English: 488
French: 518


In [None]:
def normalizeData(train, test):
    """Scale every row of both matrices to unit L2 norm.

    Each matrix is normalised independently and row by row (``axis=1``);
    the inputs are left untouched because copies are made.

    Returns
    -------
    tuple
        ``(normalised_train, normalised_test)``.
    """
    row_l2 = dict(norm='l2', axis=1, copy=True, return_norm=False)
    return normalize(train, **row_l2), normalize(test, **row_l2)
# L2-normalise each sample's n-gram counts in both the training and the test matrix.
x_top1Percent_train, x_top1Percent_test = normalizeData(x_top1Percent_train_raw, x_top1Percent_test_raw)

In [None]:
# Sanity check: shape of the normalised training matrix.
x_top1Percent_train.shape

(17600, 3079)

In [None]:
# Sanity check: number of training texts.
x_train.shape

(17600,)

In [None]:
# Multinomial Naive Bayes on the normalised uni-/bi-gram features.
clf = MultinomialNB()
# The training labels come from the same split as the training features.
clf.fit(x_top1Percent_train, y_train)
y_predict = clf.predict(x_top1Percent_test)

In [None]:
# Accuracy, confusion matrix and per-class report for the n-gram classifier.
ac=accuracy_score(y_test,y_predict)
cm=confusion_matrix(y_test,y_predict)
cr=classification_report(y_test,y_predict)
for metric_output in (ac, cm, cr):
    print(metric_output)

0.9736363636363636
[[202   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0 195   0   5   0   0   0   1   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0 221   2   0   5   0   0   0   0   1   0   0   0   0   0   1   0
    0   0   0   0]
 [  0   0   0 193   0   0   0   0   0   0   1   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   0   6 189   1   0   0   0   0   2   0   0   0   0   2   0   0
    0   0   0   0]
 [  0   0   0   1   0 186   0   0   0   0   1   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   0   3   0   0 205   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   1   7   0   0   0 205   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   1   0   0   1   0   0 190   0   0   0   1   0   0   1   0   0
    0   0   0   0]
 [  0   1   0   0   0   0   0   0   0 189   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   2  13   1   2   0   0   0   0 191   0  

## Word-level TF-IDF approach (according to the research paper)

In [None]:
# Inputs and targets are defined here so the cell does not rely on leftover state.
X=df['Text']
y=df['language']
# NOTE(review): le_y and data_list are not used by the cells visible below —
# the string labels in `y` are what is passed to the split. Confirm before removing.
le=LabelEncoder()
le_y=le.fit_transform(y)
data_list=[]
# Fixed seed so the split, and therefore the reported scores, are reproducible.
train_x,test_x,train_y,test_y=train_test_split(X, y, test_size=0.2, random_state=42)
tfidf_vectorizer = TfidfVectorizer()

# Vocabulary and IDF weights are learned from the training texts only.
train_x_tfidf = tfidf_vectorizer.fit_transform(train_x)

# The test texts are encoded with the already-fitted vectorizer.
test_x_tfidf = tfidf_vectorizer.transform(test_x)


In [None]:
# Multinomial Naive Bayes trained on the TF-IDF features, then applied to the held-out texts.
model=MultinomialNB()
model.fit(train_x_tfidf,train_y)
y_pred=model.predict(test_x_tfidf)

In [None]:
# Accuracy, confusion matrix and per-class report for the TF-IDF classifier.
ac=accuracy_score(test_y,y_pred)
cm=confusion_matrix(test_y,y_pred)
cr=classification_report(test_y,y_pred)
for metric_output in (ac, cm, cr):
    print(metric_output)

0.9384090909090909
[[195   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0  94   1  27   0   1   0   1   1   0  69   0   0   0   1   1   2   0
    0   0   2   0]
 [  0   0 205   0   0   3   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   0 201   0   0   0   0   0   0   0   0   0   0   1   0   0   0
    0   0   0   0]
 [  0   0   1   6 197   2   0   0   0   0   0   0   0   0   0   2   0   0
    0   0   0   0]
 [  0   0   0   1   0 205   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   0   5   0   0 200   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   0   4   0   0   0 204   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   3   1   7   2   1   0   2 130   0  47   0   0   0   1   0   2   3
    0   0   0   0]
 [  0   0   0   4   0   0   0   0   0 214   2   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   1  10   0   3   0   0   0   0 165   0  