<a href="https://colab.research.google.com/github/neonithinar/Language_detection/blob/main/Language_detection_version_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language Detection using Tatoeba dataset.
This time I will try a GRU-based model to train on and classify the languages. This notebook uses the same preprocessing steps as the preliminary attempt, but relies more on the Keras framework and its tokenizer functionality to address the RAM overloading issues.

In [None]:
# Common Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import pickle
import tensorflow as tf
from tensorflow import keras
# from sklearn.preprocessing import MinMaxScaler

In [None]:
# Download the dataset
! wget https://downloads.tatoeba.org/exports/sentences.csv

--2021-03-01 04:58:00--  https://downloads.tatoeba.org/exports/sentences.csv
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 512772532 (489M) [application/octet-stream]
Saving to: ‘sentences.csv’


2021-03-01 04:58:16 (30.0 MB/s) - ‘sentences.csv’ saved [512772532/512772532]



In [None]:
# Load the tab-separated export. names= supplies the two data column names
# ('lang', 'text') and index_col=0 uses the first column of the file as the index.
df = pd.read_csv("sentences.csv", sep= '\t', encoding= 'utf8', index_col= 0, names=['lang', 'text'])

# Keep sentences whose length is between 20 and 200 characters (inclusive).
# The vectorized .str.len() gives NaN for a missing sentence, which between()
# evaluates to False; calling len() on such a value would raise TypeError.
length_condition = df['text'].str.len().between(20, 200)

df = df[length_condition]

In [None]:
# Preview the first rows that remain after the length filter.
df.head()

Unnamed: 0,lang,text
5,cmn,今天是６月１８号，也是Muiriel的生日！
21,cmn,选择什么是“对”或“错”是一项艰难的任务，我们却必须要完成它。
67,cmn,我们看东西不是看其实质，而是以我们的主观意识看它们的。
71,cmn,生活就是當你忙著進行你的計劃時總有其他的事情發生。
77,deu,Lass uns etwas versuchen!


In [None]:
# List the distinct values of the 'lang' column.
df['lang'].unique()

array(['cmn', 'deu', 'rus', 'fra', 'eng', 'spa', 'ita', 'jpn', 'kor',
       'vie', 'nld', 'epo', 'por', 'tur', 'heb', 'ell', 'ind', 'ara',
       'arz', 'fin', 'bul', 'swe', 'ukr', 'bel', 'que', 'ces', 'nno',
       'nob', 'zsm', 'est', 'kat', 'pol', 'lat', 'yue', 'swh', 'sqi',
       'hun', 'isl', 'wuu', 'fry', 'afr', 'ron', 'fao', 'san', 'bre',
       'tat', 'yid', 'uig', 'uzb', 'srp', 'qya', 'pes', nan, 'slk', 'eus',
       'cycl', 'dan', 'acm', 'lvs', 'kaz', 'hye', 'hin', 'ben', 'cat',
       'bos', 'hrv', 'lit', 'tha', 'urd', 'orv', 'cha', 'tgl', 'mon',
       'scn', 'gle', 'slv', 'frm', 'glg', 'vol', 'jbo', 'toki', 'ina',
       'nds', 'mal', 'tlh', 'roh', 'lzh', 'ltz', 'oss', 'ido', 'gla',
       'mlt', 'ast', 'oci', 'ile', 'ota', 'xal', 'tel', 'sjn', 'nov',
       'khm', 'tpi', 'ang', 'aze', 'tgk', 'tuk', 'chv', 'mkd', 'hsb',
       'dsb', 'sme', 'cym', 'mri', 'ksh', 'kmr', 'ewe', 'ber', 'udm',
       'lld', 'pms', 'lad', 'grn', 'mlg', 'xho', 'pnb', 'grc', 'npi',
       'tpw',

### This time we will be attempting 20 languages with a naive word tokenizer approach

In [None]:
# Language codes (as they appear in the 'lang' column) that the classifier will distinguish.
languages = ['deu', 'eng', 'fra', 'ita', 'por', 'spa', 'rus','hun', 'tur', 'ukr',
             'nld', 'fin', 'pol', 'lit', 'ces', 'swe', 'lvs', 'ara', 'dan','srp']# 20 languages
# Earlier candidate groupings, kept for reference only (not used by the code):
# lang1 = ['deu', 'eng', 'fra', 'ita', 'por', 'spa', 'rus', 'hun', 'jpn', 'ukr']  # 10 languages
# lang2 = ['nld', 'fin', 'pol', 'lit', 'ces', 'swe', 'lvs', 'hin', 'dan', 'srp']  # 10 languages
# lang3 = ['tur', 'epo', 'cmn', 'pes', 'kab', 'mar', 'heb', 'bul', 'ron', 'ell']  # 10 languages
# Other codes that were considered: 'ara', 'mkd'
# Languages mentioned in the original note: Dutch, Finnish, Polish, Lithuanian,
# Czech, Swedish, Arabic, Macedonian, Danish and Serbian.
# NOTE(review): the original note said Latvian, Slovak and Slovene from Europarl
# would be missing, yet 'lvs' is in the list above — TODO confirm what was meant.

In [None]:
# Sanity check: print, for every selected language code, whether it occurs
# among the distinct values of the 'lang' column.
total_lang = df['lang'].unique()
for code in languages:
  found = code in total_lang
  print(found, code)

True deu
True eng
True fra
True ita
True por
True spa
True rus
True hun
True tur
True ukr
True nld
True fin
True pol
True lit
True ces
True swe
True lvs
True ara
True dan
True srp


In [None]:
# Keep only the rows whose language is one of the selected languages.
df = df[df['lang'].isin(languages)]

# Trim the dataset to 50,000 sampled sentences per language.
# replace=True is kept from the original: sampling is done with replacement,
# so a language with fewer than 50,000 sentences can still be drawn from, and
# df_trim may therefore contain repeated rows.
# The samples are collected and concatenated once; the previous per-iteration
# DataFrame.append copied the growing frame on every pass and no longer exists
# in pandas 2.x.
df_trim = pd.concat(
    [df[df['lang'] == l].sample(50000, random_state = 42, replace = True) for l in languages]
)

# Shuffle all rows so that the positional train / validation / test slices
# taken later are random. The fixed random_state makes the shuffle, and with
# it the splits, reproducible across runs.
df_shuffle = df_trim.sample(frac = 1, random_state = 42)
df_shuffle.shape


(1000000, 2)

In [None]:
# Preview the first rows of the shuffled sample.
df_shuffle.head()

Unnamed: 0,lang,text
8671686,spa,Deberías ir allí ahora.
695251,fra,F est égal à 15 en hexadécimal.
6810967,ara,قامت ليلى بحلق رأسها.
2034106,deu,"Mein Herr, erkennen Sie mich denn tatsächlich ..."
2147247,pol,Obydwie siostry są blondynkami.


### Cleaning the dataset

In [None]:
# Convert every sentence to lower case.
df_shuffle["text"] = df_shuffle["text"].str.lower()
# Remove special characters: every run of characters that is neither a word
# character nor whitespace. The previous pattern r'[^\w\s]+ \n' also required
# a literal space and a newline right after the run, so ordinary punctuation
# (such as the trailing '.') was left in place. regex=True is passed
# explicitly because the default of Series.str.replace differs between pandas
# versions.
df_shuffle["text"] = df_shuffle["text"].str.replace(r'[^\w\s]+', '', regex=True)

In [None]:
# Preview the first rows again after the cleaning step.
df_shuffle.head()

Unnamed: 0,lang,text
8671686,spa,deberías ir allí ahora.
695251,fra,f est égal à 15 en hexadécimal.
6810967,ara,قامت ليلى بحلق رأسها.
2034106,deu,"mein herr, erkennen sie mich denn tatsächlich ..."
2147247,pol,obydwie siostry są blondynkami.


### Split the dataset

In [None]:
# Cut three consecutive, non-overlapping positional row ranges out of the
# shuffled frame. Rows from position 300000 onward are not assigned to any split.
train_set = df_shuffle.iloc[:210000]
validation_set = df_shuffle.iloc[210000:270000]
test_set = df_shuffle.iloc[270000:300000]


In [None]:
# Shape of the training split as (rows, columns).
train_set.shape

(210000, 2)

In [None]:
# Row counts of the train, validation and test splits, in that order.
print(len(train_set), len(validation_set), len(test_set))

210000 60000 30000


In [None]:
# Column names of the training split.
train_set.columns

Index(['lang', 'text'], dtype='object')

# Trigram extraction and tokenizing

Now that the train, validation and test splits are done, we can move on to the hard part: extracting the trigrams for each language and somehow creating a tokenized vocabulary from them to feed into our neural network. This time, however, we will use the sentence sequences as they are, and the layers will (preferably) be GRUs.

Now let's move on to creating a vocabulary for each language from its 200 most common trigrams.

In [None]:
def get_trigrams(corpus, num_features = 200):
  """ Returns the most common character trigrams from a collection of
      sentences.

      corpus: iterable of strings (e.g. a list or a pandas Series)
      num_features: (int) maximum number of trigrams to keep

      Returns a list of trigram strings in the order the vectorizer reports
      its feature names (NOTE(review): that order is not a frequency
      ranking - confirm if a ranking is ever needed).
      """
  vectorizer = CountVectorizer(analyzer= "char", ngram_range= (3, 3), max_features= num_features)
  # Only the fitted vocabulary is needed here; the count matrix returned by
  # fit_transform() was never used, so fit() is sufficient.
  vectorizer.fit(corpus)

  # get_feature_names() was removed in scikit-learn 1.2 in favour of
  # get_feature_names_out(). Prefer the new method and fall back to the old
  # one on releases that predate it. tolist() keeps the return type a plain
  # list, as before.
  if hasattr(vectorizer, "get_feature_names_out"):
    return vectorizer.get_feature_names_out().tolist()
  return vectorizer.get_feature_names()

In [None]:

# features: language code -> list of its most frequent trigrams (up to 200),
# computed from the training sentences of that language only.
# The cap can be changed through num_features.
features = {
    code: get_trigrams(train_set.loc[train_set.lang == code, 'text'], num_features= 200)
    for code in languages
}

# feature_set: union of the trigrams of all languages (duplicates collapse).
feature_set = set().union(*features.values())



```feature_set``` is a set of all trigram features across the languages 
```features``` is a dictionary mapping each language to the list of its trigrams



In [None]:
# create a vocabulary using feature set
vocab = dict()
for i, f in enumerate(feature_set):
  vocab[f] = i
# vocab is a list of trigrams with indexed values. Dont know how that will help

In [None]:
# First five trigrams stored for the 'ara' language code.
features['ara'][:5]

[' أح', ' أس', ' أم', ' أن', ' أي']

Now that all the features are set up as a dict, and the set of all features (without the target language) is in feature_set, the next step is to transform the training set with a more efficient method than one-hot encoding. As of now, RAM usage is below 20%. The idea is to go through the training set language by language and tokenize the trigrams in each sentence along the way, with suitable padding, and then use some sort of embedding. Hopefully the embedding will cluster the same languages together.

In [None]:
# Number of distinct trigrams collected across all languages.
len(feature_set)

2055

# test section block

In [None]:
# Take the first five training sentences as a small sample for experiments.
trial_set = train_set['text'][:5]
trial_set

8671686                              deberías ir allí ahora.
695251                       f est égal à 15 en hexadécimal.
6810967                                قامت ليلى بحلق رأسها.
2034106    mein herr, erkennen sie mich denn tatsächlich ...
2147247                      obydwie siostry są blondynkami.
Name: text, dtype: object

In [None]:
# Try TensorFlow string ops on the trial sentences: replace any "<br>" /
# "<br />" tag with a space, split on whitespace, and print the token tensor.
for raw_text in trial_set:
  without_breaks = tf.strings.regex_replace(raw_text, rb"<br\s*/?>", b" ")
  print(tf.strings.split(without_breaks))

# Ideas not pursued here: padding the split result with
# .to_tensor(default_value= b"<pad>"), and converting the whole trial_set
# with tf.convert_to_tensor(trial_set).

tf.Tensor([b'deber\xc3\xadas' b'ir' b'all\xc3\xad' b'ahora.'], shape=(4,), dtype=string)
tf.Tensor([b'f' b'est' b'\xc3\xa9gal' b'\xc3\xa0' b'15' b'en' b'hexad\xc3\xa9cimal.'], shape=(7,), dtype=string)
tf.Tensor(
[b'\xd9\x82\xd8\xa7\xd9\x85\xd8\xaa' b'\xd9\x84\xd9\x8a\xd9\x84\xd9\x89'
 b'\xd8\xa8\xd8\xad\xd9\x84\xd9\x82'
 b'\xd8\xb1\xd8\xa3\xd8\xb3\xd9\x87\xd8\xa7.'], shape=(4,), dtype=string)
tf.Tensor(
[b'mein' b'herr,' b'erkennen' b'sie' b'mich' b'denn' b'tats\xc3\xa4chlich'
 b'nicht' b'wieder?'], shape=(9,), dtype=string)
tf.Tensor([b'obydwie' b'siostry' b's\xc4\x85' b'blondynkami.'], shape=(4,), dtype=string)


8671686                              deberías ir allí ahora.
695251                       f est égal à 15 en hexadécimal.
6810967                                قامت ليلى بحلق رأسها.
2034106    mein herr, erkennen sie mich denn tatsächlich ...
2147247                      obydwie siostry są blondynkami.
Name: text, dtype: object

In [None]:
# Word-level tokenizer fitted on the training sentences only.
# oov_token is the placeholder token that stands in for words not seen during
# fitting; Keras stores it as a key of word_index. It is not an index or a
# vocabulary size, so the integer 1000000 (which ended up as a non-string key
# in word_index) is replaced by a conventional string token.
tokenizer = keras.preprocessing.text.Tokenizer(oov_token= "<OOV>")
tokenizer.fit_on_texts(train_set['text'])

In [None]:
# Look up the token id sequence the fitted tokenizer produces for a one-word text.
tokenizer.texts_to_sequences(['فقط'])

[[1969]]

In [None]:
# Number of entries in the tokenizer's word index.
len(tokenizer.word_index)

201234

In [None]:
# Convert every training sentence into its list of integer token ids.
X_batch = tokenizer.texts_to_sequences(train_set['text'])

In [None]:
# Show only the first few token-id sequences; displaying the whole list would
# dump one sequence per training sentence into the notebook output.
X_batch[:5]

[[19, 103, 2817, 5645, 47, 1141, 2159, 5238, 59, 53160, 85091, 85092],
 [39269, 93, 2818, 7444, 7, 910, 5, 39270, 17082, 85093],
 [2, 724, 104, 3915, 796, 187, 308, 53161, 183, 4285, 51, 2943, 25],
 [16, 2, 17083, 3534, 9, 3, 21, 56, 15342, 1456],
 [432, 26233, 85094, 1969],
 [5883, 518, 7445, 170, 39271, 85095],
 [10, 3432, 1112, 4, 264, 53162, 178, 7844, 117, 85096, 4, 31383],
 [131, 1675, 688, 53163, 7845],
 [107, 27, 8320, 265, 110, 24, 19426],
 [15343, 311, 31384, 24, 85097, 311, 17084],
 [40, 7090, 11, 357, 3, 8321, 8, 31385],
 [2, 122, 5884, 2439, 3196, 85098, 4, 9410],
 [31, 7091, 53164, 7446, 934],
 [391, 2748, 195, 6447, 13883],
 [1385, 53165, 85099, 19427, 85100],
 [1153, 1970, 4740, 53166, 39272],
 [764, 53167, 5, 3535, 17, 53168, 187, 85101],
 [39273, 1500, 875, 350, 186, 527, 53169, 85102, 161, 85103, 85104, 6751],
 [696,
  3280,
  23,
  85105,
  12661,
  549,
  2394,
  22423,
  709,
  17,
  62,
  113,
  31386,
  108,
  13,
  567,
  113,
  1213],
 [390, 7092, 11695, 10, 5

In [None]:
# NOTE: this rebinds X_batch to the raw 'text' column of the training split;
# the name no longer refers to tokenizer output after this cell.
X_batch = train_set["text"]

In [None]:
# Truncate every sentence to its first 200 characters and split it on
# whitespace, in one vectorized call over the whole column. The earlier
# Python loop ran two eager TensorFlow ops per sentence and threw the result
# away on every iteration.
# unit="UTF8_CHAR" makes substr count characters; the default unit is bytes,
# which can cut a multi-byte UTF-8 character in half.
# X_tokens holds the per-sentence token lists returned by tf.strings.split.
X_tokens = tf.strings.split(
    tf.strings.substr(tf.constant(X_batch.tolist()), 0, 200, unit="UTF8_CHAR")
)


KeyboardInterrupt: ignored

In [None]:
# Display the training sentences (pandas abbreviates the Series in the output).
X_batch


8841968    том је већ знао да ће мери бити на његовој рођ...
6155925    szaleństwem jest robić wciąż to samo i oczekiw...
3568982    tom tog et æble ud af sin madkasse og begyndte...
6995032    o tom costumava pensar que a mary não gostava ...
6966551                                  هذه حفلة للشباب فقط
                                 ...                        
7258522                                 tom ha protetto mary
1879283                        jag älskar det norska språket
2714607                               jis mėgsta dirbti sode
9761419             jai acheté ce livre dans cette librairie
1359873                 estoy de acuerdo con él en ese punto
Name: text, Length: 210000, dtype: object