In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 10, random_state = 0, shuffle = True)

#from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())

In [20]:
def all_metrics_together(y, probs, preds):
    accuracy = metrics.accuracy_score(y, preds)
    recall = metrics.recall_score(y, preds)
    precision = metrics.precision_score(y, preds)
    f1 = metrics.f1_score(y, preds)
    auc = metrics.roc_auc_score(y, (probs))

    df = pd.DataFrame({'Accuracy': accuracy, 'Recall': recall, 'Precision': precision, 'F1': f1, 'AUC': auc}, index = ['Score'])
    return df

In [21]:
char_to_num = {}
letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
for idx, char in enumerate(letters):
    char_to_num[char] = idx
print(len(letters))

26


In [22]:
def validString(string):
    string = str(string[0])
    for char in string:
        if char not in letters:
            return False
    return True

In [23]:
df_eng = pd.read_csv('English_25k.txt', header = None)
mask = df_eng.apply(validString, axis = 1)
df_eng = df_eng[mask]
df_eng.columns = ['word']
df_eng.drop_duplicates(inplace = True)
df_eng['language'] = 1
max_length_of_eng_words = df_eng['word'].str.len().value_counts().index.values.max()
print('Max length of eng words:', max_length_of_eng_words)
print('Number of English words:',df_eng.shape[0])

Max length of eng words: 20.0
Number of English words: 25321


In [24]:
df_tr = pd.read_json('Turkish_92k.json')[['word']]
replacement_map_for_tr = {' ': '', 'ç': 'c', 'ğ': 'g', 'ı': 'i', 'ö': 'o', 'ü': 'u', 'ş': 's', 
                          'â': 'a', 'î': 'i', 'i̇': 'i', 'û': 'u', '(': '', ')': '', ',': '', '-': ''}
df_tr.loc[:, 'word'] = df_tr['word'].str.lower()
for old_char in replacement_map_for_tr:
    df_tr.loc[:, 'word'] = df_tr.loc[:, 'word'].str.replace(old_char, replacement_map_for_tr[old_char])
    
df_tr = df_tr[df_tr['word'].str.len() <= 20].reset_index(drop = True)
df_tr = df_tr[df_tr.apply(validString, axis = 1)]
df_tr.drop_duplicates(inplace = True)
max_length_of_tr_words = df_tr['word'].str.len().value_counts().index.values.max()
df_tr['language'] = 0
print('Max length of tr words:', max_length_of_tr_words)
print('Number of Turkish words:',df_tr.shape[0])

Max length of tr words: 20
Number of Turkish words: 86830


In [25]:
max_word_len = max(max_length_of_tr_words, max_length_of_eng_words)
def word_to_vec(word):
    zeros = np.zeros(26)
    word_vec = []
    # padding
    for i in range(max_word_len - len(str(word))):
        word_vec.append(zeros)
        
    for char in str(word):
        char_vec = np.zeros(26)
        char_vec[char_to_num[char]] = 1
        word_vec.append(char_vec)
    return np.array(word_vec)

In [84]:
df = pd.concat([df_tr, df_eng])
df = df.sample(frac=1).drop_duplicates(subset = 'word')
y = df['language'].values
df.sample(3)

Unnamed: 0,word,language
4958,cosmically,1
15246,darulfunun,0
86578,hangibir,0


In [27]:
%%time
X = []
for word in df['word']:
    word_vec = word_to_vec(word)
    X.append(word_vec)
X = np.array(X)

Wall time: 2.28 s


In [29]:
%%time
epochs = 250
batch_size = 256
earlystopping = EarlyStopping(monitor='val_loss', patience = 10, restore_best_weights = True)

lst_y = np.array([])
lst_yhat = np.array([])
lst_yhat_probs = np.array([])
lst_X_test = np.array([]).reshape((0, 20, 26))

fold = 1

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    lst_X_test = np.concatenate([lst_X_test, X_test])

    model = Sequential()
    model.add(LSTM(max_word_len, input_shape = (max_word_len, 26)))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(loss = 'mean_squared_error', optimizer = 'Adam')
    model.fit(X_train, y_train, epochs = epochs, batch_size = batch_size, verbose = 0, validation_data = (X_test, y_test),
             use_multiprocessing = True, callbacks=[earlystopping])
    
    
    y_hat_probs = model.predict(X_test).reshape(-1)
    y_fold_preds = (y_hat_probs >= 0.5) * 1
    
    lst_y = np.concatenate([lst_y, y_test])
    lst_yhat = np.concatenate([lst_yhat, y_fold_preds])
    lst_yhat_probs = np.concatenate([lst_yhat_probs, y_hat_probs])
    
    print('Fold', fold, 'completed')
    fold += 1
    
y_test = lst_y.astype(int)
y_hat = lst_yhat.astype(int)
y_probs = lst_yhat_probs

Fold 1 completed
Fold 2 completed
Fold 3 completed
Fold 4 completed
Fold 5 completed
Fold 6 completed
Fold 7 completed
Fold 8 completed
Fold 9 completed
Fold 10 completed
Wall time: 24min 22s


In [30]:
all_metrics_together(y_test, y_probs, y_hat)

Unnamed: 0,Accuracy,Recall,Precision,F1,AUC
Score,0.963799,0.914063,0.924727,0.919364,0.992553


In [31]:
mispred_mask = y_test != y_hat
mispreds = lst_X_test[mispred_mask]

### One Hot Vector to Words

In [32]:
mispred_words = []
for word_vec in mispreds:
    word = ''
    for char_vec in word_vec:
        if not any(char_vec):
            char = ''
        else:
            char = letters[np.argmax(char_vec)]
        word = word + char
        
    mispred_words.append(word)

In [33]:
mispred_df = pd.DataFrame({'word': mispred_words, 'y': y_test[mispred_mask], 'y_hat': y_hat[mispred_mask]})
mispred_df = mispred_df.astype(str)
mispred_df.loc[:, 'y'] = mispred_df.loc[:, 'y'].str.replace('0', 'tr').str.replace('1', 'eng')
mispred_df.loc[:, 'y_hat'] = mispred_df.loc[:, 'y_hat'].str.replace('0', 'tr').str.replace('1', 'eng')

In [34]:
mispred_df.sample(10)

Unnamed: 0,word,y,y_hat
228,bayou,eng,tr
1002,fenomenist,tr,eng
3227,disk,tr,eng
1775,thinktank,tr,eng
3981,optometrist,eng,tr
51,farad,tr,eng
2243,delta,eng,tr
1229,antre,tr,eng
800,testosterone,eng,tr
3510,kidder,eng,tr


### Train with Whole Data

In [88]:
epochs = 500
batch_size = 256
earlystopping = EarlyStopping(monitor='loss', patience = 10, restore_best_weights = True)

model = Sequential()
model.add(LSTM(max_word_len, input_shape = (max_word_len, 26), dropout = 0.2))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'mean_squared_error', optimizer = 'Adam')
model.fit(X, y, epochs = epochs, batch_size = batch_size, verbose = 0,
         use_multiprocessing = True, callbacks=[earlystopping])

<tensorflow.python.keras.callbacks.History at 0xaa1c798340>

### Predict Your Own Word

In [109]:
word = 'meliksah'

prob = model.predict(word_to_vec(word).reshape(1, 20, 26))
if word in df['word'].values:
    print("Word", word, 'was in training data')
else:
    print("Word", word, 'was NOT in training data')
    
    
if prob >= 0.5:
    print(word, 'is predicted to be an English word with probability %.2f' %prob[0][0])
else:
    print(word, 'is predicted to be a Turkish word with probability %.2f' %(1-prob[0][0]))


Word meliksah was NOT in training data
meliksah is predicted to be a Turkish word with probability 1.00
