In [2]:
import pandas as pd

sentences_df = pd.read_csv('./dataset/sentences.csv', sep='\t', header=None)

sentences_df.drop(columns=[0], inplace=True)
sentences_df.dropna(inplace=True)
sentences_df.columns = ['Language', 'Text']
sentences_df.drop(sentences_df[sentences_df['Language'] == '\\N'].index, inplace=True)

In [3]:
sentences_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12084317 entries, 0 to 12084783
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   Language  object
 1   Text      object
dtypes: object(2)
memory usage: 276.6+ MB


In [4]:
sentences_df.head()

Unnamed: 0,Language,Text
0,cmn,我們試試看！
1,cmn,我该去睡觉了。
2,cmn,你在干什麼啊？
3,cmn,這是什麼啊？
4,cmn,今天是６月１８号，也是Muiriel的生日！


In [5]:
sentences_df.shape

(12084317, 2)

In [6]:
classes = sentences_df['Language'].unique()
classes.shape

(417,)

In [7]:
X = sentences_df['Text']
y = sentences_df['Language']

In [8]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

In [9]:
from keras.preprocessing.sequence import pad_sequences

padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, random_state=0, test_size=0.2, shuffle=True)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9667453, 30), (2416864, 30), (9667453,), (2416864,))

In [11]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

early_stopping = EarlyStopping(monitor='val_accuracy', mode='auto', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', patience=3, verbose=1, factor=0.5, min_lr=0.00001)

In [12]:
epochs = 20
batch_size = 128

In [13]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Embedding, LSTM

model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=X_train.shape[1]))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(Dense(classes.shape[0], activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

2024-07-11 23:17:50.814455: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 batch_normalization (Batch  (None, 128)               512       
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 417)               53793     
                                                                 
Total params: 1465889 (5.59 MB)
Trainable params: 1465633 (5.59 MB)
Non-trainable params: 256 (1.00 KB)
________________

In [14]:
output = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                   epochs=epochs, batch_size=batch_size,
                   callbacks=[early_stopping, reduce_lr])

# Save the model
path = "./model.h5"
model.save(path)

# Evaluate the model
eval = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)

Epoch 1/20


2024-07-11 23:17:53.035242: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
2024-07-11 23:17:53.690916: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f46c34b7420 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-07-11 23:17:53.690945: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Ti, Compute Capability 8.9
2024-07-11 23:17:53.699462: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1720703873.767384  278240 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 12: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 13/20
 6410/75527 [=>............................] - ETA: 7:49 - loss: 0.3423 - accuracy: 0.9089

KeyboardInterrupt: 