In [None]:
import pandas as pd
import numpy as np



In [None]:
# Read the training and test splits (in that order) from the data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/torob_train.csv", "data/torob_test.csv")
)

## Preprocessing the data

In [None]:
from hazm import Normalizer

# One Normalizer instance is shared by both splits.
normalizer = Normalizer()

# Run hazm normalization over the `name1` column of the train and test frames.
for frame in (train, test):
    frame['name1'] = frame['name1'].apply(normalizer.normalize)


In [None]:
from hazm import stopwords_list

# Build the stop-word lookup once as a set for fast membership tests.
stopwords = set(stopwords_list())

# Strip stop words from the training names: split on whitespace, keep the
# tokens that are not in `stopwords`, and re-join them with single spaces.
train['name1'] = train['name1'].apply(
    lambda text: ' '.join([token for token in text.split() if token not in stopwords])
)


In [None]:
# Strip stop words from the test names: split on whitespace, discard tokens
# found in `stopwords`, and re-join the remainder with single spaces.
test['name1'] = test['name1'].apply(
    lambda text: ' '.join(filter(lambda token: token not in stopwords, text.split()))
)

In [None]:
from hazm import Lemmatizer

lemmatizer = Lemmatizer()

# Lemmatize every whitespace-separated word of `name1` in both splits and
# re-join the lemmas with single spaces.
for frame in (train, test):
    frame['name1'] = frame['name1'].apply(
        lambda text: ' '.join(map(lemmatizer.lemmatize, text.split()))
    )

In [None]:
from hazm import word_tokenize

# Tokenize the training names with hazm's word_tokenize and store the result
# back in the `name1` column.
tokenized_train_names = train['name1'].apply(word_tokenize)
train['name1'] = tokenized_train_names


In [None]:
# Tokenize the test names with the same hazm tokenizer used for the training
# names in the preceding cell, so both splits reach the Keras tokenizer in the
# same form. This cell previously re-ran the lemmatization step, which had
# already been applied to `test` together with `train`.
test['name1'] = test['name1'].apply(word_tokenize)

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



# Fit the Keras vocabulary on the training names only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['name1'])

# Encode each training name as a sequence of integer token ids.
sequences = tokenizer.texts_to_sequences(train['name1'])

# Pad every sequence to the length of the longest training sequence.
max_sequence_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Keep the padded ids alongside the original rows, one Python list per row.
train['padded_name1'] = padded_sequences.tolist()

print(train)


In [None]:
# Encode the test names with the tokenizer fitted on the training data, then
# pad to the training maximum length so both splits share one input width.
test_sequences = tokenizer.texts_to_sequences(test['name1'])
test_padded_sequences = pad_sequences(
    test_sequences,
    maxlen=max_sequence_length,
)

In [None]:

from sklearn.model_selection import train_test_split
# Feature matrix: one row of padded token ids per training example.
X = np.array(list(train['padded_name1']))

# Target vector: the `cat_id` value of each training example.
y = train['cat_id'].to_numpy(copy=True)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.layers import LSTM, Dropout
# Layers used by the model below that the cell did not previously import.
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Input
from tensorflow.keras.regularizers import l2

# Hyperparameters
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size (+1 for padding)
embedding_dim = 60  # Dimensionality of the embedding space
max_sequence_length = X.shape[1]  # Maximum sequence length (from padded sequences)
num_categories = len(np.unique(y))  # Number of output categories
# NOTE(review): sparse_categorical_crossentropy expects integer labels in
# [0, num_categories); confirm `cat_id` is already encoded that way.

# Convolution / regularisation settings. These names were referenced by the
# model but never defined, so the cell raised NameError on a fresh kernel.
# The values are common starting points and are meant to be tuned.
num_filters = 128  # Number of convolution filters
# With the default 'valid' padding the kernel cannot be wider than the input.
kernel_size = min(3, max_sequence_length)
dropout_rate = 0.5  # Fraction of units dropped before the output layer

# 1D-CNN text classifier: embed token ids, convolve over the sequence, take
# the max over time, then classify with a small dense head.
# Declaring the input shape up front builds the model immediately so that
# `model.summary()` can report layer output shapes and parameter counts.
model = Sequential([
    Input(shape=(max_sequence_length,)),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(dropout_rate),
    Dense(num_categories, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display the model architecture
model.summary()


In [None]:
import matplotlib.pyplot as plt


from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop once validation loss has not improved for 3 epochs and roll back to the
# best weights; also persist the best model (by validation loss) to disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)

# Train on the split produced by train_test_split (X_train / y_train); the
# previously referenced X_train_final / y_train_final are not defined anywhere
# in the notebook. The callbacks must be passed to fit() to take effect.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping, checkpoint],
    verbose=1
)

# Side-by-side learning curves: accuracy on the left, loss on the right.
fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))

ax_accuracy.plot(history.history['accuracy'])
ax_accuracy.plot(history.history['val_accuracy'])
ax_accuracy.set_title('Model accuracy')
ax_accuracy.set_xlabel('Epoch')
ax_accuracy.set_ylabel('Accuracy')
ax_accuracy.legend(['Train', 'Validation'])

ax_loss.plot(history.history['loss'])
ax_loss.plot(history.history['val_loss'])
ax_loss.set_title('Model loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend(['Train', 'Validation'])

plt.show()
