<a href="https://colab.research.google.com/github/mircii/MILO/blob/main/01_src/02_colab/MILO_Licenta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**MILO**
**Release v1.0**

- Machine Learning Model
- Conversational NLP
- Trained on https://www.kaggle.com/datasets/kreeshrajani/3k-conversations-dataset-for-chatbot translated in Romanian

## Translating to romanian and converting the dataset to .json

In [None]:
pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googl

In [None]:
import pandas as pd
from googletrans import Translator
import json
import time  # To add a delay between requests
import random  # To introduce variability in delays

# Load the dataset
df = pd.read_csv('Conversation.csv')

# Initialize the translator
translator = Translator()

# Function to translate text
def translate_text(text, target_language="ro"):
    if not text:  # Check if the text is empty or None
        return ""  # Return empty string if there's no text to translate

    try:
        translated = translator.translate(text, src='en', dest=target_language)
        return translated.text
    except Exception as e:
        print(f"Error during translation: {e}")
        return ""  # Return empty string in case of error

# Create a list to store the translated conversations
translated_conversations = []

# Loop through each row in the dataframe
for index, row in df.iterrows():
    # Extracting question and answer
    question = row.get('question', '')
    answer = row.get('answer', '')

    # Translate the question and answer
    question_ro = translate_text(question, 'ro')
    answer_ro = translate_text(answer, 'ro')

    # Add the translated conversation to the list
    translated_conversations.append({
        "question_ro": question_ro,
        "answer_ro": answer_ro
    })


# Save the translated conversations to a JSON file
with open('Conversation_romanian.json', 'w', encoding='utf-8') as f:
    json.dump(translated_conversations, f, ensure_ascii=False, indent=4)

print("Translated conversations have been saved to 'Conversation_romanian.json'")


Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Error during translation: The read operation timed out
Error during translation: the JSON object must be str, bytes or bytearray, not NoneType
Translated conversations have been saved to 'Conversation_romanian.json'


## Preprocessing the data.

In [1]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Extracting the questions and answers
with open('Conversation_romanian.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

questions = [item['question_ro'] for item in data]
answers = [item['answer_ro'] for item in data]

In [2]:
# Visualizing the first 10 questions and answers

print("First 10 questions:", questions[:10])
print("First 10 answers:", answers[:10])

First 10 questions: ['Bună, ce mai faci?', 'Sunt bine.Ce zici de tine?', 'Sunt destul de bun.Mulțumesc că ai întrebat.', 'nici o problemă.Deci, cum ai fost?', 'Am fost grozav.şi tu?', 'Am fost bun.Sunt la școală chiar acum.', 'La ce școală mergi?', 'Merg la PCC.', 'Îți place acolo?', 'e în regulă.Este un campus cu adevărat mare.']
First 10 answers: ['Sunt bine.Ce zici de tine?', 'Sunt destul de bun.Mulțumesc că ai întrebat.', 'nici o problemă.Deci, cum ai fost?', 'Am fost grozav.şi tu?', 'Am fost bun.Sunt la școală chiar acum.', 'La ce școală mergi?', 'Merg la PCC.', 'Îți place acolo?', 'e în regulă.Este un campus cu adevărat mare.', 'Noroc cu școala.']


## Tokenization.

In [3]:
tokenizer = Tokenizer(oov_token='<OOV>')

# Fit the tokenizer on both questions and answers
tokenizer.fit_on_texts(questions + answers)

VOCAB_SIZE = len(tokenizer.word_index) + 1  # +1 to include padding/OOV token

# Convert text to sequences
question_sequences = tokenizer.texts_to_sequences(questions)
answer_sequences = tokenizer.texts_to_sequences(answers)

word_index = tokenizer.word_index

In [4]:
# Visualizing the vocabulary
print(f"Vocabulary Size: {VOCAB_SIZE}")
print("First 10 words in vocabulary:", list(word_index.items())[:10])

Vocabulary Size: 3598
First 10 words in vocabulary: [('<OOV>', 1), ('de', 2), ('nu', 3), ('să', 4), ('o', 5), ('este', 6), ('a', 7), ('ce', 8), ('că', 9), ('am', 10)]


## Padding the sequences.

In [5]:
MAX_sequence_length = max(len(seq) for seq in question_sequences + answer_sequences)

question_padded = pad_sequences(question_sequences, maxlen=MAX_sequence_length, padding='post')
answer_padded = pad_sequences(answer_sequences, maxlen=MAX_sequence_length, padding='post')

In [6]:
# Visualizing the padding

print("First padded question sequence:", question_padded[0])
print("First padded answer sequence:", answer_padded[0])

First padded question sequence: [95  8 21 82  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
First padded answer sequence: [ 19  20   8 229   2  67   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0]


## Training & Testing Sets.

In [7]:
# Using train test spilt in oreder to create the sets
q_train, q_test, a_train, a_test = train_test_split(question_padded, answer_padded, test_size=0.2, random_state=42)

In [8]:
# Visualizing the train & test sets shapres and lengths
print("Training questions shape:", q_train.shape)
print("Training answers shape:", a_train.shape)
print("Testing questions shape:", q_test.shape)
print("Testing answers shape:", a_test.shape)

print("Length of training questions:", len(q_train))
print("Length of training answers:", len(a_train))
print("Length of testing questions:", len(q_test))
print("Length of testing answers:", len(a_test))

Training questions shape: (2980, 21)
Training answers shape: (2980, 21)
Testing questions shape: (745, 21)
Testing answers shape: (745, 21)
Length of training questions: 2980
Length of training answers: 2980
Length of testing questions: 745
Length of testing answers: 745


## Building the Neural Network

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Defining some Hyperparameters
EMBEDDING_DIM = 256
GRU_UNITS = 128
EPOCHS = 10
BATCH_SIZE = 64

MILO_01 = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
    tf.keras.layers.GRU(GRU_UNITS, return_sequences=True),
    tf.keras.layers.GRU(GRU_UNITS, return_sequences=True),
    tf.keras.layers.Dense(VOCAB_SIZE)
])

MILO_01.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.05, patience=1, restore_best_weights=True)

MILO_01.fit(q_train, a_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data=(q_test, a_test))

Epoch 1/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 554ms/step - accuracy: 0.6289 - loss: 3.9717 - val_accuracy: 0.6877 - val_loss: 2.3644
Epoch 2/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 480ms/step - accuracy: 0.6913 - loss: 2.2690 - val_accuracy: 0.6913 - val_loss: 2.3545
Epoch 3/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 452ms/step - accuracy: 0.6943 - loss: 2.2044 - val_accuracy: 0.6917 - val_loss: 2.4024
Epoch 4/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 443ms/step - accuracy: 0.6954 - loss: 2.1312 - val_accuracy: 0.6921 - val_loss: 2.3905
Epoch 5/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 499ms/step - accuracy: 0.6949 - loss: 2.1091 - val_accuracy: 0.6904 - val_loss: 2.4131
Epoch 6/10
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 452ms/step - accuracy: 0.6944 - loss: 2.0890 - val_accuracy: 0.6906 - val_loss: 2.4540
Epoch 7/10
[1m47/47[

<keras.src.callbacks.history.History at 0x7c6c8a704690>

In [10]:
MILO_01.summary()

## Saving the model

In [13]:
MILO_01.save('MILO_01.keras')

## Making predictions

In [11]:
def chatbot_response(user_input):
  input_sequence = tokenizer.texts_to_sequences([user_input])
  padded_sequence = pad_sequences(input_sequence, maxlen=MAX_sequence_length, padding='post')
  predicted_sequence = MILO_01.predict(padded_sequence)
  response_text = decode_response(predicted_sequence)
  return response_text

def decode_response(predicted_sequence):
  predicted_indices = np.argmax(predicted_sequence, axis=-1)
  response_words = [tokenizer.index_word.get(idx, '') for idx in predicted_indices[0]]
  return ' '.join(response_words)

while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break
    response = chatbot_response(user_input)
    print("MILO_01:", response)


KeyboardInterrupt: Interrupted by user