<a href="https://colab.research.google.com/github/mariyajoseph2002/ml/blob/main/training_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense,Dropout,Flatten,Embedding


In [2]:
import random
# Set all random seeds for reproducibility
np.random.seed(42)        # NumPy operations
random.seed(42)           # Python's built-in random
tf.random.set_seed(42)    # TensorFlow/Keras

In [3]:
from google.colab import files
uploaded = files.upload()

Saving balanced_traindata.txt to balanced_traindata.txt


In [4]:
data=pd.read_csv("balanced_traindata.txt",sep=";")
data.columns=["Text","Emotion"]
data.head(5)

Unnamed: 0,Text,Emotion
0,im feeling uncharacteristically smug to some e...,joy
1,i feel so lame and annoying and generally unli...,sadness
2,i sat there feeling so amazed that i actually ...,surprise
3,i feel like todays sweet treat would be someth...,joy
4,i am a boy i like girls they are pretty and i ...,surprise


In [None]:
data.describe()

Unnamed: 0,Text,Emotion
count,15592,15592
unique,13476,6
top,i feel so amazed ive had views in the past week,joy
freq,7,3000


In [17]:
texts=data["Text"].tolist()
labels=data["Emotion"].tolist()

In [6]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(texts)

In [7]:
import pickle

# Save tokenizer after training
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [12]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)


In [15]:
sequences=tokenizer.texts_to_sequences(texts)
#sequences = [seq if seq else [0] for seq in tokenizer.texts_to_sequences(texts)]
max_length = min(66, max([len(seq) for seq in sequences]))  # Cap at 66 words
#max_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences,maxlen = max_length)

In [20]:
le=LabelEncoder()
labels=le.fit_transform(labels)

In [21]:
one_hot_labels=keras.utils.to_categorical(labels)

In [22]:
#x_train,x_test,y_train,y_test=train_test_split(padded_sequences,one_hot_labels,test_size=0.2)
x_train, x_test, y_train, y_test = train_test_split(
    padded_sequences, one_hot_labels, test_size=0.2, stratify=one_hot_labels
)


In [23]:
print("Emotion to Index Mapping:", le.classes_)


Emotion to Index Mapping: ['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']


In [24]:
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

In [25]:
from keras.regularizers import l2

In [26]:
model=Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,output_dim=128,input_length=max_length))
model.add(Flatten())
model.add(Dense(units=128,activation="relu", kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(units=len(one_hot_labels[0]),activation="softmax"))
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["accuracy"])
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test), callbacks=[early_stopping])




Epoch 1/10
[1m390/390[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m18s[0m 40ms/step - accuracy: 0.2205 - loss: 2.1112 - val_accuracy: 0.4104 - val_loss: 1.5783
Epoch 2/10
[1m390/390[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m21s[0m 41ms/step - accuracy: 0.5004 - loss: 1.4788 - val_accuracy: 0.7272 - val_loss: 1.1690
Epoch 3/10
[1m390/390[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m16s[0m 41ms/step - accuracy: 0.7880 - loss: 1.0080 - val_accuracy: 0.7980 - val_loss: 0.9528
Epoch 4/10
[1m390/390[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m16s[0m 41ms/step - accuracy: 0.8989 - loss: 0.7094 - val_accuracy: 0.8211 - val_loss: 0.8555
Epoch 5/10
[1m390/390[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m15s[0m 38ms/step - accuracy: 0.9315 - loss: 0.5677 - val_accuracy: 0.8272

<keras.src.callbacks.history.History at 0x7e5dd11f1510>

In [27]:
def adjust_softmax(output, temperature=0.5):
    exp_preds = np.exp(np.log(output) / temperature)
    return exp_preds / np.sum(exp_preds)

In [28]:
dummy_input = np.random.rand(1, max_length).astype(np.float32)  # Dummy input to test
raw_output = model.predict(dummy_input)

[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 88ms/step


In [30]:
adjusted_output = adjust_softmax(raw_output)

print("\nüî¢ Raw Model Output:\n", raw_output)
print("\nüî• Adjusted Softmax Output:\n", adjusted_output)


üî¢ Raw Model Output:
 [[0.02020236 0.24990867 0.28555515 0.36824885 0.05374477 0.02234011]]

üî• Adjusted Softmax Output:
 [[0.00144014 0.220376   0.2877277  0.47850278 0.01019234 0.00176105]]


In [None]:
input_text=" I am very happy today! "
input_sequence=tokenizer.texts_to_sequences([input_text])
padded_input_sequence=pad_sequences(input_sequence,maxlen=max_length)
prediction=model.predict(padded_input_sequence)
predicted_label=le.inverse_transform([np.argmax(prediction[0])])
print(predicted_label)

[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 43ms/step
['joy']


In [33]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [34]:
model_path="/content/drive/My Drive/Colab Notebooks/Data/emotionn_model.h5"

In [37]:
model.save(model_path)



In [38]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

Saved artifact at '/tmp/tmp7w6bbij5'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 66), dtype=tf.float32, name='keras_tensor')
Output Type:
  TensorSpec(shape=(None, 6), dtype=tf.float32, name=None)
Captures:
  138941390719312: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138941390718928: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138941390720464: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138941390720848: TensorSpec(shape=(), dtype=tf.resource, name=None)
  138941390721616: TensorSpec(shape=(), dtype=tf.resource, name=None)


In [39]:
with open("emotionn_model.tflite", "wb") as f:
    f.write(tflite_model)

print("TFLite model saved as emotionn_model.tflite")

TFLite model saved as emotionn_model.tflite


In [40]:
from google.colab import files
files.download("emotionn_model.tflite")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [41]:
interpreter = tf.lite.Interpreter(model_path="emotionn_model.tflite")
interpreter.allocate_tensors()

In [42]:
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [43]:
print("\nüîπ INPUT DETAILS:")
for details in input_details:
    print(details)


üîπ INPUT DETAILS:
{'name': 'serving_default_keras_tensor:0', 'index': 0, 'shape': array([ 1, 66], dtype=int32), 'shape_signature': array([-1, 66], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}


In [44]:
print("\nüîπ OUTPUT DETAILS:")
for details in output_details:
    print(details)


üîπ OUTPUT DETAILS:
{'name': 'StatefulPartitionedCall_1:0', 'index': 17, 'shape': array([1, 6], dtype=int32), 'shape_signature': array([-1,  6], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}


In [46]:
dummy_input = np.random.rand(1, 66).astype(np.float32)  # Adjust 66 based on model input shape
output = model.predict(dummy_input)

# Print output order
print("\nModel Output Order:\n", output)

[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 36ms/step

Model Output Order:
 [[0.02020236 0.24990867 0.28555515 0.36824885 0.05374477 0.02234011]]


[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 42ms/step

Model Output:
 [[0.0980273  0.13754573 0.26914057 0.34854153 0.09973297 0.04701192]]


In [47]:
import numpy as np

# Create test sentences
test_sentences = ["I am very happy today!", "I feel so sad and lonely.", "That was a scary experience."]

# Tokenize and pad them just like training data
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')




In [48]:
# Get predictions
predictions = model.predict(test_padded)


[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 92ms/step


In [49]:
# Apply to predictions
adjusted_predictions = np.array([adjust_softmax(pred, temperature=0.5) for pred in predictions])

# Get final predicted index
emotion_indices = np.argmax(adjusted_predictions, axis=1)

In [57]:
for sentence in test_sentences:
    print(tokenizer.texts_to_sequences([sentence]))  # See if words are properly tokenized


[[1, 24, 53, 196, 95]]
[[1, 2, 18, 307, 3, 444]]
[[8, 14, 6, 2654, 415]]


In [59]:
print(data["Emotion"].value_counts())  # Check if emotions are balanced


Emotion
joy         3000
sadness     3000
surprise    2400
love        2400
anger       2398
fear        2394
Name: count, dtype: int64


In [58]:
print(tokenizer.word_index["happy"])   # Should match the ID for "happy" in tokenized output
print(tokenizer.word_index["sad"])     # Should match the ID for "sad"
print(tokenizer.word_index["scary"])   # Should match the ID for "scary"


196
307
2654


In [62]:
print(tokenizer.word_index.get("scary"))
print(tokenizer.texts_to_sequences(["That was a scary experience."]))

2654
[[8, 14, 6, 2654, 415]]


In [51]:
for text, pred, adj_pred, idx in zip(test_sentences, predictions, adjusted_predictions, emotion_indices):
    print(f"üìù Input: {text} \nüîÆ Raw Output: {pred} \nüî• Adjusted Output: {adj_pred} \nüî¢ Predicted Index: {idx}\n")

üìù Input: I am very happy today! 
üîÆ Raw Output: [0.01891927 0.24171926 0.2693587  0.3957071  0.05078174 0.02351395] 
üî• Adjusted Output: [0.00122979 0.20074554 0.24927884 0.53798616 0.0088601  0.00189965] 
üî¢ Predicted Index: 3

üìù Input: I feel so sad and lonely. 
üîÆ Raw Output: [0.01841077 0.24687839 0.2521858  0.40637392 0.0489405  0.02721069] 
üî• Adjusted Output: [0.00115621 0.20790268 0.21693774 0.5633075  0.00817016 0.00252565] 
üî¢ Predicted Index: 3

üìù Input: That was a scary experience. 
üîÆ Raw Output: [0.01880872 0.24017277 0.23763384 0.42490643 0.05068022 0.02779799] 
üî• Adjusted Output: [0.00118558 0.1933119  0.1892464  0.60505885 0.00860772 0.00258963] 
üî¢ Predicted Index: 3



In [52]:
input_text="  I feel so sad and lonely. "
input_sequence=tokenizer.texts_to_sequences([input_text])
padded_input_sequence=pad_sequences(input_sequence,maxlen=max_length)
prediction=model.predict(padded_input_sequence)
predicted_label=le.inverse_transform([np.argmax(prediction[0])])
print(predicted_label)

[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 86ms/step
['sadness']


In [53]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
model_path="/content/drive/My Drive/Colab Notebooks/Data/emotionn_model.h5"
model.save(model_path)



In [56]:
files.download(model_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
print("\nEmotion distribution:\n", data["Emotion"].value_counts())
dummy_input = np.random.rand(1, 66).astype(np.float32)  # Adjust 66 based on model input shape
output = model.predict(dummy_input)

# Print output order
print("\nModel Output Order:\n", output)


Emotion distribution:
 Emotion
joy         3000
sadness     3000
surprise    2400
love        2400
anger       2398
fear        2394
Name: count, dtype: int64
[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 61ms/step

Model Output Order:
 [[0.3676894  0.15944749 0.2444347  0.14175314 0.03707249 0.04960279]]


In [None]:
print(tokenizer.texts_to_sequences(["happy", "sad", "angry"]))


[[196], [307], [263]]


In [61]:
import numpy as np

# Create test sentences
test_sentences = ["I am very happy today!", "I feel so sad and lonely.", "That was a scary experience.","I feel so intimidated by his actions"]

# Tokenize and pad them just like training data
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length)

# Get predictions
predictions = model.predict(test_padded)

# Convert predictions to emotion labels
predicted_labels = le.inverse_transform(np.argmax(predictions, axis=1))

# Print results
for text, label, pred in zip(test_sentences, predicted_labels, predictions):
    print(f"üìù Input: {text} \nüîÆ Predicted Emotion: {label} \nüî¢ Raw Output: {pred}\n")


[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 38ms/step
üìù Input: I am very happy today! 
üîÆ Predicted Emotion: joy 
üî¢ Raw Output: [0.00835002 0.07188448 0.5765297  0.25955978 0.07846018 0.00521597]

üìù Input: I feel so sad and lonely. 
üîÆ Predicted Emotion: sadness 
üî¢ Raw Output: [0.02014122 0.00208538 0.07085428 0.0111003  0.8931015  0.00271722]

üìù Input: That was a scary experience. 
üîÆ Predicted Emotion: joy 
üî¢ Raw Output: [0.01268039 0.40368217 0.52362907 0.04468686 0.00993078 0.00539085]

üìù Input: I feel so intimidated by his actions 
üîÆ Predicted Emotion: fear 
üî¢ Raw Output: [3.4265917e-02 9.5649606e-01 7.4476073e-03 6.7920645e-04 9.5861644e-05
 1.0152187e-03]

