In [6]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from sklearn.model_selection import train_test_split

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [11]:
# Load the dataset
df = pd.read_csv('edited_data.csv')

In [13]:
df.head()

Unnamed: 0,Chapter No,Verse No,Shloka,English Translation,Explanation
0,1,1,धृतराष्ट्र उवाच | धर्मक्षेत्रे कुरुक्षेत्रे सम...,"Dhritarastra said: O Sanjaya, what did my sons...",The two armies had gathered on the battlefield...
1,1,2,सञ्जय उवाच । दृष्ट्वा तु पाण्डवानीकं व्यूढं दु...,"Sanjaya said: But then, seeing the army of the...","Sanjay understood Dhritarashtra’s concern, who..."
2,1,3,पश्यैतां पाण्डुपुत्राणामाचार्य महतीं चमूम् । व...,"O teacher, (please) see this vast army of the ...",Duryodhana asked Dronacharya to look at the sk...
3,1,4,अत्र शूरा महेष्वासा भीमार्जुनसमा युधि | युयुधा...,"There are in this army, heroes wielding great ...","Due to his anxiety, the Pandava army seemed mu..."
4,1,5,धृष्टकेतुश्चेकितान: काशिराजश्च वीर्यवान् | पुर...,"Dhrstaketu, Cekitana, and the valiant king of ...","Due to his anxiety, the Pandava army seemed mu..."


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Chapter No           700 non-null    int64 
 1   Verse No             700 non-null    int64 
 2   Shloka               700 non-null    object
 3   English Translation  700 non-null    object
 4   Explanation          700 non-null    object
dtypes: int64(2), object(3)
memory usage: 27.5+ KB


In [15]:
max_length = max([len(verse.split()) for verse in df['Shloka']])
num_classes = df['Chapter No'].nunique()

In [19]:
X = df['Shloka']
y = pd.get_dummies(df['Chapter No']).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


In [22]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
vocab_size = len(tokenizer.word_index) + 1

In [24]:
embedding_dim = 100
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=128))
model.add(Dense(units=num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [25]:
# Convert text to sequences of integers
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to the same length
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')


In [26]:
#After preprocessing the data, you can train the model using the fit() method
model.fit(X_train_seq_padded, y_train, validation_data=(X_test_seq_padded, y_test), epochs=10, batch_size=64)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x14904453cd0>

In [27]:
#evaluation
loss, accuracy = model.evaluate(X_test_seq_padded, y_test)
print('Test loss:', loss)
print('Test accuracy:', accuracy)


Test loss: 3.188405990600586
Test accuracy: 0.08571428805589676


The test loss is 3.188 and the test accuracy is 0.086. This means that the model is not performing very well and needs improvement. The accuracy of 0.086 indicates that the model is correctly predicting the chapter number of only about 8.6% of the verses in the test set. 

Training for model's better perfomance 

In [29]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [42]:
df = pd.read_csv('edited_data.csv')
max_length = max([len(verse.split()) for verse in df['Shloka']])
num_classes = df['Chapter No'].nunique()
X = df['Shloka']
y = pd.get_dummies(df['Chapter No']).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
vocab_size = len(tokenizer.word_index) + 1
# Try different embedding dimensions
embedding_dim = 300

# Use pre-trained word embeddings such as GloVe or Word2Vec
# embedding_matrix = create_embedding_matrix('path/to/embeddings_file', tokenizer.word_index, embedding_dim)

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=128))
model.add(Dropout(0.2))

Changing the embedding dimension can help the model performance in a few ways. The embedding layer is responsible for learning the relationship between words in the input text, and the embedding dimension represents the size of the vector that represents each word. A higher embedding dimension can capture more complex relationships between words, but can also increase the number of parameters in the model, making it more computationally expensive and potentially leading to overfitting.

In some cases, a higher embedding dimension may lead to better performance, as it allows the model to capture more nuanced relationships between words. However, the optimal embedding dimension can vary depending on the size of the dataset, the complexity of the task, and the specific language used in the input text. Therefore, it's important to experiment with different embedding dimensions to find the one that works best for a particular task.

In [43]:
# Add more dense layers after the LSTM layer to increase model capacity
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=num_classes, activation='softmax'))

In [44]:
# Use regularization techniques such as dropout to prevent overfitting
# model.add(Dropout(0.2))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Convert text to sequences of integers
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [45]:
# Pad sequences to the same length
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Increase the number of epochs to allow the model to learn more from the training data
model.fit(X_train_seq_padded, y_train, validation_data=(X_test_seq_padded, y_test), epochs=50, batch_size=64)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x14914115f90>

In [46]:
# evaluation
loss, accuracy = model.evaluate(X_test_seq_padded, y_test)
print('Test loss:', loss)
print('Test accuracy:', accuracy)

Test loss: 9.107884407043457
Test accuracy: 0.11428571492433548


Overfitting occured - 

In [41]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, Conv1D, MaxPooling1D
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from gensim.models import KeyedVectors

# Load pre-trained word embeddings
word_vectors = KeyedVectors.load_word2vec_format('path/to/embedding/file', binary=True)

df = pd.read_csv('edited_data.csv')

max_length = max([len(verse.split()) for verse in df['Shloka']])
num_classes = df['Chapter No'].nunique()

X = df['Shloka']
y = pd.get_dummies(df['Chapter No']).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 300

# Create embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word_vectors.vocab:
        embedding_matrix[i] = word_vectors[word]

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))

# Add multiple LSTM layers and a dropout layer
model.add(LSTM(units=128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=128))

# Add dense layers with different activation functions
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile model with different optimizer and learning rate
from keras.optimizers import Adam
opt = Adam(learning_rate=0.0005)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

# Increase batch size and number of epochs
model.fit(X_train_seq_padded, y_train, validation_data=(X_test_seq_padded, y_test), epochs=20, batch_size=128)

# Add a bidirectional LSTM layer and a convolutional layer
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(units=128)))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile model with different optimizer and learning rate
from keras.optimizers import SGD
opt = SGD(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

# Increase batch size and number of epochs
model.fit(X_train_seq_padded, y_train, validation_data=(X_test_seq_padded, y_test), epochs=30, batch_size=256)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_seq_padded, y_test)
print('Test loss:', loss)
print('Test accuracy:', accuracy)


ModuleNotFoundError: No module named 'gensim'