In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed




In [2]:
# Load the word / split pairs; the path is resolved relative to the working directory.
data = pd.read_excel('Sandhi.xlsx')

# Fail early, with a readable message, if the sheet lacks the two columns
# that the following cells index by name.
missing_cols = {'Sanskrit', 'Sandhi_split'} - set(data.columns)
assert not missing_cols, f"Sandhi.xlsx is missing expected columns: {sorted(missing_cols)}"

# Preview a few rows instead of dumping the whole frame into the notebook.
data.head(10)

Unnamed: 0,Sanskrit,Sandhi_split
0,गच्छामि,"गच्छ,अमि"
1,भवार्जुन,"भव,अर्जुन"
2,भीष्मासभरक्षितम्,"भीष्म,असभ,रक्षितम्"
3,पाण्डवाश्चैव,"पाण्डवास्,च,एव"
4,प्रथमोऽध्यायः,"प्रथमस्,अध्यायस"
5,ॐ,ओम
6,ज्ञानयोगेन,"ज्ञान,योगेन"
7,कृष्णैकत्वम्,"कृष्ण,एकत्वम्"
8,विद्यालयः,"विद्या,आलयः"
9,नमसीश्वरम्,"नमसि,ईश्वरम्"


In [3]:
# Model input: the joined surface form; target: its comma-separated split
# (e.g. 'विद्यालयः' -> 'विद्या,आलयः' in the preview above).
X, y = data['Sanskrit'], data['Sandhi_split']

In [None]:
# Seed Python, NumPy and TensorFlow in one call so that weight initialisation,
# dropout and batch shuffling repeat across "Restart & Run All".
SEED = 42
set_random_seed(SEED)

# Character-level tokenisation: every distinct character seen in X gets an
# integer id starting at 1 (id 0 is never assigned and is used for padding).
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(X)

# Convert characters to sequences of integers
X_seq = tokenizer.texts_to_sequences(X)

# Pad with 0 (at the front, the pad_sequences default) so every row has the
# length of the longest word.
X_padded = pad_sequences(X_seq)

# Label encoding for target
# NOTE(review): every distinct 'Sandhi_split' string becomes its own class, so
# the network can only ever answer with a split that occurred in training.
# The recorded test accuracy of 0.00 is consistent with test labels being
# absent from the training split. Generalising to unseen words would need a
# sequence-to-sequence formulation -- TODO.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y_encoded, test_size=0.2, random_state=SEED
)

# Make the limitation described above visible in the notebook output.
n_unseen = int((~np.isin(y_test, y_train)).sum())
if n_unseen:
    print(
        f"Warning: {n_unseen} of {len(y_test)} test labels never occur in the "
        "training split; the classifier cannot predict them."
    )

# Building the RNN model
vocab_size = len(tokenizer.word_index) + 1  # +1 for the reserved padding id 0
max_len = X_padded.shape[1]  # reused later to pad inputs at prediction time

model = Sequential([
    # mask_zero=True lets the LSTM skip the padding positions instead of
    # treating id 0 as a real character. The deprecated `input_length`
    # argument is not needed: the sequence length is inferred from the data.
    Embedding(vocab_size, 50, mask_zero=True),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    # One output unit per distinct split string known to the label encoder.
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Integer class ids (not one-hot vectors) -> sparse categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [17]:
# Train for 10 epochs; validation_split=0.2 holds out the last 20% of the
# training arrays for per-epoch validation.
# Keep the returned History object so the loss/accuracy curves can be
# inspected afterwards (this also avoids the bare object repr as cell output).
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x20923c2da50>

In [18]:
# Score the held-out split. evaluate() returns the loss followed by the
# metrics passed to compile(), which here is accuracy only.
loss, accuracy = model.evaluate(
    x=X_test,
    y=y_test,
)
print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.00


In [19]:
# Words to split; more entries can be added to predict several at once.
example_text = ["विद्यालयः"]

# Encode exactly as the training data was encoded: character ids, then padding
# to the training length (pad_sequences truncates longer inputs to max_len).
example_seq = tokenizer.texts_to_sequences(example_text)
example_padded = pad_sequences(example_seq, maxlen=max_len)

# One row of class probabilities per input word.
prediction = model.predict(example_padded)

# Take the argmax along the class axis so each input gets its own class id.
# A bare np.argmax(prediction) flattens the 2-D array and is only correct
# when there is a single example.
predicted_class = label_encoder.inverse_transform(np.argmax(prediction, axis=-1))

for word, split in zip(example_text, predicted_class):
    print(f"Prediction for '{word}': {split}")

Prediction for 'विद्यालयः': भानु,उदयः
