In [1]:
#prepare the data
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the CSV file into a Pandas dataframe
df = pd.read_csv('edited_data.csv')

# Split the data into training and validation sets
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)


In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer for the transformer model
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Load the pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')


Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 4.54kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)lve/main/config.json: 100%|██████████| 483/483 [00:00<?, ?B/s] 
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:01<00:00, 227kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:02<00:00, 221kB/s]
Downloading pytorch_model.bin: 100%|██████████| 268M/268M [11:54<00:00, 375kB/s] 
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you 

In [10]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [11]:
# Load the dataset
df = pd.read_csv('edited_data.csv')
max_length = max([len(verse.split()) for verse in df['Shloka']])
num_classes = df['Chapter No'].nunique()
X = df['Shloka']
y = pd.get_dummies(df['Chapter No']).values

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the input data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
vocab_size = len(tokenizer.word_index) + 1
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)


In [13]:
# Pad sequences to the same length
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Define the model architecture
embedding_dim = 300
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=num_classes, activation='softmax'))

In [14]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_seq_padded, y_train, validation_data=(X_test_seq_padded, y_test), epochs=20, batch_size=64)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x17ec6e91610>

In [15]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_seq_padded, y_test)
print('Test loss:', loss)
print('Test accuracy:', accuracy)

Test loss: 5.889999866485596
Test accuracy: 0.12857143580913544


In [20]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from gensim.models import KeyedVectors

# Load the dataset
df = pd.read_csv('edited_data.csv')
max_length = max([len(verse.split()) for verse in df['Shloka']])
num_classes = df['Chapter No'].nunique()
X = df['Shloka']
y = pd.get_dummies(df['Chapter No']).values



In [24]:
# Load the embeddings
word_vectors = {}
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        word_vectors[word] = vector

In [25]:
# Tokenize the input data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
vocab_size = len(tokenizer.word_index) + 1
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [26]:
# Pad sequences to the same length
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Create the embedding matrix
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

In [27]:
# Define the model architecture
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=num_classes, activation='softmax'))

In [28]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
# Train the model
model.fit(X_train_seq_padded, y_train, validation_data=(X_test_seq_padded, y_test), epochs=30, batch_size=64)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x17ef23b4210>

In [30]:
loss, accuracy = model.evaluate(X_test_seq_padded, y_test)
print('Test loss:', loss)
print('Test accuracy:', accuracy)

Test loss: 2.860588312149048
Test accuracy: 0.12857143580913544


In [None]:

# Load the embeddings
word_vectors = {}
with open('glove/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        word_vectors[word] = vector

# Tokenize the input data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
vocab_size = len(tokenizer.word_index) + 1
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to the same length
X_train_seq_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Create the embedding matrix
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

# Define the model architecture
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length, weights=[embedding_matrix], trainable=False))
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_seq_padded, y_train, validation_data=(X_test_seq_padded, y_test), epochs=20, batch_size=64)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_seq_padded, y_test)
print('Test loss:', loss)
print('Test accuracy:', accuracy)


In [21]:
# Download pre-trained GloVe embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d glove

'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [18]:
from gensim.models import KeyedVectors