In [34]:
import pandas as pd
from random import shuffle
import re
import numpy as np

In [35]:
dataorg = pd.read_csv("hintest.csv")

In [36]:
pd.isnull(dataorg).sum()

marwadi    0
hindi      3
dtype: int64

In [37]:
dataorg = dataorg.dropna()

In [39]:
data = pd.DataFrame()
data["hindi_sentance"] = dataorg["hindi"]
data["marwadi_sentence"] = dataorg["marwadi"]
data.head(10)

Unnamed: 0,hindi_sentance,marwadi_sentence
0,अच्छा,आछौ
1,आम,आंबौ
2,आकाश,आब
3,बहुत दूर,फिर
4,बचाओ!,बचा
5,कूदो,कूद
6,नमस्कार।,राम राम सा
7,वाह-वाह!,वाह-वाह
8,समझे कि नहीं?,समझे मे आयो
9,मैं ठीक हूँ।,मे चॉको हु


In [71]:
data

Unnamed: 0,hindi_sentance,marwadi_sentence
0,START_ अच्छा _END,आछौ
1,START_ आम _END,आंबौ
2,START_ आकाश _END,आब
3,START_ बहुत दूर _END,फिर
4,START_ बचाओ _END,बचा
...,...,...
991,START_ तुम्हारी शादी कब हुई थी _END,थारी शादी हुगी
992,START_ आप वहाँ जाएँगे क्या _END,थे बटे जावो काईं
993,START_ हाँ मुझे यह बहुत पसंद है। _END,हा मन ए कल्डा चोखा लागे
994,START_ आपको लाईन में लगकर इंतेज़ार करना पड़ेगा...,थाने लाईन में लागर अढिकणो पङी


In [40]:
data["hindi_sentance"].apply(lambda x: x.lower())
data["marwadi_sentence"].apply(lambda x: x.lower())

0                                 आछौ
1                               आंबौ 
2                                 आब 
3                                 फिर
4                                 बचा
                    ...              
991                   थारी शादी हुगी?
992                 थे बटे जावो काईं?
993          हा मन ए कल्डा चोखा लागे.
994    थाने लाईन में लागर अढिकणो पङी.
995               तु मन पागल कर दियो.
Name: marwadi_sentence, Length: 993, dtype: object

In [41]:
data["hindi_sentance"].apply(lambda x: re.sub("''",'',x))
data["marwadi_sentence"].apply(lambda x: re.sub("''",'',x))

0                                 आछौ
1                               आंबौ 
2                                 आब 
3                                 फिर
4                                 बचा
                    ...              
991                   थारी शादी हुगी?
992                 थे बटे जावो काईं?
993          हा मन ए कल्डा चोखा लागे.
994    थाने लाईन में लागर अढिकणो पङी.
995               तु मन पागल कर दियो.
Name: marwadi_sentence, Length: 993, dtype: object

In [42]:
import string
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
data['hindi_sentance']=data['hindi_sentance'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
data['marwadi_sentence']=data['marwadi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [43]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', string.digits)
data['hindi_sentance']=data['hindi_sentance'].apply(lambda x: x.translate(remove_digits))
data['marwadi_sentence']=data['marwadi_sentence'].apply(lambda x: x.translate(remove_digits))

data['hindi_sentance'] = data['hindi_sentance'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))
data['marwadi_sentence'] = data['marwadi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
data['hindi_sentance']=data['hindi_sentance'].apply(lambda x: x.strip())
data['marwadi_sentence']=data['marwadi_sentence'].apply(lambda x: x.strip())
data['hindi_sentance']=data['hindi_sentance'].apply(lambda x: re.sub(" +", " ", x))
data['marwadi_sentence']=data['marwadi_sentence'].apply(lambda x: re.sub(" +", " ", x))

In [44]:
data['hindi_sentance'] = data['hindi_sentance'].apply(lambda x : 'START_ '+ x + ' _END')

In [45]:
### Get Hinid and Marwadi Vocabulary
all_hin_words=set()
for eng in data['hindi_sentance']:
    for word in eng.split():
        if word not in all_hin_words:
            all_hin_words.add(word)

all_mar_words=set()
for hind in data['marwadi_sentence']:
    for word in hind.split():
        if word not in all_mar_words:
            all_mar_words.add(word)

In [46]:
x_vocab = len(all_hin_words)

In [47]:
y_vocab = len(all_mar_words)

In [48]:
MAX_HIN_LEN = 0
for i in data["hindi_sentance"]:
    if len(i.split())>MAX_HIN_LEN:
        MAX_HIN_LEN = len(i.split())
MAX_HIN_LEN

14

In [49]:
MAX_MAR_LEN = 0
for i in data["marwadi_sentence"]:
    if len(i.split())>MAX_MAR_LEN:
        MAX_MAR_LEN = len(i.split())
MAX_MAR_LEN

10

In [50]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_tokenizer = Tokenizer(num_words=x_vocab)
x_tokenizer.fit_on_texts(data["hindi_sentance"].values)
X = x_tokenizer.texts_to_sequences(data["hindi_sentance"].values)
X = pad_sequences(X, maxlen=MAX_HIN_LEN, padding="post")
X.shape

(993, 14)

In [51]:
y_tokenizer = Tokenizer(num_words=y_vocab)
y_tokenizer.fit_on_texts(data["marwadi_sentence"].values)
Y = y_tokenizer.texts_to_sequences(data["marwadi_sentence"].values)
Y = pad_sequences(Y, maxlen=MAX_MAR_LEN, padding="post")
Y.shape

(993, 10)

In [52]:
from sklearn.model_selection import train_test_split
x_tr, x_val, y_tr, y_val = train_test_split(X, Y, train_size=0.1, random_state=42, shuffle=True)

In [53]:
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed
from tensorflow.keras import Input
from tensorflow.keras.models import Model

#making encoder model
embedded_dim = 100
latent_dim = 300

encoder_input = Input(shape=(MAX_HIN_LEN,))
encoder_emb_layer = Embedding(x_vocab, embedded_dim)
encoder_emb = encoder_emb_layer(encoder_input)
encoder_lstm = LSTM(latent_dim, recurrent_dropout=0.4, dropout=0.4, return_sequences=True, return_state=True)
encoder_output, state_h, state_c = encoder_lstm(encoder_emb)

#decoding layer
decoder_input = Input(shape=(None,))
decoder_emb_layer = Embedding(y_vocab, embedded_dim)
decoder_emb = decoder_emb_layer(decoder_input)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_output, decoder_state_h, decoder_state_c = decoder_lstm(decoder_emb, initial_state=[state_h, state_c])

decoder_dense = TimeDistributed(Dense(y_vocab, activation="softmax"))
decoder_output = decoder_dense(decoder_output)

model = Model(inputs=[encoder_input, decoder_input], outputs=decoder_output)

In [54]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"])

In [55]:
model.fit([x_tr,y_tr[:,:-1]],y_tr.reshape(y_tr.shape[0],y_tr.shape[1],1)[:,1:], epochs=500, batch_size=256, validation_data=([x_val,y_val[:,:-1]],y_val.reshape(y_val.shape[0],y_val.shape[1],1)[:,1:]))

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x22237209810>

In [64]:
#making inference model for prediction

#for encoder model
encoder_model = Model(inputs=[encoder_input], outputs=[encoder_output,state_h,state_c])

decoder_internal_state_h = Input(shape=(latent_dim,))
decoder_internal_state_c = Input(shape=(latent_dim,))
decoder_internal_states = [decoder_internal_state_h, decoder_internal_state_c]


dec_emb_2 = decoder_emb_layer(decoder_input)
decoder_output, state_h2, state_c2 = decoder_lstm(dec_emb_2, initial_state=[decoder_internal_state_h, decoder_internal_state_c])
decoder_output = decoder_dense(decoder_output)
#for decoder model
decoder_model = Model(inputs=[decoder_input,decoder_internal_states], outputs=[decoder_output,state_h2,state_c2])

In [65]:
reverse_target_word_index=y_tokenizer.index_word
reverse_source_word_index=x_tokenizer.index_word
target_word_index=y_tokenizer.word_index

In [66]:
def decode_sequence(input_seq):
    e_out, e_h, e_c = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = 1  # Use the index 1 for the 'start' token directly
    
    stop_condition = False
    decoded_seq = ''
    
    while not stop_condition:
        output_tokens, state_h2, state_c2 = decoder_model.predict([target_seq, e_h, e_c])
        sampled_token_index = np.argmax(output_tokens)
        
        if sampled_token_index == 0:  # Break the loop if the sampled token index is 0 (padding token)
            break
        
        sampled_token = reverse_target_word_index[sampled_token_index]
        
        if sampled_token != 'end':
            decoded_seq += ' ' + sampled_token
        
        if sampled_token == 'end' or len(decoded_seq.split()) >= MAX_HIN_LEN:
            stop_condition = True
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        
        e_h, e_c = state_h2, state_c2
    
    return decoded_seq


In [68]:
def seq2text(input_seq):
    newString=''
    for i in input_seq:
        if(i!=0):
            newString=newString+reverse_source_word_index[i]+' '
    return newString

In [69]:
def seq2mar(input_seq):
    newString=''
    for i in input_seq:
        if(i!=0):
            newString=newString+reverse_target_word_index[i]+' '
    return newString

In [72]:
for i in range(1,50):  # Adjust the range based on the number of examples you want to test
    print('Input Hindi sentence:', seq2text(x_tr[i]))
    print("Real Marwadi translation: ", seq2mar(y_tr[i]))
    print("Predicted summary:", decode_sequence(x_tr[i].reshape(1, MAX_HIN_LEN)))
    print("\n")


Input Hindi sentence: start मैं आसपड़ोस में रहता हूँ। end 
Real Marwadi translation:  मैं अट कने ही रु 
Predicted summary:  मन रो चोखो है


Input Hindi sentence: start वह बीमार नहीं हो सकता। end 
Real Marwadi translation:  बो बीमार कोनी हु सके 
Predicted summary:  करो बिंया है


Input Hindi sentence: start उसको पैसों की कमी थी। end 
Real Marwadi translation:  बिन रिप्या की कमी ही। 
Predicted summary:  रिप्या रो सोरो है


Input Hindi sentence: start मुझे पता है उसने क्यों किया था। end 
Real Marwadi translation:  मने धयन है बि काई करियो 
Predicted summary:  एक में हु है


Input Hindi sentence: start वह बस आता ही होगा। end 
Real Marwadi translation:  बो बस आतो ही हुगो 
Predicted summary:  एक में ही है


Input Hindi sentence: start मैं तुम्हारे जितना लम्बा हूँ। end 
Real Marwadi translation:  मैं थारे जीतो लम्बो हू 
Predicted summary:  रिप्या रो सोरो है


Input Hindi sentence: start कर भला तो हो भला। end 
Real Marwadi translation:  करो भलो हुई भलो 
Predicted summary:  भलो हुई


Input Hindi