In [None]:
from sklearn import  preprocessing, linear_model, naive_bayes, metrics, svm,model_selection,decomposition, ensemble
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
 
import numpy as np
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from keras.utils import np_utils
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the raw file into a single 'question' column, one record per row.
# NOTE(review): read_csv splits on ',' by default, so a line that contains a
# comma may not land intact in 'question' — verify against the data file.
dataset = pd.read_csv('/content/drive/MyDrive/data1.txt', names=['question'])

# Each record is split into "<label>:<question text>". Split on the FIRST colon
# only (n=1) so that a question which itself contains ':' is kept whole instead
# of being truncated at its second colon.
new = dataset["question"].str.split(":", n=1, expand=True)

x = new[1]  # text after the first colon (model input)
y = new[0]  # text before the first colon (class label)

# Hold-out split; stratify=y keeps the label proportions equal in both parts and
# random_state pins the shuffle so the split is reproducible.
train_x, valid_x, train_y, valid_y = train_test_split(x, y, random_state=42, stratify=y)

In [None]:
# Word-level TF-IDF features.
# The vectorizer is fitted on the training split only, so its vocabulary and IDF
# weights never see the validation questions (fitting on the full corpus `x`
# leaks validation statistics into the features).
# NOTE(review): Python's \w does not match combining vowel signs, so Telugu
# words may be split at those marks — confirm this token_pattern is intended.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
word_xtrain = tfidf_vect.fit_transform(train_x)
word_xvalid = tfidf_vect.transform(valid_x)


# Map the class labels to integer ids. The encoder is fitted on the full label
# column `y`; `le` is kept around so predicted ids can be mapped back to labels.
# train_y / valid_y are overwritten with their encoded form, so this cell is
# meant to run once per train/validation split.
le = LabelEncoder().fit(y)
train_y, valid_y = le.transform(train_y), le.transform(valid_y)

print(train_y)

[3 6 6 ... 3 0 0]


In [None]:
# Word n-gram (bigram + trigram) TF-IDF features.
# Fitted on the training split only so that n-gram vocabulary and IDF weights
# are not influenced by validation questions (avoids train/validation leakage).
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2, 3))
ngram_xtrain = tfidf_vect_ngram.fit_transform(train_x)
ngram_xvalid = tfidf_vect_ngram.transform(valid_x)

In [None]:
# Character n-gram (2- and 3-character) TF-IDF features.
# Fitted on the training split only; the validation split is transformed with
# the statistics learned from training data (avoids train/validation leakage).
tfidf_vect_char = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
char_xtrain = tfidf_vect_char.fit_transform(train_x)
char_xvalid = tfidf_vect_char.transform(valid_x)


In [None]:
# Raw term-count (bag-of-words) features.
# The vocabulary is learned from the training split only, so validation-only
# terms do not become features (avoids train/validation leakage).
count_vect = CountVectorizer()
count_xtrain = count_vect.fit_transform(train_x)
count_xvalid = count_vect.transform(valid_x)

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, valid_label=None):
    """Fit a classifier and score it on a validation set.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``
    feature_vector_train : training feature matrix passed to ``fit``
    label : training targets passed to ``fit``
    feature_vector_valid : validation feature matrix passed to ``predict``
    valid_label : validation targets to score against; when omitted, the
        notebook-level ``valid_y`` is used, which keeps existing four-argument
        calls working unchanged.

    Returns
    -------
    tuple
        ``(fitted_classifier, accuracy)`` where ``accuracy`` is the result of
        ``metrics.accuracy_score`` on the validation predictions.
    """
    if valid_label is None:
        # Backward-compatible fallback to the global defined by the split cell.
        valid_label = valid_y

    # fit the training dataset on the classifier
    classifier = classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    # accuracy_score is documented as (y_true, y_pred); pass them in that order.
    return classifier, metrics.accuracy_score(valid_label, predictions)


In [None]:
# Word-level TF-IDF: fit each classical model and report validation accuracy.
# random_state is pinned on the randomised estimators (random forest, decision
# tree, MLP) so that re-running the cell reproduces the same numbers.

# Naive Bayes on word-level TF-IDF vectors
NB_word, accuracy = train_model(naive_bayes.MultinomialNB(), word_xtrain, train_y, word_xvalid)
print("NB, WordLevel TF-IDF: ", accuracy)

# Logistic regression on word-level TF-IDF vectors
LR_word, accuracy = train_model(linear_model.LogisticRegression(), word_xtrain, train_y, word_xvalid)
print("LR, WordLevel TF-IDF: ", accuracy)

# SVM on word-level TF-IDF vectors
SVM_word, accuracy = train_model(svm.SVC(), word_xtrain, train_y, word_xvalid)
print("SVM,  WordLevel TF-IDF: ", accuracy)

# Random forest on word-level TF-IDF vectors
RF_word, accuracy = train_model(ensemble.RandomForestClassifier(random_state=42), word_xtrain, train_y, word_xvalid)
print("RF, WordLevel TF-IDF: ", accuracy)

# Decision tree on word-level TF-IDF vectors
DT_word, accuracy = train_model(DecisionTreeClassifier(random_state=42), word_xtrain, train_y, word_xvalid)
print("DT, WordLevel TF-IDF: ", accuracy)

# MLP with a single hidden layer of 50 units; (50,) is the explicit tuple form
MLP_word, accuracy = train_model(
    MLPClassifier(hidden_layer_sizes=(50,), activation='relu', random_state=1),
    word_xtrain, train_y, word_xvalid)
print("mlp, WordLevel TF-IDF: ", accuracy)

NB, WordLevel TF-IDF:  0.7313432835820896
LR, WordLevel TF-IDF:  0.817910447761194
SVM,  WordLevel TF-IDF:  0.8
RF, WordLevel TF-IDF:  0.808955223880597
DT, WordLevel TF-IDF:  0.7134328358208956
mlp, WordLevel TF-IDF:  0.808955223880597




In [None]:
# Word n-gram TF-IDF: fit each classical model and report validation accuracy.
# random_state is pinned on the randomised estimators (random forest, decision
# tree, MLP) so that re-running the cell reproduces the same numbers.

# Naive Bayes
NB_ngram, accuracy = train_model(naive_bayes.MultinomialNB(), ngram_xtrain, train_y, ngram_xvalid)
print("NB, ngramLevel TF-IDF: ", accuracy)

# Logistic regression
LR_ngram, accuracy = train_model(linear_model.LogisticRegression(), ngram_xtrain, train_y, ngram_xvalid)
print("LR, ngramLevel TF-IDF: ", accuracy)

# SVM
SVM_ngram, accuracy = train_model(svm.SVC(), ngram_xtrain, train_y, ngram_xvalid)
print("SVM,  ngramLevel TF-IDF: ", accuracy)

# Random forest
RF_ngram, accuracy = train_model(ensemble.RandomForestClassifier(random_state=42), ngram_xtrain, train_y, ngram_xvalid)
print("RF, ngramLevel TF-IDF: ", accuracy)

# Decision tree
DT_ngram, accuracy = train_model(DecisionTreeClassifier(random_state=42), ngram_xtrain, train_y, ngram_xvalid)
print("DT, ngramLevel TF-IDF: ", accuracy)

# MLP with a single hidden layer of 50 units; (50,) is the explicit tuple form
MLP_ngram, accuracy = train_model(
    MLPClassifier(hidden_layer_sizes=(50,), activation='relu', random_state=1),
    ngram_xtrain, train_y, ngram_xvalid)
print("mlp, ngramLevel TF-IDF: ", accuracy)

NB, ngramLevel TF-IDF:  0.7343283582089553
LR, ngramLevel TF-IDF:  0.7134328358208956
SVM,  ngramLevel TF-IDF:  0.7223880597014926
RF, ngramLevel TF-IDF:  0.764179104477612
DT, ngramLevel TF-IDF:  0.7134328358208956
mlp, ngramLevel TF-IDF:  0.7164179104477612


In [None]:
# Character n-gram TF-IDF: fit each classical model and report validation accuracy.
# random_state is pinned on the randomised estimators (random forest, decision
# tree, MLP) so that re-running the cell reproduces the same numbers.

# Naive Bayes
NB_char, accuracy = train_model(naive_bayes.MultinomialNB(), char_xtrain, train_y, char_xvalid)
print("NB, characterLevel TF-IDF: ", accuracy)

# Logistic regression
LR_char, accuracy = train_model(linear_model.LogisticRegression(), char_xtrain, train_y, char_xvalid)
print("LR,character TF-IDF: ", accuracy)

# SVM
SVM_char, accuracy = train_model(svm.SVC(), char_xtrain, train_y, char_xvalid)
print("SVM, characterTF-IDF: ", accuracy)

# Random forest
RF_char, accuracy = train_model(ensemble.RandomForestClassifier(random_state=42), char_xtrain, train_y, char_xvalid)
print("RF, characterTF-IDF: ", accuracy)

# Decision tree
DT_char, accuracy = train_model(DecisionTreeClassifier(random_state=42), char_xtrain, train_y, char_xvalid)
print("DT,characterTF-IDF: ", accuracy)

# MLP with a single hidden layer of 50 units; (50,) is the explicit tuple form
MLP_char, accuracy = train_model(
    MLPClassifier(hidden_layer_sizes=(50,), activation='relu', random_state=1),
    char_xtrain, train_y, char_xvalid)
print("mlp, CharLevel TF-IDF: ", accuracy)

NB, characterLevel TF-IDF:  0.7731343283582089
LR,character TF-IDF:  0.8238805970149253
SVM, characterTF-IDF:  0.8388059701492537
RF, characterTF-IDF:  0.844776119402985
DT,characterTF-IDF:  0.7522388059701492
mlp, CharLevel TF-IDF:  0.8686567164179104


In [None]:
 #Count Vectorizer
 
 #Naive Bayes 
# Raw term counts: fit each classical model and report validation accuracy.
# random_state is pinned on the randomised estimators (random forest, decision
# tree, MLP) so that re-running the cell reproduces the same numbers.
NB_count, accuracy = train_model(naive_bayes.MultinomialNB(), count_xtrain, train_y, count_xvalid)
print("NB, countvec: ", accuracy)

# Logistic regression
LR_count, accuracy = train_model(linear_model.LogisticRegression(), count_xtrain, train_y, count_xvalid)
print("LR,countvec: ", accuracy)

# SVM
SVM_count, accuracy = train_model(svm.SVC(), count_xtrain, train_y, count_xvalid)
print("SVM, countvec: ", accuracy)

# Random forest
RF_count, accuracy = train_model(ensemble.RandomForestClassifier(random_state=42), count_xtrain, train_y, count_xvalid)
print("RF,countvec: ", accuracy)

# Decision tree
DT_count, accuracy = train_model(DecisionTreeClassifier(random_state=42), count_xtrain, train_y, count_xvalid)
print("DT, countvec: ", accuracy)

# MLP with a single hidden layer of 50 units; (50,) is the explicit tuple form
MLP_count, accuracy = train_model(
    MLPClassifier(hidden_layer_sizes=(50,), activation='relu', random_state=1),
    count_xtrain, train_y, count_xvalid)
print("mlp, Countvec: ", accuracy)

NB, countvec:  0.7671641791044777
LR,countvec:  0.7880597014925373
SVM, countvec:  0.7492537313432835
RF,countvec:  0.7791044776119403
DT, countvec:  0.7164179104477612
mlp, Countvec:  0.746268656716418




In [None]:
# One-hot targets for the Keras models (8 columns, matching the 8-unit softmax
# output layer built in create_model).
encoder = preprocessing.LabelEncoder()

# Learn the id mapping from the training labels, then one-hot the whole array in
# a single call (equivalent to encoding element by element and stacking).
trainLabels = np_utils.to_categorical(encoder.fit_transform(train_y), num_classes=8)

# Reuse the mapping learned on the training labels. Re-fitting on valid_y would
# silently renumber the classes if the validation split lacked one of them;
# transform() raises instead when it meets a label it has not seen.
validLabels = np_utils.to_categorical(encoder.transform(valid_y), num_classes=8)





In [None]:
# Using FastText pre trained telugu embeddings
import numpy as np
from keras.preprocessing import text, sequence

# token -> 300-dim float32 vector
embeddings_index = {}

# The context manager guarantees the (large) vector file is closed after reading.
with open('/content/drive/MyDrive/cc.te.300.vec', encoding="utf-8") as vec_file:
    for line in vec_file:
        values = line.split()
        # Keep only well-formed records: one token followed by 300 components.
        # This drops the "<vocab_size> <dim>" header line of the .vec text format
        # and any malformed row, which would otherwise yield a wrong-length
        # vector that cannot be copied into the 300-wide embedding matrix.
        if len(values) != 300 + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')


In [None]:
# Build the word -> integer-id vocabulary from the full question column.
token = text.Tokenizer()
token.fit_on_texts(x)
word_index = token.word_index

# Encode each split as integer sequences, padded/truncated to 32 tokens.
train_seq_x = sequence.pad_sequences(
    token.texts_to_sequences(train_x), maxlen=32)
valid_seq_x = sequence.pad_sequences(
    token.texts_to_sequences(valid_x), maxlen=32)

# One 300-dim row per vocabulary id, plus one extra row for index 0.
embedding_matrix = np.zeros((len(word_index) + 1, 300))

# Copy in the pretrained vector for every vocabulary word that has one;
# words missing from the embedding file keep their all-zero row.
for term, row in word_index.items():
    pretrained = embeddings_index.get(term)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained



In [None]:
def create_model(model):
    """Build and compile one of four sequence classifiers over frozen embeddings.

    All variants share the same input (32 token ids), a non-trainable embedding
    layer initialised from ``embedding_matrix``, spatial dropout, and the same
    dense head ending in an 8-way softmax. They differ only in the encoder.

    Parameters
    ----------
    model : int
        Encoder selector: 1 = 1-D convolution + global max pooling,
        2 = LSTM, 3 = GRU, 4 = bidirectional GRU.

    Returns
    -------
    The compiled Keras model (Adam optimizer, categorical cross-entropy loss,
    accuracy metric).

    Raises
    ------
    ValueError
        If ``model`` is not one of 1, 2, 3, 4.
    """
    # Validate up front: without this an unknown id only fails later with an
    # unhelpful "local variable 'layer' referenced before assignment".
    if model not in (1, 2, 3, 4):
        raise ValueError(
            f"model must be 1 (CNN), 2 (LSTM), 3 (GRU) or 4 (bidirectional GRU); got {model!r}"
        )

    # Add an Input Layer
    input_layer = layers.Input((32, ))

    # Add the word embedding Layer (pretrained weights, kept frozen)
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.25)(embedding_layer)

    # Encoder: reduces the embedded sequence to a single feature vector
    if model == 1:
        # Convolution over the sequence followed by global max pooling
        conv_layer = layers.Convolution1D(256, 3, activation="tanh")(embedding_layer)
        layer = layers.GlobalMaxPool1D()(conv_layer)
    elif model == 2:
        layer = layers.LSTM(100)(embedding_layer)
    elif model == 3:
        layer = layers.GRU(100)(embedding_layer)
    else:  # model == 4
        layer = layers.Bidirectional(layers.GRU(100))(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(128, activation="tanh")(layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(8, activation="softmax")(output_layer1)

    # Compile the model (separate name so the `model` selector is not shadowed)
    network = models.Model(inputs=input_layer, outputs=output_layer2)
    network.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

    return network


In [None]:
# Build the four network variants in selector order:
# 1 -> CNN, 2 -> LSTM, 3 -> GRU, 4 -> bidirectional GRU.
cnn, lstm, gru, birnn = (create_model(model=variant) for variant in (1, 2, 3, 4))

In [None]:
# Train the CNN variant for 10 epochs on the padded training sequences.
cnn.fit(train_seq_x, trainLabels, epochs=10)

# Reduce the per-class scores and the one-hot targets to class ids.
predictions = cnn.predict(valid_seq_x)
predictions1 = np.argmax(predictions, axis=-1)
# validLabels1 is also read by the LSTM / GRU / BiRNN evaluation cells.
validLabels1 = np.argmax(validLabels, axis=-1)

# Fraction of validation questions whose predicted id equals the target id.
acc = metrics.accuracy_score(predictions1, validLabels1)
print('cnn ', acc)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
cnn  0.8417910447761194


In [None]:
# Train the LSTM variant for 10 epochs on the padded training sequences.
lstm.fit(train_seq_x, trainLabels, epochs=10)

# Highest-scoring class per validation question.
predictions = lstm.predict(valid_seq_x)
predictions1 = np.argmax(predictions, axis=-1)

# Scored against validLabels1, which the CNN evaluation cell computes.
acc = metrics.accuracy_score(predictions1, validLabels1)
print('LSTM ', acc)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
LSTM  0.8388059701492537


In [None]:
# Train the GRU variant for 10 epochs on the padded training sequences.
gru.fit(train_seq_x, trainLabels, epochs=10)

# Highest-scoring class per validation question.
predictions = gru.predict(valid_seq_x)
predictions1 = np.argmax(predictions, axis=-1)

# Scored against validLabels1, which the CNN evaluation cell computes.
acc = metrics.accuracy_score(predictions1, validLabels1)
print('GRU ', acc)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
GRU  0.8477611940298507


In [None]:
# Train the bidirectional GRU variant for 10 epochs on the padded training sequences.
birnn.fit(train_seq_x, trainLabels, epochs=10)

# Highest-scoring class per validation question.
predictions = birnn.predict(valid_seq_x)
predictions1 = np.argmax(predictions, axis=-1)

# Scored against validLabels1, which the CNN evaluation cell computes.
acc = metrics.accuracy_score(predictions1, validLabels1)
print('BI RNN ', acc)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
BI RNN  0.8417910447761194


In [None]:
# A single hand-written question, classified by each trained network.
input1 = ['“జై జవాన్ జై కిసాన్” నినాదాన్ని ఎవరు రూపొందించారు?']

# Same encoding as the training data: tokenizer ids padded to 32 positions.
valid_seq1 = sequence.pad_sequences(token.texts_to_sequences(input1), maxlen=32)

# For each network: take the highest-scoring class id and map it back to its
# label name with the label encoder `le`.
for caption, network in (('CNN ', cnn), ('LSTM ', lstm), ('GRU ', gru), ('BI RNN ', birnn)):
    predict = network.predict(valid_seq1).argmax(axis=-1)
    print(caption, le.inverse_transform(predict))

CNN  ['PERS']
LSTM  ['PERS']
GRU  ['PERS']
BI RNN  ['PERS']


In [None]:
# Classify the single question in `input1` with the classical models.
# Each entry pairs a fitted vectorizer with the (caption, model) pairs trained on
# that feature space; a leading "\n" in a caption separates the printed groups.
single_question_runs = (
    (tfidf_vect, (
        ("NB Word level", NB_word),
        ("LR Word level", LR_word),
        ("DT Word level", DT_word),
        ("RF Word level", RF_word),
        ("MLP Word level", MLP_word),
    )),
    (tfidf_vect_ngram, (
        ("\nNB Ngram level", NB_ngram),
        ("LR Ngram level", LR_ngram),
        ("DT Ngram level", DT_ngram),
        ("RF Ngram level", RF_ngram),
        ("MLP Ngram level", MLP_ngram),
    )),
    (tfidf_vect_char, (
        ("\nNB Char level", NB_char),
        ("LR Char level", LR_char),
        ("DT Char level", DT_char),
        ("RF Char level", RF_char),
        ("MLP Char level", MLP_char),
    )),
    (count_vect, (
        ("\nNB CountVector", NB_count),
        ("LR Count vector", LR_count),
        ("DT Count vector", DT_count),
        ("RF Count vector", RF_count),
        ("MLP Count vector", MLP_count),
    )),
)

for vectorizer, captioned_models in single_question_runs:
    # Featurise the question with the vectorizer these models were trained on.
    word_xvalid1 = vectorizer.transform(input1)
    for caption, fitted_model in captioned_models:
        # Predicted class id -> label name via the label encoder `le`.
        print(caption, le.inverse_transform(fitted_model.predict(word_xvalid1)))

NB Word level ['PERS']
LR Word level ['PERS']
DT Word level ['PERS']
RF Word level ['PERS']
MLP Word level ['PERS']

NB Ngram level ['PERS']
LR Ngram level ['PERS']
DT Ngram level ['PERS']
RF Ngram level ['PERS']
MLP Ngram level ['PERS']

NB Char level ['PERS']
LR Char level ['PERS']
DT Char level ['PERS']
RF Char level ['PERS']
MLP Char level ['PERS']

NB CountVector ['PERS']
LR Count vector ['PERS']
DT Count vector ['PERS']
RF Count vector ['PERS']
MLP Count vector ['PERS']


In [None]:
# Record the installed package versions of this kernel's environment in req.txt.
# The explicit %pip magic does not depend on IPython's automagic setting or on
# the name `pip` being unshadowed in the notebook namespace.
%pip freeze > req.txt