In [2]:
import re
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
def cvtCase(s):
    return s.lower()

In [4]:
data = pd.read_csv('/Users/charishmaambati/Desktop/Cherry/Me/Elite/Dataset4E.csv',sep=',', converters={'Question': cvtCase})

In [5]:
def topicQuestion(label):
    return data.loc[(data["Topic"] == label),["Question"]]


In [6]:
def process(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    filtered_words = [word for word in tokens]
    return (" ".join(filtered_words))

In [7]:
def preprocessedSampleQuestions(label):
    return [process(item) for item in topicQuestion(label)["Question"]]

In [8]:
def addAllTextSentence(label):
    strData  = " "
    for i in range(len((preprocessedSampleQuestions(label)))):
        strData = strData + (((preprocessedSampleQuestions(label))[i]).lower())+ " "
    return strData

In [9]:
def remTksReSentence(label):
    remRegExp = re.sub(r'[^\w]',' ',addAllTextSentence(label))
    tokens = remRegExp.split()
    return tokens

In [10]:
def preprocessedSampleQuestions(label):
    return [process(item) for item in topicQuestion(label)["Question"]]

In [11]:
labels = list(set(data["Topic"]))

In [12]:
listOfTksReWords = [remTksReSentence(label) for label in labels]

In [13]:
reCommonWords = [process(item) for item in data["Question"] if item not in listOfTksReWords]

In [14]:
reCommonWords

['ram and shyam can complete a work together in 20 days if ram alone complete the work in 36 days find the number of days shyam alone will take to complete the task',
 'ajay is twice efficient as vijay if vijay can complete the work in 48 days find the number days to complete the work if both work together',
 'vicky and arun can complete a piece of work in 15 and 10 days what percentage of the work would have got completed in 3 days',
 'vinay and vicky can complete a piece of work in 30 and 15 days repectively by working alone after how many days 80 of the work would have got completed',
 'if a and b can complete the work in 24 and 40 days find the number of days required by them to complete the work if they work together',
 'ajay is twice efficient as vijay if both work together and complete a work in 10 days find the number of days ajay will take to complete the work',
 'ram and shyam can complete a work together in 9 days if ram alone can complete the work in 12 days find the number

In [15]:
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
cv = CountVectorizer()

In [18]:
def convertWV(para):
    output = []   
    for line in para:
        line = line.split(" ")
        inn = []
        for i in range(len(line)):
            try:
                x = model[line[i].lower()]
            except:
                x = np.zeros(300)
            inn.append(x)
        output.append(inn)
    w2v = [sum(element) for element in output]
    return w2v

In [19]:
epochs = 10
for epoch in range(epochs):
    print("epochs:",epoch + 1)
    questionTrain, questionTest, labelTrain, labelTest =train_test_split(reCommonWords, data["Topic"], test_size = 0.2)
    questionTrain, questionVal, labelTrain, labelVal =train_test_split(questionTrain, labelTrain,test_size = 0.1)
    lR = LogisticRegression()       
    lR = lR.fit (convertWV(questionTrain),labelTrain)
    probability = lR.predict_proba(convertWV(questionTest))
    predicted = lR.predict(convertWV(questionTest))
    print(accuracy_score(labelTest,predicted))
    

epochs: 1
0.9454545454545454
epochs: 2
0.9318181818181818
epochs: 3
0.9340909090909091
epochs: 4
0.9318181818181818
epochs: 5
0.9386363636363636
epochs: 6
0.9409090909090909
epochs: 7
0.925
epochs: 8
0.9113636363636364
epochs: 9
0.9181818181818182
epochs: 10
0.9318181818181818


In [26]:
epochs = 1000
accuracy = []
for epoch in range(epochs):
    print("epochs:",epoch + 1)
    questionTrain, questionTest, labelTrain, labelTest =train_test_split(reCommonWords, data["Topic"], test_size = 0.1)
    questionTrain, questionVal, labelTrain, labelVal =train_test_split(questionTrain, labelTrain,test_size = 0.2)
    questionTrainCv = cv.fit_transform(questionTrain)
    questionTestCv = cv.transform(questionTest)
    rFC = RandomForestClassifier(n_estimators =100, n_jobs = -1 , oob_score = True , min_samples_leaf = 1,max_features = 'log2')       
    rFC = rFC.fit ((questionTrainCv),labelTrain)
    probability = rFC.predict_proba((questionTestCv))
    predicted = rFC.predict((questionTestCv))
    (accuracy.append(accuracy_score(labelTest,predicted)))
    print((accuracy_score(labelTest,predicted)))
print("max is :", max(accuracy))
    

epochs: 1
0.9318181818181818
epochs: 2
0.9136363636363637
epochs: 3
0.9090909090909091
epochs: 4
0.9136363636363637
epochs: 5
0.9590909090909091
epochs: 6
0.9
epochs: 7
0.9227272727272727
epochs: 8
0.9181818181818182
epochs: 9
0.9409090909090909
epochs: 10
0.95
epochs: 11
0.9363636363636364
epochs: 12
0.9318181818181818
epochs: 13
0.9409090909090909
epochs: 14
0.9136363636363637
epochs: 15
0.9181818181818182
epochs: 16
0.9636363636363636
epochs: 17
0.9727272727272728
epochs: 18
0.9363636363636364
epochs: 19
0.9318181818181818
epochs: 20
0.9136363636363637
epochs: 21
0.9681818181818181
epochs: 22
0.9272727272727272
epochs: 23
0.9227272727272727
epochs: 24
0.9636363636363636
epochs: 25
0.9272727272727272
epochs: 26
0.9272727272727272
epochs: 27
0.9590909090909091
epochs: 28
0.9181818181818182
epochs: 29
0.9363636363636364
epochs: 30
0.9454545454545454
epochs: 31
0.9363636363636364
epochs: 32
0.95
epochs: 33
0.9727272727272728
epochs: 34
0.8863636363636364
epochs: 35
0.9045454545454545
ep

0.8818181818181818
epochs: 281
0.9227272727272727
epochs: 282
0.9363636363636364
epochs: 283
0.9636363636363636
epochs: 284
0.9409090909090909
epochs: 285
0.9272727272727272
epochs: 286
0.9318181818181818
epochs: 287
0.9272727272727272
epochs: 288
0.9227272727272727
epochs: 289
0.9454545454545454
epochs: 290
0.9272727272727272
epochs: 291
0.9454545454545454
epochs: 292
0.9227272727272727
epochs: 293
0.9454545454545454
epochs: 294
0.9272727272727272
epochs: 295
0.9272727272727272
epochs: 296
0.9318181818181818
epochs: 297
0.9363636363636364
epochs: 298
0.9363636363636364
epochs: 299
0.9454545454545454
epochs: 300
0.95
epochs: 301
0.9363636363636364
epochs: 302
0.9272727272727272
epochs: 303
0.9545454545454546
epochs: 304
0.9136363636363637
epochs: 305
0.9045454545454545
epochs: 306
0.9409090909090909
epochs: 307
0.9590909090909091
epochs: 308
0.95
epochs: 309
0.9545454545454546
epochs: 310
0.9181818181818182
epochs: 311
0.9363636363636364
epochs: 312
0.9272727272727272
epochs: 313
0.931

0.9136363636363637
epochs: 558
0.9409090909090909
epochs: 559
0.9409090909090909
epochs: 560
0.9227272727272727
epochs: 561
0.9590909090909091
epochs: 562
0.9318181818181818
epochs: 563
0.9227272727272727
epochs: 564
0.9454545454545454
epochs: 565
0.9409090909090909
epochs: 566
0.9454545454545454
epochs: 567
0.9363636363636364
epochs: 568
0.9454545454545454
epochs: 569
0.9045454545454545
epochs: 570
0.9318181818181818
epochs: 571
0.9363636363636364
epochs: 572
0.9272727272727272
epochs: 573
0.9181818181818182
epochs: 574
0.9136363636363637
epochs: 575
0.9363636363636364
epochs: 576
0.9
epochs: 577
0.9545454545454546
epochs: 578
0.9272727272727272
epochs: 579
0.9272727272727272
epochs: 580
0.9409090909090909
epochs: 581
0.9181818181818182
epochs: 582
0.9590909090909091
epochs: 583
0.9545454545454546
epochs: 584
0.9181818181818182
epochs: 585
0.95
epochs: 586
0.8954545454545455
epochs: 587
0.9227272727272727
epochs: 588
0.9363636363636364
epochs: 589
0.9363636363636364
epochs: 590
0.9409

0.9318181818181818
epochs: 833
0.9136363636363637
epochs: 834
0.95
epochs: 835
0.9318181818181818
epochs: 836
0.9090909090909091
epochs: 837
0.9545454545454546
epochs: 838
0.9409090909090909
epochs: 839
0.9272727272727272
epochs: 840
0.9181818181818182
epochs: 841
0.9363636363636364
epochs: 842
0.9272727272727272
epochs: 843
0.9454545454545454
epochs: 844
0.95
epochs: 845
0.9545454545454546
epochs: 846
0.9363636363636364
epochs: 847
0.9454545454545454
epochs: 848
0.9318181818181818
epochs: 849
0.9181818181818182
epochs: 850
0.9454545454545454
epochs: 851
0.95
epochs: 852
0.8909090909090909
epochs: 853
0.95
epochs: 854
0.9363636363636364
epochs: 855
0.9045454545454545
epochs: 856
0.9454545454545454
epochs: 857
0.9227272727272727
epochs: 858
0.9318181818181818
epochs: 859
0.9227272727272727
epochs: 860
0.9454545454545454
epochs: 861
0.9363636363636364
epochs: 862
0.9272727272727272
epochs: 863
0.9590909090909091
epochs: 864
0.9545454545454546
epochs: 865
0.9454545454545454
epochs: 866
0.