In [1]:
import numpy as np
import nltk
import re
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from sklearn.feature_extraction.text import TfidfVectorizer 
import heapq

In [2]:
with open('python.txt') as f:
    txt = str(f.readlines())

In [3]:
print(type(txt))

<class 'str'>


In [4]:
#class for preprocessing and creating word embedding
class Preprocessing:
    #constructor
    def __init__(self,txt):
        # Tokenization
        nltk.download('punkt')  #punkt is nltk tokenizer 
        # breaking text to sentences
        tokens = nltk.sent_tokenize(txt) 
        self.tokens = tokens
        self.tfidfvectoriser=TfidfVectorizer()

    # Data Cleaning
    # remove extra spaces
    # convert sentences to lower case 
    # remove stopword
    
    def clean_sentence(self, sentence, stopwords=False):
        sentence = sentence.lower().strip()
        sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
        if stopwords:
            sentence = remove_stopwords(sentence)
        return sentence

    # store cleaned sentences to cleaned_sentences
    def get_cleaned_sentences(self,tokens, stopwords=False):
        cleaned_sentences = []
        for line in tokens:
            cleaned = self.clean_sentence(line, stopwords)
            cleaned_sentences.append(cleaned)
        return cleaned_sentences
    
      #do all the cleaning
    def cleanall(self):
        cleaned_sentences = self.get_cleaned_sentences(self.tokens, stopwords=True)
        cleaned_sentences_with_stopwords = self.get_cleaned_sentences(self.tokens, stopwords=False)
        # print(cleaned_sentences)
        # print(cleaned_sentences_with_stopwords)
        return [cleaned_sentences,cleaned_sentences_with_stopwords]

    # TF-IDF Vectorizer
    def TFIDF(self,cleaned_sentences):
        self.tfidfvectoriser.fit(cleaned_sentences)
        tfidf_vectors=self.tfidfvectoriser.transform(cleaned_sentences)
        return tfidf_vectors

    #tfidf for question
    def TFIDF_Q(self,question_to_be_cleaned):
        tfidf_vectors=self.tfidfvectoriser.transform([question_to_be_cleaned])
        return tfidf_vectors

    # main call function
    def doall(self):
        cleaned_sentences, cleaned_sentences_with_stopwords = self.cleanall()
        tfidf = self.TFIDF(cleaned_sentences)
        return [cleaned_sentences,cleaned_sentences_with_stopwords,tfidf]

In [5]:
#class for answering the question.
class AnswerMe:
    #cosine similarity
    def Cosine(self, question_vector, sentence_vector):
        dot_product = np.dot(question_vector, sentence_vector.T)
        denominator = (np.linalg.norm(question_vector) * np.linalg.norm(sentence_vector))
        return dot_product/denominator
    
    #Euclidean distance
    def Euclidean(self, question_vector, sentence_vector):
        vec1 = question_vector.copy()
        vec2 = sentence_vector.copy()
        if len(vec1)<len(vec2): vec1,vec2 = vec2,vec1
        vec2 = np.resize(vec2,(vec1.shape[0],vec1.shape[1]))
        return np.linalg.norm(vec1-vec2)

    # main call function
    def answer(self, question_vector, sentence_vector, method):
        if method==1: return self.Euclidean(question_vector,sentence_vector)
        else: return self.Cosine(question_vector,sentence_vector)

In [6]:
def RetrieveAnswer(question_embedding, tfidf_vectors,method=1):
    similarity_heap = []
    if method==1: max_similarity = float('inf')
    else: max_similarity = -1
    index_similarity = -1

    for index, embedding in enumerate(tfidf_vectors):  
        find_similarity = AnswerMe()
        similarity = find_similarity.answer((question_embedding).toarray(),(embedding).toarray() , method).mean()
        if method==1:
            heapq.heappush(similarity_heap,(similarity,index))
        else:
            heapq.heappush(similarity_heap,(-similarity,index))
            
    return similarity_heap

In [None]:
from flask import Flask, render_template,request,flash,redirect

app = Flask(__name__, template_folder='templates')


@app.route('/')
def hello():
    return render_template('pythonQ&A.html')

@app.route('/Send', methods=['POST'])
def getdata():
    user_question = request.form['user[title]']
    
    preprocess = Preprocessing(txt)
    cleaned_sentences,cleaned_sentences_with_stopwords,tfidf_vectors = preprocess.doall()

    question = preprocess.clean_sentence(user_question, stopwords=True)
    question_embedding = preprocess.TFIDF_Q(question)

    method = 1
    
    similarity_heap = RetrieveAnswer(question_embedding , tfidf_vectors ,method)
    number_of_sentences_to_print = 1
    while number_of_sentences_to_print>0 and len(similarity_heap)>0:
        x = similarity_heap.pop(0)
        answer = cleaned_sentences_with_stopwords[x[1]]
        answer=answer.capitalize() 
        number_of_sentences_to_print-=1
    
    return render_template('pythonQ&A.html', inp=answer )
if __name__ == '__main__':
    from werkzeug.serving import run_simple
    run_simple('localhost', 9000, app)

 * Running on http://localhost:9000/ (Press CTRL+C to quit)
127.0.0.1 - - [03/Jun/2022 05:23:01] "[37mGET / HTTP/1.1[0m" 200 -
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rithulaa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
127.0.0.1 - - [03/Jun/2022 05:23:09] "[37mPOST /Send HTTP/1.1[0m" 200 -
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rithulaa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
127.0.0.1 - - [03/Jun/2022 05:27:37] "[37mPOST /Send HTTP/1.1[0m" 200 -
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rithulaa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
127.0.0.1 - - [03/Jun/2022 05:29:08] "[37mPOST /Send HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Jun/2022 05:29:38] "[37mGET / HTTP/1.1[0m" 200 -
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rithulaa\AppData\Roaming\nltk_data...
[nltk_data]