In [None]:
import nltk
# Download the NLTK data packages this script asks for.
# 'stopwords': the word list read through nltk.corpus.stopwords.words('english')
# in ChatBot.__init__.
nltk.download('stopwords')
# 'punkt': presumably the tokenizer data behind nltk.sent_tokenize /
# nltk.word_tokenize, both called further down -- confirm for your NLTK version.
nltk.download('punkt')
# 'wordnet': presumably the data behind nltk.stem.WordNetLemmatizer -- confirm.
nltk.download('wordnet')
# 'omw-1.4': NOTE(review): not referenced directly anywhere in this file;
# presumably a companion resource of WordNet -- confirm it is still required.
nltk.download('omw-1.4')
# To scrape Wikipedia
from bs4 import BeautifulSoup
# To access contents from URLs
import requests
# to preprocess text
import nltk
# to handle punctuations
from string import punctuation
# TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# cosine similarity score
from sklearn.metrics.pairwise import cosine_similarity 
# to do array operations
import numpy as np
# to have sleep option
from time import sleep 

In [None]:
class ChatBot():
    """Console chatbot that answers questions from a single Wikipedia article.

    Workflow:
      1. ``__init__`` prepares the text-processing tools and prints a greeting.
      2. ``chat`` loops: ``receive_input`` reads one line from the user; the
         first free-text line is treated as the article topic and passed to
         ``scrape_wiki``; later lines are treated as queries and answered by
         ``respond`` with the most similar article sentence (TF-IDF + cosine
         similarity).
      3. ``"more"`` prints the whole paragraph of the last answer;
         ``"bye"``/``"quit"``/``"exit"`` end the chat.

    NOTE(review): the greeting advertises a ``"jump"`` command, but
    ``receive_input`` has no branch for it, so it is handled like any other
    query. Its intended meaning should be confirmed before implementing it.
    """

    # Seconds to wait for Wikipedia before giving up on a request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up conversation state and preprocessing tools, then greet."""
        # flag whether to end chat
        self.end_chat = False
        # flag whether a topic has been scraped successfully
        self.got_topic = False
        # flag telling chat() to skip respond() for the current turn
        # (the turn was already answered, or it only selected the topic)
        self.do_not_respond = True

        # heading of the scraped article
        self.title = None
        # scraped article text, one string per paragraph
        self.text_data = []
        # the same text split into sentences
        self.sentences = []
        # para_indices[i] is the index in text_data of the paragraph
        # that sentences[i] came from
        self.para_indices = []
        # index (into sentences) of the last sentence given as an answer
        self.current_sent_idx = None

        # translation table that deletes every punctuation character
        self.punctuation_dict = str.maketrans('', '', punctuation)
        # wordnet lemmatizer for preprocessing text
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        # stopwords kept as a set: preprocess() tests membership per token
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        # start the conversation on screen
        self.greeting()

    def greeting(self):
        """Print the banner, the usage hints and the opening prompt.

        Called from ``__init__``. The ``sleep`` calls only pace the output so
        the user has time to read it.
        """
        print("ChatBot ...")
        sleep(2)
        # how to end the chat
        print('Shkruaj "bye" ose "quit" ose "exit" per te mbyllur biseden')
        sleep(2)
        # what the bot does
        print('\nShkruani temen e kerkuar.')
        sleep(3)
        print('ChatBot do pergjigjet. \
        \nNese shkruani "more",do jape me teper detaje \
        \nNese shkruani "jump",do te kaloje ne info tjeter')
        sleep(3)
        print('-'*50)
        # greet and ask for the topic
        greet = "Pershendetje,me thoni temen e interesit. "
        print("ChatBot >>  " + greet)

    def chat(self):
        """Run the conversation loop until the user ends the chat."""
        while not self.end_chat:
            self.receive_input()
            if self.end_chat:
                print('ChatBot >>  Shihemi se shpejti! Bye!')
                sleep(2)
                print('\nDuke u mbyllur ChatBot ...')
            elif self.got_topic:
                # answer only if receive_input() did not already do so
                if not self.do_not_respond:
                    self.respond()
                # re-arm answering for the next turn
                self.do_not_respond = False

    def receive_input(self):
        """Read one line from the user and decide how to handle it.

        * ``bye``/``quit``/``exit`` (or end of input) set ``end_chat``.
        * ``more`` prints the paragraph of the last answer, or a prompt when
          nothing has been answered yet, and sets ``do_not_respond``.
        * Before a topic is chosen, the line is passed to ``scrape_wiki``.
        * Otherwise the line is appended to ``self.sentences`` as the pending
          query that ``respond`` will answer and remove.
        """
        try:
            text = input("User    >> ")
        except EOFError:
            # input stream closed: finish the chat instead of crashing
            self.end_chat = True
            return

        command = text.lower().strip()
        if command in ('bye', 'quit', 'exit'):
            self.end_chat = True
        elif command == 'more':
            # answered right here, chat() must not call respond()
            self.do_not_respond = True
            if self.current_sent_idx is not None:
                response = self.text_data[self.para_indices[self.current_sent_idx]]
            else:
                # nothing answered yet, ask the user to type something
                response = "Shkruaj input-in tuaj!"
            print("ChatBot >> " + response)
        elif not self.got_topic:
            self.scrape_wiki(text)
        else:
            # the query is vectorized together with the article sentences
            self.sentences.append(text)

    def respond(self):
        """Answer the pending query with the most similar article sentence.

        Expects the query to be the last element of ``self.sentences`` (put
        there by ``receive_input``). The query is always removed again before
        returning, even if vectorizing fails. ``current_sent_idx`` is updated
        only when a sentence with a non-zero similarity is found, so ``more``
        keeps pointing at the last real answer.
        """
        no_match = "ChatBot >>  Shkruaj!"
        try:
            # nothing to compare the query against
            if len(self.sentences) < 2:
                print(no_match)
                return

            # token_pattern=None: a custom tokenizer is supplied, so the
            # default pattern would be unused and only trigger a warning
            vectorizer = TfidfVectorizer(tokenizer=self.preprocess,
                                         token_pattern=None)
            try:
                tfidf = vectorizer.fit_transform(self.sentences)
            except ValueError:
                # raised when no document yields any token at all
                print(no_match)
                return

            # compare the query (last row) with the article sentences only,
            # so the query can never be selected as its own best match
            scores = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
            best = int(scores.argmax())
            if scores[best] > 0:
                self.current_sent_idx = best
                print("ChatBot >> " + self.sentences[best])
            else:
                print(no_match)
        finally:
            # remove the user query from the sentence list
            if self.sentences:
                del self.sentences[-1]

    def scrape_wiki(self, topic):
        """Download the English Wikipedia article for ``topic`` and index it.

        On success ``text_data``, ``sentences``, ``para_indices`` and
        ``title`` are replaced with the article's content, ``got_topic`` is
        set and a confirmation is printed. On any failure (empty topic,
        network/HTTP error, page without readable text) an error message is
        printed and the bot's state is left untouched.

        NOTE(review): confirm whether Wikipedia requires a custom
        ``User-Agent`` header for this kind of client.
        """
        from urllib.parse import quote

        # normalise the topic the way the URL is built: first letter
        # upper-case, the rest lower-case, words joined by underscores
        words = topic.lower().strip().capitalize().split()
        try:
            if not words:
                raise ValueError('empty topic')
            link = 'https://en.wikipedia.org/wiki/' + quote('_'.join(words))
            reply = requests.get(link, timeout=self.REQUEST_TIMEOUT)
            # a missing article comes back as an HTTP error page
            reply.raise_for_status()
            soup = BeautifulSoup(reply.content, 'html.parser')

            heading = soup.find('h1')
            if heading is None:
                raise ValueError('article heading not found')
            title = ' '.join(heading.get_text().split())

            paragraphs = self._extract_paragraphs(soup)
            sentences = []
            para_indices = []
            for idx, para in enumerate(paragraphs):
                para_sentences = nltk.sent_tokenize(para)
                sentences.extend(para_sentences)
                # remember the source paragraph of each sentence for "more"
                para_indices.extend([idx] * len(para_sentences))
            if not sentences:
                raise ValueError('no readable text found')
        # top-level boundary: report whatever went wrong to the user
        except Exception as e:
            print('ChatBot >>  Error: {}. \
            Vendosni nje teme tjeter!'.format(e))
        else:
            # commit only after everything succeeded
            self.text_data = paragraphs
            self.sentences = sentences
            self.para_indices = para_indices
            self.title = title
            self.got_topic = True
            # announce that the chatbot is ready
            print('ChatBot >>  Tema ne "Wikipedia: {}". Let\'s chat!'.format(self.title))

    @staticmethod
    def _extract_paragraphs(soup):
        """Return the non-empty text of every <p>, then every <dd>, in soup.

        <sup> elements (reference markers) are removed from those tags first;
        runs of whitespace are collapsed to single spaces.
        """
        paragraphs = []
        for tag in list(soup.find_all('p')) + list(soup.find_all('dd')):
            for sup in tag.find_all('sup'):
                sup.decompose()
            text = ' '.join(tag.get_text().split())
            if text:
                paragraphs.append(text)
        return paragraphs

    def preprocess(self, text):
        """Tokenizer handed to TfidfVectorizer.

        Lower-cases ``text``, deletes punctuation, splits it into words,
        drops English stopwords and returns the lemmatized remaining words.
        """
        cleaned = text.lower().strip().translate(self.punctuation_dict)
        return [
            self.lemmatizer.lemmatize(word)
            for word in nltk.word_tokenize(cleaned)
            if word not in self.stopwords
        ]
		

# Happy Chatting!
# Initialize ChatBot and start chatting.
		
if __name__ == '__main__':
    # Creating the bot prints the greeting (done inside ChatBot.__init__).
    wiki = ChatBot()
    # Hand control to the interactive loop until the user quits.
    wiki.chat()
# end of script

ChatBot ...
Shkruaj "bye" ose "quit" ose "exit" per te mbyllur biseden

Shkruani temen e kerkuar.
ChatBot do pergjigjet.         
Nese shkruani "more",do jape me teper detaje         
Nese shkruani "jump",do te kaloje ne info tjeter
--------------------------------------------------
ChatBot >>  Pershendetje,me thoni temen e interesit. 
User    >> Albania
ChatBot >>  Tema ne "Wikipedia: Albania". Let's chat!
User    >> Climate
ChatBot >> Climate change is predicted to have serious effects on the living conditions in Albania.
User    >> more
ChatBot >> Environmental issues in Albania include air and water pollution , climate change , waste management , biodiversity loss and nature conservation . Climate change is predicted to have serious effects on the living conditions in Albania. The country is recognised as vulnerable to climate change impacts, ranked 80 among 181 countries in the Notre Dame Global Adaptation Index of 2019. Factors that account for the country's vulnerability to clim