In [1]:
import nltk #to process natural language
import numpy as np
import random # to generte random responses to feel less repetitive
import string # to do string manipulations, because sometimes we need to remove punctuations,need to convert it in lowercase etc.

In [2]:
# Read the whole knowledge-base text into memory. The context manager closes
# the file handle as soon as the read finishes (the original left it open for
# the lifetime of the kernel). errors='ignore' drops undecodable bytes.
with open('Machine learning.txt', 'r', errors='ignore') as f:
    raw = f.read()

In [3]:
# Let NLTK look for already-downloaded resources in a local data directory.
nltk.data.path.append('/home/nithin/nltk_data')  # Add your nltk_data path

# Resources requested from NLTK: the Punkt sentence-tokenizer models
# ('punkt_tab' and 'punkt') and the WordNet database used for lemmatization.
for resource in ('punkt_tab', 'punkt', 'wordnet'):
    nltk.download(resource)

# Lowercase the whole corpus so later comparisons are case-insensitive.
raw = raw.lower()

# Split the corpus into sentences (candidate answers) and into words.
sentence_tokens = nltk.sent_tokenize(raw)
word_tokens = nltk.word_tokenize(raw)

# Quick sanity check: show the first five items of each tokenization.
print(sentence_tokens[:5])
print(word_tokens[:5])

['machine  learning is a rapidly evolving field within artificial intelligence (ai) that focuses on the development of algorithms and statistical models that enable computers to learn from and make predictions or decisions based on data.', 'unlike traditional programming where explicit instructions are given for each task, machine learning allows systems to learn patterns and relationships within data, and to improve their performance over time as they are exposed to more data.', 'this capability is grounded in the idea that systems can automatically improve their performance without human intervention through experience.', 'at its core, machine learning is divided into several categories, with the primary ones being supervised learning, unsupervised learning, and reinforcement learning.', 'each of these categories approaches the task of learning from data in a unique way, depending on the nature of the data and the desired outcome.']
['machine', 'learning', 'is', 'a', 'rapidly']


[nltk_data] Error loading punkt_tab: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>
[nltk_data] Error loading punkt: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>
[nltk_data] Error loading wordnet: <urlopen error [Errno -2] Name or
[nltk_data]     service not known>


In [4]:
# Text normalization helpers: punctuation is removed and every word is reduced
# to its base (dictionary) form, i.e. lemmatized.
# Shared WordNet lemmatizer instance; lem_tokens() calls lemmer.lemmatize().
lemmer = nltk.stem.WordNetLemmatizer()

# Translation table for str.translate(): every character in string.punctuation
# is mapped to None, which makes translate() delete it from the text.
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}

def lem_tokens(tokens):
    """Lemmatize a sequence of word tokens.

    Args:
        tokens: iterable of word strings.

    Returns:
        A list with ``lemmer.lemmatize(word)`` for each input word, in the
        same order as the input.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmer.lemmatize(word))
    return lemmas

def lem_normalize(text):
    """Normalize raw text into a list of lemmatized word tokens.

    The text is lowercased, stripped of punctuation via ``remove_punct_dict``,
    split into words with ``nltk.word_tokenize`` and finally passed through
    ``lem_tokens``.

    Args:
        text: the string to normalize.

    Returns:
        Whatever ``lem_tokens`` returns for the tokenized, cleaned text.
    """
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    words = nltk.word_tokenize(without_punct)
    return lem_tokens(words)

# How lem_normalize() works, step by step:
# - str.translate() modifies a string by replacing (or deleting) characters according to a translation table.
# - text.lower() converts all characters in the text to lowercase, ensuring uniformity.
# - translate(remove_punct_dict) removes any punctuation from the text using the remove_punct_dict created earlier.
# - nltk.word_tokenize() splits the cleaned text into individual words (tokens).
# - lem_tokens() is then called to lemmatize each token in the list.

In [5]:
# Lowercase words/phrases that count as a greeting from the user.
GREETING_INPUTS = (
    'hello',
    'hi',
    'greetings',
    'sup',
    "what's up",
    'hey',
)

# Replies the bot picks from at random when it has been greeted.
GREETING_RESPONSES = [
    'hi',
    'hey',
    '*nods*',
    'hi there',
    'hello',
    'I am glad! You are talking to me',
]

def greeting(sentence):
    """Return a random greeting reply if ``sentence`` contains a greeting.

    Matching is case-insensitive and ignores punctuation attached to the
    start or end of a word, so "Hello!" is recognised. Multi-word entries of
    ``GREETING_INPUTS`` (e.g. "what's up") are matched as whole phrases; a
    plain per-word membership test can never match them because a single
    whitespace-split word never contains a space.

    Args:
        sentence: the user's input text.

    Returns:
        A random element of ``GREETING_RESPONSES`` when a greeting is found,
        otherwise ``None``.
    """
    # Lowercase, split on whitespace and trim surrounding punctuation
    # from every word ("hi," -> "hi"; the apostrophe in "what's" is kept).
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]

    # Pad with spaces so each greeting is matched on word boundaries only:
    # "hi" must not match inside "this".
    padded = ' ' + ' '.join(words) + ' '
    for phrase in GREETING_INPUTS:
        if ' ' + phrase + ' ' in padded:
            return random.choice(GREETING_RESPONSES)
    return None

In [6]:
#generates a response to a user's input based on the similarity between the user's query and the chatbot's available data.

# Lowercase words/phrases that count as a greeting from the user.
GREETING_INPUTS = (
    'hello',
    'hi',
    'greetings',
    'sup',
    "what's up",
    'hey',
)

# Replies the bot picks from at random when it has been greeted.
GREETING_RESPONSES = [
    'hi',
    'hey',
    '*nods*',
    'hi there',
    'hello',
    'I am glad! You are talking to me',
]

def greeting(sentence):
    """Return a random greeting reply if ``sentence`` contains a greeting.

    NOTE(review): this notebook defines ``greeting`` in two consecutive
    cells; this later definition is the one in effect once both cells have
    run. Consider keeping a single copy.

    Matching is case-insensitive and ignores punctuation attached to the
    start or end of a word, so "Hello!" is recognised. Multi-word entries of
    ``GREETING_INPUTS`` (e.g. "what's up") are matched as whole phrases; a
    plain per-word membership test can never match them because a single
    whitespace-split word never contains a space.

    Args:
        sentence: the user's input text.

    Returns:
        A random element of ``GREETING_RESPONSES`` when a greeting is found,
        otherwise ``None``.
    """
    # Lowercase, split on whitespace and trim surrounding punctuation
    # from every word ("hi," -> "hi"; the apostrophe in "what's" is kept).
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]

    # Pad with spaces so each greeting is matched on word boundaries only:
    # "hi" must not match inside "this".
    padded = ' ' + ' '.join(words) + ' '
    for phrase in GREETING_INPUTS:
        if ' ' + phrase + ' ' in padded:
            return random.choice(GREETING_RESPONSES)
    return None


from sklearn.feature_extraction.text import TfidfVectorizer #TF-IDF is a statistical measure used to evaluate the importance of a 
#word in a document relative to a collection of documents (corpus). TF-IDF allows the model to focus on the most important 
#words when comparing the user input to the available data.
from sklearn.metrics.pairwise import cosine_similarity #Cosine similarity is widely used in text processing to measure
# the similarity between two documents (or sentences) represented as vectors


def response(user_response):
    """Return the corpus sentence most similar to ``user_response``.

    The sentences in the module-level ``sentence_tokens`` list, plus the
    query itself, are turned into TF-IDF vectors (tokenized with
    ``lem_normalize``, English stop words removed). The query vector is then
    compared with every corpus vector using cosine similarity.

    Differences from the previous implementation:
      * The best match is returned. The old code took index ``[-2]`` of the
        sorted scores even though the query had already been excluded from
        the comparison, so it returned the second-best sentence and raised
        ``IndexError`` when only one corpus sentence was available.
      * ``sentence_tokens`` is no longer modified, so earlier questions do
        not accumulate in the corpus and come back as "answers".
      * Entries equal to the query are left out of the candidates, so the
        bot never echoes the user's own text and the query is always the
        last row of the TF-IDF matrix.

    Args:
        user_response: the user's question as a string.

    Returns:
        The most similar corpus sentence, or a fixed apology string when
        there is nothing to compare against or no sentence shares any term
        with the query (similarity 0).
    """
    fallback = "Sorry, I don't understand you."

    # Candidate answers: every known sentence except copies of the query
    # (a caller may already have appended the query to sentence_tokens).
    candidates = [sentence for sentence in sentence_tokens if sentence != user_response]
    if not candidates:
        return fallback

    # Fit on candidates + query so both share one vocabulary; the query is
    # deliberately placed last so that it is row -1 of the matrix.
    vectorizer = TfidfVectorizer(tokenizer=lem_normalize, stop_words='english')
    tfidf = vectorizer.fit_transform(candidates + [user_response])

    # One similarity score per candidate, in candidate order.
    scores = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
    best = int(scores.argmax())

    # A top score of 0 means the query shares no term with any candidate.
    if scores[best] == 0:
        return fallback
    return candidates[best]



In [23]:
pip install pyttsx3

Note: you may need to restart the kernel to use updated packages.


In [7]:
import pyttsx3
# Create the text-to-speech engine.
engine = pyttsx3.init()

# Voice settings: speech rate of 150 words per minute and 90% volume
# (volume ranges from 0.0 to 1.0).
for prop_name, prop_value in (('rate', 150), ('volume', 0.9)):
    engine.setProperty(prop_name, prop_value)

# Queue the welcome message and block until it has been spoken.
engine.say("Hello, how can I assist you today?")
engine.runAndWait()

In [None]:
# Interactive chat loop.
#
# Changes from the previous version of this cell:
#   * `sentence_tokens` is no longer reset to an empty list here. Doing so
#     discarded the corpus tokenized from the text file, leaving the bot with
#     nothing to answer from.
#   * User input is no longer appended to `sentence_tokens`; the corpus should
#     contain only knowledge-base sentences.
#   * greeting() is called once per turn and its result reused.
#   * Input is stripped so that e.g. "bye " still ends the conversation.
flag = True
print('BOT: My name is Robo, I will answer your questions about Machine Learning. If you want to exit, type Bye')

# Example inputs for trying the bot out manually; not used by the loop itself.
interactions = [
    'hi',
    'what is Machine Learning?',
    'What are the different types of Machine Learning algorithms?',
    'What is the difference between supervised and unsupervised learning',
    'What is the difference between classification and regression?',
    'machine learning algorithms?',
    'sounds awesome',
    'bye',
]

while flag:
    user_response = input("User: ").lower().strip()

    if user_response == 'bye':
        flag = False
        print('BOT: bye!')
    elif user_response in ('thanks', 'thank you'):
        flag = False
        print('BOT: You are welcome...')
    else:
        greeting_reply = greeting(user_response)
        if greeting_reply is not None:
            print('ROBO: {}'.format(greeting_reply))
            # Optional speech output:
            # engine.say(greeting_reply)
            # engine.runAndWait()
        else:
            print('ROBO: ', end='')
            print(response(user_response))
            # Optional speech output:
            # engine.say(response(user_response))
            # engine.runAndWait()
