<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/NLP-Projects/blob/main/Simple-TFIDF-ChatBot/Ruled_Based_ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Import Libs

In [1]:
import os
import yaml
import pandas as pd
import numpy as np

# 2. Download Dataset

In [None]:
%%bash
git clone https://github.com/gunthercox/chatterbot-corpus.git

Cloning into 'chatterbot-corpus'...


In [None]:
# Load dataset
PATH = r"/content/chatterbot-corpus/chatterbot_corpus/data/english"
files = [f for f in os.listdir(PATH) if f.endswith('.yml')]

## 2.1 Joining Data

In [None]:
conversations = []
labels = []

for file in files:
    with open(os.path.join(PATH, file), "r", encoding='utf-8') as f:
        data = yaml.safe_load(f)
        if 'conversations' in data:
            for conv in data['conversations']:
                if isinstance(conv, list) and len(conv) >= 2:
                    conversations.append(conv[0])
                    labels.append(conv[1])

# 3. Preprocessing

In [None]:
%%bash
# pip install contractions

In [None]:
import os
import re
import nltk
import spacy
from nltk.corpus import stopwords
import contractions

In [None]:
%%bash
python -m spacy download en_core_web_lg
python -m spacy download en_core_web_md

In [None]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nlp = spacy.load("en_core_web_md")

stop_words = set(stopwords.words('english'))

def expand_contractions(text):
    return contractions.fix(text)

def preprocess(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if token.is_alpha and token.text not in stop_words])

preprocessed_conversations = [preprocess(text) for text in conversations]
preprocessed_labels = [preprocess(text) for text in labels]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 3.1 Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

Q = vectorizer.fit_transform(preprocessed_conversations)
A = vectorizer.transform(labels)

In [None]:
def get_response(user_input):
    user_input = preprocess(user_input)
    user_input_tfidf = vectorizer.transform([user_input])
    similarities = cosine_similarity(user_input_tfidf, A)
    best_match = similarities.argmax()
    return labels[best_match]

In [None]:
inp = 'Hello'
res = get_response(inp)
print(res)

Hello


In [None]:
inp = 'How are you'
res = get_response(inp)
print(res)

to me that's a great compliment.


In [None]:
inp = 'what is AI'
res = get_response(inp)
print(res)

AI is the field of science which concerns itself with building hardware and software that replicates the functions of the human mind.


In [None]:
inp = 'are you investing in stocks?'
res = get_response(inp)
print(res)

my lawyer said i shouldn't give stock tips online.
