In [None]:
pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   -------------------- ------------------- 0.8/1.5 MB 3.1 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 2.5 MB/s eta 0:00:00
Downloading click-8.1.8-py3-none-any.whl (98 kB)
Installing collected packages: click, nltk
Successfully installed click-8.1.8 nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import pickle
# Fetch the NLTK data packages this notebook relies on (tokenizer models,
# the WordNet corpus and the stop-word lists).
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the question/answer pairs from the CSV next to this notebook.
FAQ_CSV_PATH = 'scraped_faq_dataset.csv'
faq_data = pd.read_csv(FAQ_CSV_PATH)

In [None]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
faq_data.shape

(20000, 2)

In [None]:
# Preview the first rows to see what the question/answer columns look like.
faq_data.head()

Unnamed: 0,question,answer
0,How can I start learning about deep learning?,deep learning is essential for enhancing techn...
1,What tools are used in machine learning?,machine learning is crucial for improving effi...
2,What is the purpose of virtual reality?,The purpose of virtual reality is to enhancing...
3,What is the difference between game developmen...,game development is essential for enhancing te...
4,How does data science work?,data science is different from enhancing techn...


In [None]:
# Lazily-filled cache so the stop-word list is read and the lemmatizer is
# built once, not once per row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached lemmatizer, stop-word set and punctuation table."""
    if not _PREPROCESS_RESOURCES:
        # All values are built before update() runs, so a failure (e.g. a
        # missing NLTK corpus) never leaves the cache half-populated.
        _PREPROCESS_RESOURCES.update(
            lemmatizer=WordNetLemmatizer(),
            stop_words=frozenset(stopwords.words('english')),
            punct_table=str.maketrans('', '', string.punctuation),
        )
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Normalise a piece of text for TF-IDF vectorisation.

    Steps: lowercase, strip ASCII punctuation, tokenize with NLTK, drop
    English stop words, lemmatize each remaining token, and re-join the
    tokens with single spaces.

    Args:
        text: The raw text. Non-string values (e.g. NaN from a missing CSV
            cell, or None) are treated as empty text.

    Returns:
        The cleaned, space-separated token string; '' for non-string input.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); treat them as empty text rather
        # than aborting the whole column transformation.
        return ''

    resources = _get_preprocess_resources()
    lemmatizer = resources['lemmatizer']
    stop_words = resources['stop_words']

    text = text.lower().translate(resources['punct_table'])
    tokens = nltk.word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Clean every question and store the result next to the raw text.
processed_questions = faq_data['question'].map(preprocess_text)
faq_data['Processed_Question'] = processed_questions

In [None]:
# Features are the cleaned question strings; labels are the answer strings.
X, y = faq_data['Processed_Question'], faq_data['answer']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Confirm the sizes of the train/test features and labels.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(16000,)
(4000,)
(16000,)
(4000,)


In [None]:
# TF-IDF over 1- to 3-grams, with min_df=2 and at most 10000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=2,
    max_features=10000,
)

# Fit the vocabulary on the training questions only, then reuse it for the test questions.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Inspect the entries of one vectorized training question (row 6) as a spot check.
print(X_train_vectorized[6])

  (0, 1131)	0.3365795896153697
  (0, 666)	0.3365795896153697
  (0, 1132)	0.3365795896153697
  (0, 1353)	0.28091073210974027
  (0, 704)	0.5390879799015267
  (0, 1154)	0.5390879799015267


In [None]:
# Logistic regression over the TF-IDF features; every distinct answer string in
# y_train is a class label. The solver is allowed up to 1000 iterations.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_vectorized, y_train)

In [None]:
# Predict answers for the held-out questions and compute the metrics.
predictions = classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
report = classification_report(y_true=y_test, y_pred=predictions, zero_division=1)

print(f"Model Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Model Accuracy: 0.10
Classification Report:
                                                                                                                        precision    recall  f1-score   support

                                                        Internet of Things helps in enhancing technological solutions.       1.00      0.00      0.00        25
                                      Internet of Things is a field that focuses on enhancing technological solutions.       0.10      0.11      0.10        19
                                         Internet of Things is crucial for improving efficiency in various industries.       0.00      0.00      0.00        18
         Internet of Things is different from enhancing technological solutions because they serve different purposes.       0.07      0.08      0.07        13
                                                Internet of Things is essential for enhancing technological solutions.       0.09      0.43      0.15      

In [None]:
# Persist the fitted vectorizer and the trained model so they can be reloaded later.
for artifact, artifact_path in (
    (vectorizer, 'faq_vectorizer.pkl'),
    (classifier, 'faq_model.pkl'),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)


In [None]:
def get_response(user_input):
    """Return the predicted answer for a user's question.

    The classifier was trained on questions cleaned by ``preprocess_text``,
    so the same cleaning is applied here before vectorizing. Without it the
    inference-time features would not match the ones seen during training.

    Args:
        user_input: The raw question typed by the user.

    Returns:
        The answer label predicted by ``classifier`` for the question.
    """
    user_input_processed = preprocess_text(user_input)
    # transform() expects an iterable of documents, hence the one-element list.
    user_input_vectorized = vectorizer.transform([user_input_processed])
    response = classifier.predict(user_input_vectorized)[0]
    return response

if __name__ == '__main__':
    # Interactive console loop: read a question, print the predicted answer,
    # and stop on 'exit'/'quit', an interrupt, or end of input.
    print("Chatbot is ready! Type your question below:")
    while True:
        try:
            user_input = input("You: ").strip()
        except (KeyboardInterrupt, EOFError):
            # Interrupting the kernel (or a closed stdin) while waiting for
            # input should end the chat cleanly instead of leaving a traceback.
            print("\nChatbot: Goodbye!")
            break
        if not user_input:
            # Nothing was typed; ask again rather than classifying an empty string.
            continue
        if user_input.lower() in ['exit', 'quit']:
            print("Chatbot: Goodbye!")
            break
        response = get_response(user_input)
        print(f"Chatbot: {response}")


Chatbot is ready! Type your question below:
You: web development
Chatbot: You can start learning web development by starting with basic concepts.


KeyboardInterrupt: Interrupted by user