In [None]:
from deeppavlov import configs, build_model

# Load pre-trained sentiment analysis model
model_config = configs.classifiers.rusentiment_bert

# Load the model
model = build_model(model_config, download=True)

# Define some example texts
texts = [
    "I love this movie, it's fantastic!",
    "This book is boring, I didn't enjoy it.",
    "The food at that restaurant was delicious."
]

# Perform sentiment analysis on each text
for text in texts:
    predictions = model([text])
    sentiment = predictions[0]  # Assuming sentiment is the first element of predictions
    print(f"Text: {text}\nSentiment: {sentiment}\n")


In [None]:
from deeppavlov import build_model
from nltk.tokenize import sent_tokenize
import os

# Load the pre-trained DeepPavlov model for sentence embeddings
model = build_model("universal_sentence_encoder", download=True)

# Function to preprocess the textbook data
def preprocess_textbook(textbook_path):
    with open(textbook_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return sent_tokenize(text)

# Function to find the most similar response
def find_most_similar_response(user_input, responses):
    user_embedding = model([user_input])
    response_embeddings = model(responses)
    similarities = [(i, cosine_similarity(user_embedding, resp_emb)) for i, resp_emb in enumerate(response_embeddings)]
    max_similarity_idx = max(similarities, key=lambda x: x[1])[0]
    return responses[max_similarity_idx]

def cosine_similarity(embedding1, embedding2):
    return sum(a * b for a, b in zip(embedding1, embedding2)) / (sum(a ** 2 for a in embedding1) ** 0.5 * sum(b ** 2 for b in embedding2) ** 0.5)

if __name__ == "__main__":
    textbook_path = "corpus_data/movie.txt"  # Path to your textbook file
    if not os.path.exists(textbook_path):
        print("Textbook file not found. Please provide the correct path.")
        exit(1)

    # Preprocess the textbook data
    textbook_responses = preprocess_textbook(textbook_path)

    print("Welcome to the Textbook Chatbot! Type 'quit' to end the conversation.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == "quit":
            print("Goodbye!")
            break
        
        # Find the most similar response
        response = find_most_similar_response(user_input, textbook_responses)
        print("Bot:", response)


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Load pre-trained BERT model for sequence classification
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Define some example texts
texts = [
    "I love this movie, it's fantastic!",
    "This book is boring, I didn't enjoy it.",
    "The food at that restaurant was delicious."
]

# Perform sentiment analysis on each text
for text in texts:
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = logits.argmax().item()
    sentiment = "Positive" if predicted_class == 1 else "Negative"
    print(f"Text: {text}\nSentiment: {sentiment}\n")


In [None]:
from deeppavlov import build_model, configs

# Load a pre-trained retrieval-based model
model_config = configs.seq2seq.seq2seq_encoder_decoder  # This is an example, adjust based on your findings

model = build_model(model_config, download=True)

# Define greetings and small talk responses
greetings = ["hello", "hi", "hey there"]
small_talk = {
    "how are you": ["I'm doing well, thanks for asking! How about you?"],
    "what's up": ["Not much, just hanging out. What are you up to?"],
    "good morning": ["Good morning! ☀️"],
    "good evening": ["Good evening! "],
}

def chat_with_bot(user_input):
  """Handles user input and generates responses."""
  if user_input.lower() in greetings:
    return f"Hi!  Nice to hear from you."
  elif user_input.lower() in small_talk:
    return random.choice(small_talk[user_input.lower()])
  else:
    # Use the model for retrieval-based response generation
    bot_response = model([user_input])[0]
    return bot_response  # Replace with retrieved text for more complex responses

# Main loop for user interaction
while True:
  user_input = input("You: ")
  if user_input.lower() in ['exit', 'quit']:
    print("Goodbye!")
    break
  bot_response = chat_with_bot(user_input)
  print("Bot:", bot_response)


In [None]:

from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")


model = GPT4All(model_name='orca-mini-3b-gguf2-q4_0.gguf')
with model.chat_session():
    response1 = model.generate(prompt='hello', temp=0)
    response2 = model.generate(prompt='write me a short poem', temp=0)
    response3 = model.generate(prompt='thank you', temp=0)
    print(model.current_chat_session)

In [None]:
from gpt4all import GPT4All
model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf")
tokens = []
with model.chat_session():
    for token in model.generate("What is the capital of France?", streaming=True):
        tokens.append(token)
print(tokens)


In [None]:
with model.chat_session():
    resp = model.generate("Write a select Query")
print(resp)    

In [None]:
resp = model.generate("Write a select Query")
print(resp)

In [None]:
import transformers
from transformers import pipeline

# Download and load the GPT-4All model (might take some time)
model_name = "EleutherAI/gpt-j-6B"  # Adjust model name based on your setup
generator = pipeline("text-generation", model=model_name)

def chat():
  """
  Interactive chat loop with the GPT-4All model.
  """
  while True:
    user_input = input("You: ")
    if user_input.lower() == "quit":
      break
    
    # Generate response using GPT-4All
    response = generator(user_input, max_length=100, do_sample=True, top_k=50, top_p=0.9)  # Adjust parameters as needed
    print("Bot:", response[0]['generated_text'])

if __name__ == "__main__":
  print("Welcome to the GPT-4All Chatbot!")
  chat()

In [None]:
from gpt4all import GPT4All

model_name = "orca-mini-3b-gguf2-q4_0.gguf"
model = GPT4All(model_name=model_name)

def chat():
    """
    Interactive chat loop with the GPT4All model.
    """
    print("Welcome to the GPT4All Chatbot!")
    while True:
        user_input = input("You: ")
        if user_input.lower() == "quit":
            print("Goodbye!")
            break
        
        # Generate response using GPT4All
        response = model.generate(prompt=user_input, temp=0)
        print("Bot:", response)

if __name__ == "__main__":
    chat()


In [None]:
from gpt4all import GPT4All
model = GPT4All('wizardlm-13b-v1.2.Q4_0.gguf')
system_template = 'A chat between a curious user and an artificial intelligence assistant.\n'
# many models use triple hash '###' for keywords, Vicunas are simpler:
prompt_template = 'USER: {0}\nASSISTANT: '
with model.chat_session(system_template, prompt_template):
    response1 = model.generate('why is the grass green?')
    print(response1)
    print()
    response2 = model.generate('why is the sky blue?')
    print(response2)

In [None]:
import nltk
from nltk.chat.util import Chat, reflections

# Define pairs of patterns and responses for the chatbot
pairs = [
    [
        r"my name is (.*)",
        ["Hello %1, how can I help you today?",]
    ],
    [
        r"what is your name?",
        ["My name is ChatBot and I'm here to assist you.",]
    ],
    [
        r"how are you ?",
        ["I'm doing well, thank you!", "I'm fine, thanks!",]
    ],
    [
        r"quit",
        ["Bye, take care. See you soon!", "Goodbye! Have a great day!",]
    ],
]

# Create a Chat instance with the defined pairs
chatbot = Chat(pairs, reflections)

def chat():
    """
    Interactive chat loop with the NLTK chatbot.
    """
    print("Welcome! Type 'quit' to end the conversation.")
    while True:
        user_input = input("You: ")
        response = chatbot.respond(user_input)
        print("Bot:", response)
        if user_input.lower() == "quit":
            break

if __name__ == "__main__":
    chat()


In [1]:
import os

# Directory containing the .txt files
directory = 'corpus_data/'

# List all files in the directory
files = os.listdir(directory)

# Filter out only the .txt files
txt_files = [file for file in files if file.endswith('.txt')]

# Loop through each .txt file
for txt_file in txt_files:
    # Construct the full path to the file
    file_path = os.path.join(directory, txt_file)
    
    # Open the file and read its contents
    with open(file_path, 'r') as file:
        data = file.read()
    
    # Process the data as needed
    print(f"Contents of {txt_file}:")
    print(data)
    print()


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
import nltk
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.chat.util import Chat, reflections

# Sample text data (you can replace this with your own text dataset)
text_data = data

# Preprocess the text data
stop_words = set(stopwords.words('english'))
sentences = nltk.sent_tokenize(text_data)
word_tokens = [word_tokenize(sentence.lower()) for sentence in sentences]
filtered_sentences = [[word for word in words if word.isalnum() and word not in stop_words] for words in word_tokens]

# Create a dictionary where each word is a key and its value is a list of sentences containing that word
word_to_sentences = {}
for sentence in filtered_sentences:
    for word in sentence:
        if word not in word_to_sentences:
            word_to_sentences[word] = []
        word_to_sentences[word].append(sentence)

# Define a function to find the most similar sentence based on word overlap
def respond(input_sentence):
    words = [word.lower() for word in word_tokenize(input_sentence) if word.isalnum() and word not in stop_words]
    response_sentences = []
    for word in words:
        if word in word_to_sentences:
            response_sentences.extend(word_to_sentences[word])
    if response_sentences:
        return ' '.join(random.choice(response_sentences))
    else:
        return "I'm sorry, I don't understand."

# Example interaction loop
print("Welcome! Type 'quit' to end the conversation.")
while True:
    user_input = input("You: ")
    if user_input.lower() == "quit":
        print("Goodbye!")
        break
    else:
        response = respond(user_input)
        print("Bot:", response)


In [None]:
import spacy
import random

# Load English tokenizer, tagger, parser, and NER
nlp = spacy.load("en_core_web_sm")

# Read and preprocess text from the textbook
with open("corpus_data/movie.txt", "r") as file:
    text = file.read()

# Train the SpaCy model on the preprocessed text data
# Increase the maximum length to accommodate larger text
nlp.max_length = len(text) + 1000000  # Adding extra buffer
doc = nlp(text)

# Function to generate a response based on user input
def generate_response(user_input):
    user_doc = nlp(user_input)
    # Here you can implement logic to process the user input and generate a response
    # For simplicity, let's just select a random sentence from the textbook
    sentences = [sent.text for sent in doc.sents]
    return random.choice(sentences)

if __name__ == "__main__":
    print("Welcome to the Textbook Chatbot! Type 'quit' to end the conversation.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == "quit":
            print("Goodbye!")
            break
        response = generate_response(user_input)
        print("Bot:", response)


To create a chatbot trained on top of your textbook, you can use various machine learning approaches, including both retrieval-based and generative models. Here are some options:

1. **Retrieval-Based Models**:
    - **TF-IDF Vectorization**: You can represent each document (e.g., sentence or paragraph) in your textbook as a TF-IDF vector. When a user inputs a query, you can calculate the similarity between the query and each document vector to retrieve the most similar document as the response.
    - **Word Embeddings + Cosine Similarity**: Instead of TF-IDF, you can use word embeddings (e.g., Word2Vec, GloVe) to represent documents and queries as dense vectors. Then, calculate the cosine similarity between the query and each document vector to retrieve the most similar document.
    - **Pre-trained Language Models**: Use pre-trained language models like BERT or Universal Sentence Encoder to encode the input query and documents, then calculate similarity scores to retrieve the most relevant document.

2. **Generative Models**:
    - **Seq2Seq Models**: Train a sequence-to-sequence model (e.g., using LSTM or Transformer architecture) on pairs of input-output sequences. Each input can be a user query, and the corresponding output is the response from your textbook.
    - **GPT-Based Models**: Fine-tune a GPT model (e.g., GPT-3, GPT-4) on your textbook data. Although GPT models are primarily used for generating text, you can fine-tune them on a dataset to learn the specific style and content of your textbook.

3. **Hybrid Approaches**:
    - **Retrieval + Generative**: Combine retrieval-based and generative approaches. Use a retrieval-based model to retrieve a set of candidate responses based on the user query, then use a generative model to select or refine the final response from the candidates.
    - **Rule-Based Filtering**: Pre-process the user query to identify specific topics or intents and filter the search space to only relevant sections of your textbook. Then, apply a retrieval or generative model to generate responses from the filtered content.

Each approach has its advantages and limitations, and the choice depends on factors like the size of your dataset, computational resources, desired level of customization, and the specific requirements of your chatbot.


**LSTM (Long Short-Term Memory)**:
LSTM model is a type of recurrent neural network (RNN) architecture designed to model sequential data and overcome the vanishing gradient problem that traditional RNNs face. 

LSTM networks are capable of learning long-term dependencies in data by maintaining a cell state, which can be updated and modified through carefully designed gates. These gates regulate the flow of information, allowing LSTM networks to capture dependencies over long time lags and hence make them effective for tasks like natural language processing, time series prediction, and speech recognition.

In [5]:
import numpy as np
import tensorflow as tf

# Sample training data
input_data = ['hello', 'how are you', 'goodbye']
output_data = ['hi', 'I am fine', 'bye']

# Tokenization
input_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
output_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
input_tokenizer.fit_on_texts(input_data)
output_tokenizer.fit_on_texts(output_data)

input_seq = input_tokenizer.texts_to_sequences(input_data)
output_seq = output_tokenizer.texts_to_sequences(output_data)

input_seq = tf.keras.preprocessing.sequence.pad_sequences(input_seq)
output_seq = tf.keras.preprocessing.sequence.pad_sequences(output_seq)

# Define the Seq2Seq model
class Seq2SeqModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super(Seq2SeqModel, self).__init__()
        self.encoder = tf.keras.layers.LSTM(hidden_units, return_sequences=True, return_state=True)
        self.decoder = tf.keras.layers.LSTM(hidden_units, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(vocab_size)
    
    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs
        _, encoder_state_h, encoder_state_c = self.encoder(encoder_inputs)
        encoder_states = [encoder_state_h, encoder_state_c]
        decoder_outputs, _, _ = self.decoder(decoder_inputs, initial_state=encoder_states)
        logits = self.dense(decoder_outputs)
        return logits

# Initialize and compile the model
vocab_size = len(input_tokenizer.word_index) + 1
embedding_dim = 128
hidden_units = 256

model = Seq2SeqModel(vocab_size, embedding_dim, hidden_units)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit([input_seq, output_seq[:, :-1]], output_seq[:, 1:], batch_size=2, epochs=100)


Epoch 1/100


ValueError: in user code:

    File "C:\ProgramData\anaconda3\envs\deeppavlov_env\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "C:\ProgramData\anaconda3\envs\deeppavlov_env\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\ProgramData\anaconda3\envs\deeppavlov_env\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "C:\ProgramData\anaconda3\envs\deeppavlov_env\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "C:\ProgramData\anaconda3\envs\deeppavlov_env\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\pedam\AppData\Local\Temp\__autograph_generated_filec2idwjx4.py", line 11, in tf__call
        (_, encoder_state_h, encoder_state_c) = ag__.converted_call(ag__.ld(self).encoder, (ag__.ld(encoder_inputs),), None, fscope)

    ValueError: Exception encountered when calling layer "seq2_seq_model_3" "                 f"(type Seq2SeqModel).
    
    in user code:
    
        File "C:\Users\pedam\AppData\Local\Temp\ipykernel_20488\3889586601.py", line 30, in call  *
            _, encoder_state_h, encoder_state_c = self.encoder(encoder_inputs)
        File "C:\ProgramData\anaconda3\envs\deeppavlov_env\lib\site-packages\keras\layers\rnn\base_rnn.py", line 553, in __call__  **
            return super().__call__(inputs, **kwargs)
        File "C:\ProgramData\anaconda3\envs\deeppavlov_env\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
            raise e.with_traceback(filtered_tb) from None
        File "C:\ProgramData\anaconda3\envs\deeppavlov_env\lib\site-packages\keras\engine\input_spec.py", line 232, in assert_input_compatibility
            raise ValueError(
    
        ValueError: Input 0 of layer "lstm_6" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 3)
    
    
    Call arguments received by layer "seq2_seq_model_3" "                 f"(type Seq2SeqModel):
      • inputs=('tf.Tensor(shape=(None, 3), dtype=int32)', 'tf.Tensor(shape=(None, 2), dtype=int32)')


In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Masking, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample training data
input_data = ['hello', 'how are you', 'goodbye']
output_data = ['hi', 'I am fine', 'bye']

# Tokenization
input_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
output_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
input_tokenizer.fit_on_texts(input_data)
output_tokenizer.fit_on_texts(output_data)

input_seq = input_tokenizer.texts_to_sequences(input_data)
output_seq = output_tokenizer.texts_to_sequences(output_data)

input_seq = pad_sequences(input_seq)
output_seq = pad_sequences(output_seq)

# Define the Transformer model
def transformer_model(vocab_size, d_model, num_heads, num_layers, dropout_rate, name="transformer"):
    inputs = Input(shape=(None,), name="inputs")
    dec_inputs = Input(shape=(None,), name="dec_inputs")

    # Embedding layers
    input_embedding = Embedding(vocab_size, d_model)(inputs)
    dec_embedding = Embedding(vocab_size, d_model)(dec_inputs)

    # Masking layers
    input_masking = Masking(mask_value=0)(input_embedding)
    dec_masking = Masking(mask_value=0)(dec_embedding)

    # Encoder
    enc_outputs = input_masking
    for i in range(num_layers):
        enc_outputs = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout_rate)(enc_outputs, enc_outputs)
        enc_outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(enc_outputs)

    # Decoder
    dec_outputs = dec_masking
    for i in range(num_layers):
        dec_outputs = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout_rate)(dec_outputs, enc_outputs)
        dec_outputs = tf.keras.layers.LayerNormalization(epsilon=1e-6)(dec_outputs)

    # Output layer
    outputs = Dense(vocab_size)(dec_outputs)

    return Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)

# Initialize and compile the model
vocab_size = len(input_tokenizer.word_index) + 1
d_model = 128
num_heads = 4
num_layers = 2
dropout_rate = 0.1

model = transformer_model(vocab_size, d_model, num_heads, num_layers, dropout_rate)
model.compile(optimizer=Adam(1e-4), loss=SparseCategoricalCrossentropy(from_logits=True), metrics=[SparseCategoricalAccuracy()])

# Train the model
model.fit([input_seq, output_seq[:, :-1]], output_seq[:, 1:], batch_size=2, epochs=100)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x17e8b2b0b20>

In [10]:
def preprocess_input(text, tokenizer, max_length):
    """
    Preprocesses the input text.
    """
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    return padded_sequence

def generate_response(input_text, model, tokenizer, max_length):
    """
    Generates a response to the input text using the trained model.
    """
    input_sequence = preprocess_input(input_text, tokenizer, max_length)
    output_sequence = np.array([[0]])  # Start token
    for _ in range(max_length):
        predictions = model.predict([input_sequence, output_sequence])
        predicted_id = np.argmax(predictions[:, -1, :])
        if predicted_id == 0:  # End token
            break
        output_sequence = np.append(output_sequence, [[predicted_id]], axis=-1)
    return output_sequence

def postprocess_output(output_sequence, tokenizer):
    """
    Postprocesses the model's output sequence into human-readable text.
    """
    decoded_text = tokenizer.sequences_to_texts(output_sequence)[0]
    return decoded_text.strip()

# Example usage
input_text = "hello"
max_length = 10000
response_sequence = generate_response(input_text, model, output_tokenizer, max_length)
response_text = postprocess_output(response_sequence, output_tokenizer)
print("Bot:", response_text)


Bot: 
