In [None]:
# Install the third-party Python packages imported in the next cell.
!pip install pandas numpy SpeechRecognition pydub gtts
# Install the ffmpeg system package (presumably the decoder pydub relies on for
# non-WAV uploads -- confirm against the pydub documentation).
!apt-get install -y ffmpeg


In [None]:
import os
import pandas as pd
import numpy as np
import speech_recognition as sr
from pydub import AudioSegment
from google.colab import drive
import io
from google.colab import files
from gtts import gTTS
from IPython.display import Audio, display
import re


# Mount Google Drive into the Colab filesystem; the dataset path below lives
# under this mount point, so this must run before the CSV is read.
drive.mount('/content/drive')
# Location of the question/answer CSV inside the mounted Drive.
dataset_path = '/content/drive/My Drive/Colab Notebooks/DataScience QA.csv'
# Load the dataset; the code below reads its 'Question' and 'Answer' columns.
df = pd.read_csv(dataset_path)

def convert_to_wav(uploaded_file_path, output_file_path):
    """Re-encode an audio file as WAV.

    Args:
        uploaded_file_path: Path of the audio file to read.
        output_file_path: Path the WAV-encoded copy is written to.
    """
    # Load with pydub and immediately write the segment back out as WAV.
    AudioSegment.from_file(uploaded_file_path).export(output_file_path, format="wav")

def speech_to_text():
    """Prompt for an audio upload and return its transcription.

    The first uploaded file is converted to WAV (written to ``output.wav``)
    and transcribed with ``Recognizer.recognize_google``.

    Returns:
        The recognized text, or ``None`` when nothing was uploaded, the audio
        could not be converted or read, the speech could not be understood,
        or the recognition service request failed. Every failure path prints
        a short explanation before returning.
    """
    recognizer = sr.Recognizer()
    print("Please upload your audio file.")
    uploaded = files.upload()

    # An upload that yields no files (e.g. a cancelled dialog) leaves nothing
    # to index; report it and let the caller ask the user to try again rather
    # than crashing with IndexError.
    if not uploaded:
        print("No audio file was uploaded.")
        return None

    # Only the first uploaded file is used.
    uploaded_file_path = next(iter(uploaded))
    wav_file_path = "output.wav"

    try:
        convert_to_wav(uploaded_file_path, wav_file_path)
        with sr.AudioFile(wav_file_path) as source:
            audio = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio)
                print(f"You said: {text}")
                return text
            except sr.UnknownValueError:
                print("Google Speech Recognition could not understand audio")
            except sr.RequestError as e:
                print(f"Could not request results from Google Speech Recognition service; {e}")
    except Exception as e:
        # Best-effort boundary: any conversion/read failure is reported and
        # turned into a None result so the chat loop keeps running.
        print(f"Error converting audio file: {e}")
    return None

def text_to_speech(text):
    """Speak ``text`` aloud in the notebook.

    The synthesized speech is saved to ``response.mp3`` (overwriting any
    previous response) and rendered as an auto-playing audio widget.

    Args:
        text: The text to synthesize.
    """
    gTTS(text).save("response.mp3")
    player = Audio("response.mp3", autoplay=True)
    display(player)

def jaccard_similarity(query, document):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        query: First text to compare.
        document: Second text to compare.

    Returns:
        A float between 0.0 and 1.0. When neither string contains any word
        the union is empty, and 0.0 is returned instead of dividing by zero
        (two empty texts share no evidence of being a match).
    """
    query_set = set(query.lower().split())
    document_set = set(document.lower().split())
    union = query_set | document_set
    if not union:
        return 0.0
    return len(query_set & document_set) / len(union)

def preprocess_text(text):
    """Normalize text for word-level comparison.

    Hyphens are turned into spaces, the text is lower-cased, and every
    character that is neither a word character nor whitespace is removed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    spaced = text.replace('-', ' ')
    return re.sub(r'[^\w\s]', '', spaced.lower())

# Normalize the stored questions once so each lookup only has to normalize the
# incoming query. Blank CSV cells are read by pandas as NaN (a float), which
# would make preprocess_text fail on .replace(); treat them as empty strings
# and coerce any other non-text values to str before normalizing.
df['Question'] = df['Question'].fillna('').astype(str).apply(preprocess_text)
def find_best_match(question, df):
    """Return the answer whose stored question is most similar to ``question``.

    The query is normalized with ``preprocess_text`` and scored against every
    value in the ``'Question'`` column using ``jaccard_similarity``; the
    stored questions are compared as-is.

    Args:
        question: The user's question text.
        df: DataFrame with ``'Question'`` and ``'Answer'`` columns.

    Returns:
        The ``'Answer'`` value of the highest-scoring row; on ties the first
        such row wins (``np.argmax`` behavior).
    """
    normalized_query = preprocess_text(question)
    scores = [
        jaccard_similarity(normalized_query, stored_question)
        for stored_question in df['Question']
    ]
    top_position = np.argmax(scores)
    return df.iloc[top_position]['Answer']

def chatbot():
    """Run the voice question-answering loop until the user says 'exit'.

    Each turn transcribes an uploaded recording, looks up the closest dataset
    question, and both prints and speaks the corresponding answer. When the
    transcription fails, the bot speaks a request to repeat and starts the
    next turn.
    """
    print("Chatbot is ready to talk! (Say 'exit' to stop)")
    while True:
        heard = speech_to_text()

        # Nothing usable was transcribed: ask again.
        if heard is None:
            text_to_speech("Sorry, I didn't catch that. Could you please repeat?")
            continue

        # The spoken word "exit" (any casing) ends the session.
        if heard.lower() == 'exit':
            print("Goodbye!")
            text_to_speech("Goodbye!")
            return

        reply = find_best_match(heard, df)
        print(f"Chatbot: {reply}")
        text_to_speech(reply)

# Start the interactive session; this call loops until the user says 'exit'.
chatbot()
