In [1]:
import collections
import io
import json
import os
import queue
import re
import time

import requests
import numpy as np
import webrtcvad
import sounddevice as sd
from faster_whisper import WhisperModel
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torchaudio


os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import threading

In [2]:
class SentimentAnalyzer:
    """Two-class sentiment classifier built on a DistilBERT SST-2 checkpoint.

    The tokenizer and the classification model are loaded once, when the
    instance is created, and the model is switched to evaluation mode.
    """

    def __init__(self):
        self.model_name = "distilbert-base-uncased-finetuned-sst-2-english"
        self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_name)
        self.model = DistilBertForSequenceClassification.from_pretrained(self.model_name)
        # This class only runs inference, so put the model in evaluation mode.
        self.model.eval()

    def analyze(self, text):
        """Classify ``text`` and report the predicted label with its probability.

        Args:
            text: The string to classify.

        Returns:
            A dict with three keys:

            * ``"sentiment"`` - ``"positive"`` or ``"negative"``; ``"neutral"``
              when ``text`` is empty or whitespace only.
            * ``"confidence"`` - softmax probability of the predicted class
              (``0.5`` for the neutral placeholder).
            * ``"score"`` - predicted class index, 0 = negative, 1 = positive
              (``-1`` for the neutral placeholder).
        """
        stripped = text.strip()
        if not stripped:
            # Nothing to classify: return a placeholder without running the model.
            return {"sentiment": "neutral", "confidence": 0.5, "score": -1}

        encoded = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = self.model(**encoded).logits

        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        label_index = torch.argmax(probabilities, dim=1).item()

        if label_index == 1:
            label = "positive"
        else:
            label = "negative"

        return {
            "sentiment": label,
            "confidence": probabilities[0][label_index].item(),
            "score": label_index,
        }

In [3]:
def record_utterance(sample_rate=16000, frame_duration=30, required_silence=1.0,
                     vad_mode=3, pre_roll_frames=5):
    """Record a single utterance and return it as raw 16-bit mono PCM bytes.

    Audio is read from a ``sounddevice`` raw input stream in fixed-size frames.
    Each frame is classified by a WebRTC voice-activity detector. Recording
    starts at the first speech frame (prefixed with a few buffered pre-speech
    frames so the onset is not clipped) and stops once ``required_silence``
    seconds of consecutive non-speech frames have followed speech.

    Args:
        sample_rate: Capture sample rate in Hz.
        frame_duration: Length of one VAD frame in milliseconds.
            NOTE(review): webrtcvad is documented to accept only 10, 20 or
            30 ms frames - confirm before changing.
        required_silence: Seconds of uninterrupted silence that end the
            utterance.
        vad_mode: Aggressiveness passed to ``webrtcvad.Vad``.
        pre_roll_frames: Number of pre-speech frames kept and prepended to the
            recording.

    Returns:
        ``bytes`` with the concatenated frames: pre-roll, speech, and the
        trailing silence that triggered the stop.

    This call blocks until an utterance has been captured.
    """
    vad = webrtcvad.Vad(vad_mode)
    frame_size = int(sample_rate * frame_duration / 1000)
    audio_q = queue.Queue()

    def audio_callback(indata, frames, time_info, status):
        # Runs on the audio thread: copy the frame out and hand it over.
        audio_q.put(bytes(indata))

    # Track silence in whole milliseconds. The previous code added
    # ``frame_duration / 4000.0`` per frame, which made the 1.0 s threshold
    # take about four seconds of real silence to reach.
    required_silence_ms = required_silence * 1000.0

    with sd.RawInputStream(samplerate=sample_rate, blocksize=frame_size,
                           dtype='int16', channels=1, callback=audio_callback):
        ring_buffer = collections.deque(maxlen=pre_roll_frames)
        voiced_frames = []
        speech_detected = False
        silence_ms = 0

        while True:
            frame = audio_q.get()

            if vad.is_speech(frame, sample_rate):
                if not speech_detected:
                    print("...", flush=True)
                    speech_detected = True
                    # Prepend the buffered lead-in so the first word is intact.
                    voiced_frames.extend(ring_buffer)
                voiced_frames.append(frame)
                silence_ms = 0
            elif speech_detected:
                # Keep the pause in the recording; it may be mid-sentence.
                silence_ms += frame_duration
                voiced_frames.append(frame)
                if silence_ms >= required_silence_ms:
                    return b''.join(voiced_frames)
            else:
                # Still waiting for speech: remember only the latest frames.
                ring_buffer.append(frame)

In [4]:
def transcribe_audio(model, audio_bytes):
    """Transcribe raw 16-bit PCM audio with a faster-whisper style model.

    The samples are scaled to float32 in ``[-1.0, 1.0)`` and handed to
    ``model.transcribe`` directly as a NumPy waveform. This avoids encoding the
    audio to an in-memory WAV file only for the model to decode it again.

    Args:
        model: Object exposing ``transcribe(audio)`` that returns
            ``(segments, info)``, where each segment has a ``.text`` attribute.
        audio_bytes: Little-endian int16 mono PCM.
            NOTE(review): no resampling happens here, so the audio is assumed
            to already be 16 kHz (the rate the recorder uses by default).

    Returns:
        The segment texts joined with single spaces and stripped, or ``""``
        when ``audio_bytes`` is empty. The transcript is also printed as
        ``You: <text>``.
    """
    if not audio_bytes:
        # Nothing was recorded; skip the model call entirely.
        return ""

    samples = np.frombuffer(audio_bytes, dtype=np.int16)
    waveform = samples.astype(np.float32) / 32768.0

    segments, _ = model.transcribe(waveform)
    text = " ".join(seg.text for seg in segments).strip()
    print(f"You: {text}")
    return text

In [5]:
def init_tts():
    """Load the Silero English text-to-speech model through ``torch.hub``.

    Returns:
        The first element of whatever ``torch.hub.load`` returns for the
        ``silero_tts`` entry point; the rest of the notebook uses it as the
        TTS model object.
    """
    hub_result = torch.hub.load(
        'snakers4/silero-models',
        'silero_tts',
        language='en',
        speaker='v3_en',
    )
    return hub_result[0]

def speak(text, tts_model, speaker='en_99', sample_rate=24000):
    """Synthesize ``text`` with ``tts_model`` and play it, waiting until done.

    Args:
        text: Text to speak.
        tts_model: Object exposing ``apply_tts(text=, speaker=, sample_rate=)``.
        speaker: Voice identifier forwarded to ``apply_tts``.
        sample_rate: Rate used for both synthesis and playback.

    Any exception raised during synthesis or playback is caught and reported
    on stdout as ``TTS Error: ...``; nothing is re-raised.
    """
    try:
        waveform = tts_model.apply_tts(
            text=text,
            speaker=speaker,
            sample_rate=sample_rate,
        )
        sd.play(waveform, sample_rate)
        # Block so successive calls do not overlap.
        sd.wait()
    except Exception as err:
        print(f"TTS Error: {err}")

In [6]:
class ConversationManager:
    """Tracks a short rolling conversation history and builds LLM prompts.

    ``context`` is a list of ``(speaker, message, sentiment_dict)`` tuples.
    User entries are added by :meth:`update_context`; other code may append
    ``"assistant"`` entries directly, so every method that needs the user's
    mood looks for the most recent ``"user"`` entry and does not assume it is
    the last item.
    """

    # Maximum number of entries kept in ``context`` after a user turn is added.
    MAX_CONTEXT = 5
    # Matched as substrings so "computers", "technology", ... still count.
    _TECH_KEYWORDS = ("computer", "tech", "code", "programming")
    # Matched as whole words only: as substrings, "i" and "me" occur inside
    # almost every sentence ("this", "nice", "message") and would mark nearly
    # all input as personal.
    _PERSONAL_WORDS = frozenset({"i", "me", "my"})
    # Matched as word prefixes so "feel", "feels", "feeling" all count.
    _PERSONAL_PREFIXES = ("feel",)

    def __init__(self, sentiment_analyzer=None):
        """Create an empty conversation.

        Args:
            sentiment_analyzer: Optional object exposing ``analyze(text)`` that
                returns a dict with at least ``"sentiment"`` and
                ``"confidence"``. A ``SentimentAnalyzer`` is created when
                omitted.
        """
        self.context = []
        self.last_user_message = ""
        if sentiment_analyzer is None:
            sentiment_analyzer = SentimentAnalyzer()
        self.sentiment_analyzer = sentiment_analyzer
        # Canned follow-ups, indexed as topic -> sentiment -> candidates.
        self.conversation_topics = {
            "default": {
                "positive": [
                    "That's wonderful! What's making you so happy today?",
                    "You sound really pleased about that!",
                    "Great to hear you're in good spirits!"
                ],
                "negative": [
                    "I'm sorry to hear that. Would you like to talk about it?",
                    "That sounds difficult. How are you handling it?",
                    "I can tell this is bothering you. What would help?"
                ],
                "neutral": [
                    "What would you like to talk about?",
                    "What's on your mind?",
                    "Tell me more about that."
                ]
            },
            "tech": {
                "positive": [
                    "Glad you're excited about tech! What specifically interests you?",
                    "Tech can be so rewarding when it works well!",
                    "What tech has caught your attention lately?"
                ],
                "negative": [
                    "Tech frustrations are so common these days. What's the issue?",
                    "Having computer problems again?",
                    "What's not working the way it should?"
                ],
                "neutral": [
                    "What tech topics interest you these days?",
                    "How's your tech life going?",
                    "Working on anything interesting with technology?"
                ]
            },
            "personal": {
                "positive": [
                    "You sound really happy about that!",
                    "It's great to hear good news!",
                    "What's bringing you joy these days?"
                ],
                "negative": [
                    "I'm here to listen if you want to talk.",
                    "That sounds challenging. How are you coping?",
                    "Would it help to talk about what's bothering you?"
                ],
                "neutral": [
                    "How are things going for you?",
                    "What's new in your life?",
                    "How have you been lately?"
                ]
            }
        }
        self.occasional_quips = [
            "That's what I would say if I had more feelings.",
            "Fascinating - tell me more.",
            "Interesting perspective!",
            "You always have such unique thoughts.",
            "That's a new one for my conversation database."
        ]

    def analyze_sentiment(self, text):
        """Return the sentiment dict produced by the configured analyzer."""
        return self.sentiment_analyzer.analyze(text)

    def update_context(self, user_input):
        """Record a user message with its sentiment and trim old history.

        The history is cut back to the newest ``MAX_CONTEXT`` entries. Trimming
        removes however many entries are over the limit, because assistant
        entries appended from outside would otherwise let the list grow by one
        per turn.
        """
        sentiment = self.analyze_sentiment(user_input)
        self.last_user_message = user_input
        self.context.append(("user", user_input, sentiment))
        if len(self.context) > self.MAX_CONTEXT:
            del self.context[:-self.MAX_CONTEXT]

    def _last_user_sentiment(self):
        """Return the sentiment label of the newest user entry, or ``None``."""
        for who, _message, sentiment in reversed(self.context):
            if who == "user":
                return sentiment["sentiment"]
        return None

    def _detect_topic(self, text):
        """Map ``text`` to ``"tech"``, ``"personal"`` or ``"default"``."""
        lowered = text.lower()
        if any(keyword in lowered for keyword in self._TECH_KEYWORDS):
            return "tech"
        # Splitting on letters turns "I'm" into "i" + "m", so contractions
        # still count as first-person.
        words = re.findall(r"[a-z]+", lowered)
        if any(word in self._PERSONAL_WORDS or word.startswith(self._PERSONAL_PREFIXES)
               for word in words):
            return "personal"
        return "default"

    def should_add_quip(self):
        """Decide whether to append a light-hearted quip.

        Only a 10% chance, and only when the conversation has more than two
        entries and the user's latest message was positive.
        """
        return (
            len(self.context) > 2
            and self._last_user_sentiment() == "positive"
            and np.random.random() < 0.1
        )

    def generate_follow_up(self):
        """Pick a canned follow-up matching the user's latest topic and mood.

        Returns a neutral default prompt when no user message has been
        recorded yet.
        """
        sentiment = self._last_user_sentiment()
        if sentiment is None:
            return np.random.choice(self.conversation_topics["default"]["neutral"])

        topic = self._detect_topic(self.last_user_message)
        response = np.random.choice(self.conversation_topics[topic][sentiment])

        if self.should_add_quip():
            quip = np.random.choice(self.occasional_quips)
            response = f"{response} {quip}"

        return response

    def format_prompt(self, user_input):
        """Record ``user_input`` and build the instruction prompt for the LLM.

        Side effect: calls :meth:`update_context`, so the message and its
        sentiment are added to the history before the prompt is rendered.
        The prompt includes the two most recent history entries.
        """
        self.update_context(user_input)
        # update_context just appended the user entry, so it is the last item.
        sentiment = self.context[-1][2]

        history = "\n".join([f"{who}: {msg}" for who, msg, _ in self.context[-2:]])

        return f"""Continue this conversation naturally as a friendly, helpful AI assistant. 
        User sentiment: {sentiment['sentiment']} (confidence: {sentiment['confidence']:.2f}).
        
        Guidelines:
        1. Be warm and personable but not overly familiar
        2. Use natural spoken language that sounds good when read aloud
        3. Only use humor when it feels completely natural (about 10% of responses)
        4. Keep responses concise (1-2 sentences)
        5. Match the user's emotional tone:
           - Positive: supportive and engaged
           - Negative: empathetic and helpful
           - Neutral: curious and interested
        
        Example good responses:
        - "That's great progress! What's next?"
        - "I understand why that would be frustrating."
        - "What aspects of that interest you most?"
        
        Avoid:
        - Forced humor or jokes
        - Overly casual language when inappropriate
        - Any physical action descriptions
        
        Conversation history:
        {history}
        
        Respond to: {user_input}
        """

In [7]:
def generate_response(prompt, tts_model, conversation):
    """Stream a reply from the local Ollama server and speak it as it arrives.

    The user text is turned into a full prompt by
    ``conversation.format_prompt`` (which also records it in the history) and
    sent to the ``/api/generate`` endpoint with streaming enabled. Tokens are
    accumulated and flushed to :func:`speak` at punctuation or whitespace
    boundaries so playback can start before generation finishes. The complete
    reply is printed and appended to ``conversation.context`` as an
    ``"assistant"`` entry.

    Args:
        prompt: The user's transcribed utterance.
        tts_model: TTS model forwarded to :func:`speak`.
        conversation: Object exposing ``format_prompt(text)`` and a list-like
            ``context`` attribute.

    Returns:
        The stripped reply text. On any error, the error is printed and a
        fixed apology string is spoken and returned.
    """
    try:
        formatted_prompt = conversation.format_prompt(prompt)

        payload = {
            "model": "mistral",
            "prompt": formatted_prompt,
            "stream": True,
            "options": {
                "temperature": 0.85,
                # Ollama's name for the generation length cap; "max_tokens"
                # is not one of its options.
                "num_predict": 120,
            }
        }

        full_response = ""
        buffer = ""
        flush_chars = ('.', '?', '!', ',', ';', ':')

        # The timeout is (connect, read) seconds, so a stalled or missing
        # server cannot hang the loop forever. The ``with`` block releases the
        # streamed connection even when an exception interrupts the loop.
        with requests.post(
            "http://localhost:11434/api/generate",
            headers={"Content-Type": "application/json"},
            data=json.dumps(payload),
            stream=True,
            timeout=(5, 120),
        ) as response:
            response.raise_for_status()

            for line in response.iter_lines():
                if not line:
                    continue

                chunk = json.loads(line.decode('utf-8'))
                if "error" in chunk:
                    # Surface server-side failures reported inside the stream.
                    raise RuntimeError(chunk["error"])

                token = chunk.get("response", "")
                full_response += token
                buffer += token

                # Flush when the token *ends* with punctuation. Model tokens
                # often carry it attached (" there."), which an exact-match
                # test against single characters would miss.
                if token.isspace() or token.rstrip().endswith(flush_chars):
                    if buffer.strip():
                        speak(buffer, tts_model, speaker='en_99')
                        buffer = ""

                if chunk.get("done"):
                    break

        # Speak whatever is left after the final token.
        if buffer.strip():
            speak(buffer, tts_model, speaker='en_99')

        print(f"AI: {full_response.strip()}")
        conversation.context.append(("assistant", full_response.strip(), {"sentiment": "neutral"}))

        return full_response.strip()

    except Exception as e:
        # Best-effort: keep the voice loop alive and tell the user.
        print(f"Error: {e}")
        error_msg = "Hmm, I got a bit confused there. What were we saying?"
        speak(error_msg, tts_model)
        return error_msg

In [9]:
# --- Model setup -----------------------------------------------------------
tts_model = init_tts()

# Use the GPU when PyTorch can see one, otherwise run Whisper on the CPU so
# the notebook still works on machines without CUDA.
# NOTE(review): torch's CUDA check is used as a proxy for faster-whisper's own
# CUDA support - confirm the two agree in the target environment.
stt_device = "cuda" if torch.cuda.is_available() else "cpu"
stt_model = WhisperModel("tiny.en", device=stt_device, compute_type="int8_float32")
conversation = ConversationManager()

speak("Hello, what do you have in mind today...?", tts_model)

# --- Voice loop: listen -> transcribe -> respond; stop with Ctrl+C / Interrupt
try:
    while True:
        audio = record_utterance()
        if not audio:
            continue

        user_text = transcribe_audio(stt_model, audio)
        if not user_text:
            # Nothing intelligible was heard; go back to listening.
            continue

        bot_response = generate_response(user_text, tts_model, conversation)

except KeyboardInterrupt:
    speak("Goodbye!", tts_model)

Using cache found in C:\Users\Soumyak/.cache\torch\hub\snakers4_silero-models_master


...
You: I have a ceiling no couldo, so you can help me.
