# English Assistant

### 1) Extract audio

In [1]:
import pyaudio
import wave
import threading

# Recording parameters
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
OUTPUT_FILENAME = "recorded_audio.wav"

# Initialize PyAudio
audio = pyaudio.PyAudio()
stream = audio.open(
    format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK
)

frames = []
recording = True


def record():
    global recording
    print("Recording... Press Enter to stop.")
    while recording:
        data = stream.read(CHUNK)
        frames.append(data)


# Start recording in a separate thread
record_thread = threading.Thread(target=record)
record_thread.start()

# Wait for user to press Enter
input()  # Blocks until Enter is pressed
recording = False  # Stop recording

# Wait for the recording thread to finish
record_thread.join()

print("Recording finished.")

# Stop and close the stream
stream.stop_stream()
stream.close()
audio.terminate()

# Save the recorded data to a WAV file
with wave.open(OUTPUT_FILENAME, "wb") as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(audio.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b"".join(frames))

print(f"Audio saved as {OUTPUT_FILENAME}")

Recording... Press Enter to stop.
Recording finished.
Audio saved as recorded_audio.wav


### 2) Speech-to-text : Whisper

In [2]:
import whisper_timestamped as whisper

audio = whisper.load_audio("recorded_audio.wav")
whisper_model = whisper.load_model("tiny")
transcription = whisper.transcribe(whisper_model, audio, language="en")

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



100%|██████████| 524/524 [00:00<00:00, 2329.44frames/s]


In [3]:
words_with_confidence = []

for segment in transcription["segments"]:
    for word in segment["words"]:
        words_with_confidence.append(
            {
                "word": word["text"],
                "confidence": word["confidence"],
                "is_low_confidence": word["confidence"] < 0.5,
            }
        )


words_with_confidence

[{'word': 'Hello,', 'confidence': 0.775, 'is_low_confidence': False},
 {'word': 'my', 'confidence': 0.947, 'is_low_confidence': False},
 {'word': 'name', 'confidence': 0.998, 'is_low_confidence': False},
 {'word': 'is', 'confidence': 0.997, 'is_low_confidence': False},
 {'word': 'Louis', 'confidence': 0.54, 'is_low_confidence': False},
 {'word': 'and', 'confidence': 0.637, 'is_low_confidence': False},
 {'word': 'I', 'confidence': 0.961, 'is_low_confidence': False},
 {'word': 'want', 'confidence': 0.906, 'is_low_confidence': False},
 {'word': 'to', 'confidence': 0.995, 'is_low_confidence': False},
 {'word': 'purchase', 'confidence': 0.473, 'is_low_confidence': True},
 {'word': 'my', 'confidence': 0.963, 'is_low_confidence': False},
 {'word': 'English.', 'confidence': 0.928, 'is_low_confidence': False}]

In [4]:
transcription["text"]

' Hello, my name is Louis and I want to purchase my English.'

### 3) Text-to-text: Llama 3.2

In [5]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using device: {device}")

llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
llama_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    token=os.getenv("HF_TOKEN"),
).to(device)

Using device: mps


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
system_prompt = "You are a helpful English language tutor."

prompt = f"""<|system|>
{system_prompt}
<|user|>
The student said: "{transcription['text']}"
Respond in a helpful, encouraging way.
<|assistant|>"""

inputs = llama_tokenizer(prompt, return_tensors="pt").to(device)
output = llama_model.generate(
    **inputs, max_new_tokens=256, temperature=0.7, top_p=0.9, do_sample=True
)

response_text = llama_tokenizer.decode(output[0], skip_special_tokens=True)

# Extract assistant's response
response_text = response_text.split("<|assistant|>")[-1].strip()

response_text

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


"Hi Louis! It's great to meet you. I'd be happy to help you with your English purchase. Can you tell me a bit more about what you're looking for? Are you looking to buy a book, a language course, or perhaps a conversation practice session? Let's chat and see how I can assist you in achieving your English goals!"

### 4) Text-to-speech: Kokoro

In [7]:
from kokoro import KPipeline
import numpy as np
from IPython.display import display, Audio

kokoro_pipeline = KPipeline(lang_code="a")

generator = kokoro_pipeline(
    response_text, voice="af_heart", speed=1, split_pattern=r"\n+"
)

speech_segments = []
for _, _, audio in generator:
    speech_segments.append(audio)

speech_output = (
    np.concatenate(speech_segments) if len(speech_segments) > 1 else speech_segments[0]
)

display(Audio(data=speech_output, rate=24000))



  WeightNorm.apply(module, name, dim)
