**Installing libraries**

In [None]:
!pip install SpeechRecognition
!pip install pydub
!pip install ffmpeg
!pip install pocketsphinx

import speech_recognition as sr
from pydub import AudioSegment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SpeechRecognition
  Downloading SpeechRecognition-3.10.0-py2.py3-none-any.whl (32.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.10.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ffmpeg
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: ffmpeg
  Building wheel for ffmpeg (setup.py) ... done
  Created wheel for f

In [None]:
import librosa
import numpy as np
from sklearn.linear_model import LinearRegression

**Training the LPC model**

In [None]:
import librosa
import numpy as np
from sklearn.linear_model import LinearRegression

# Load the training utterance. librosa.load returns (samples, sampling rate).
# The rate is bound to `sample_rate`, not `sr`, so that it does not overwrite
# the `speech_recognition as sr` module alias imported in the first cell.
audio_file = "/content/M01_Session1_0005.wav"
audio_signal, sample_rate = librosa.load(audio_file)

# Extract linear-prediction coefficients of order p from the whole signal
p = 20
lpc_coeffs = librosa.lpc(audio_signal, order=p)

In [None]:
# Normalise the LPC vector in place by its largest absolute entry
lpc_coeffs /= np.abs(lpc_coeffs).max()

# Cepstral features: power spectrogram -> peak-normalised dB -> n_mfcc rows.
# NOTE(review): librosa documents `S` as a log-power *Mel* spectrogram, whereas
# this passes a dB-scaled STFT power spectrogram directly — confirm intended.
n_mfcc = 13
power_spec = np.abs(librosa.stft(audio_signal)) ** 2
log_power_spec = librosa.power_to_db(power_spec / power_spec.max())
mfcc = librosa.feature.mfcc(S=log_power_spec, n_mfcc=n_mfcc)

# Fit a linear regression that predicts the last row of `mfcc` from all the
# preceding rows (transposed, so each column of `mfcc` becomes one sample).
# NOTE(review): lpc_coeffs is not part of the regression inputs or target.
X_train = mfcc[:-1].T
y_train = mfcc[-1]
model = LinearRegression().fit(X_train, y_train)

**Testing the LPC model**

In [None]:
# Load the test utterance. The sampling rate gets its own name so that the
# `speech_recognition as sr` module alias is not overwritten by a number.
test_audio_file = "/content/M01_Session1_0013.wav"
test_audio_signal, test_sample_rate = librosa.load(test_audio_file)

# Extract LPC coefficients at the same order p as for the training file
test_lpc_coeffs = librosa.lpc(test_audio_signal, order=p)

# Scale the LPC coefficients by their largest absolute entry (in place)
test_lpc_coeffs /= np.max(np.abs(test_lpc_coeffs))

# Extract features with the same pipeline and n_mfcc as the training cell
test_power_spec = np.abs(librosa.stft(test_audio_signal))**2
test_mfcc = librosa.feature.mfcc(S=librosa.power_to_db(test_power_spec / np.max(test_power_spec)), n_mfcc=n_mfcc)

# Predict the last feature row from the preceding rows with the fitted model
X_test = test_mfcc[:-1].T
y_test = test_mfcc[-1].T
y_pred = model.predict(X_test)

**Accuracy (R² score of the regression on the test file)**

In [None]:
# Score the fitted regression on the test features and print the result.
# NOTE(review): scikit-learn documents LinearRegression.score as the
# coefficient of determination (R^2) of the prediction, not a classification
# accuracy, so the "Accuracy" label is likely misleading — confirm the metric.
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9983317962923006


**Convert all audio sample files to wav format**

In [None]:
def convert_to_wav(filename):
    """Convert an audio file to 16 kHz, mono, 16-bit WAV and return its path.

    The converted file is written next to the input, with the input's final
    extension replaced by ``.wav``. If the input already ends in ``.wav`` the
    output path equals the input path, so the file is rewritten in place.

    Args:
        filename: Path of the audio file to convert.

    Returns:
        Path of the exported WAV file.
    """
    import os  # local import so this cell does not depend on an earlier one

    # load the audio file and normalise it to 16 kHz / 1 channel / 2-byte samples
    sound = AudioSegment.from_file(filename)
    sound = sound.set_frame_rate(16000).set_channels(1).set_sample_width(2)

    # splitext removes only the last extension of the final path component, so
    # dots in directory names, in the file stem, or a leading "./" are kept
    # (splitting on the first "." would truncate such paths).
    wav_file = os.path.splitext(filename)[0] + ".wav"
    sound.export(wav_file, format="wav")

    return wav_file

**Transcribe the speech in an audio file**

In [None]:
import speech_recognition as sr

def transcribe_audio(filename):
    """Print Google Speech Recognition's transcription alternatives for a file.

    The file is first passed through ``convert_to_wav``. Each alternative is
    printed on its own line, numbered from 1, followed by its confidence when
    the response includes one. A message is printed instead when nothing was
    recognised or when the request fails.

    Args:
        filename: Path of the audio file to transcribe.

    Returns:
        None; all results are printed.
    """
    # convert the audio file to wav format
    wav_file = convert_to_wav(filename)

    # create a recognizer object
    r = sr.Recognizer()

    # Read the entire file. adjust_for_ambient_noise is intentionally not
    # called: it tunes the energy threshold used when listening for phrases,
    # which record() does not consult, and calling it after record() only
    # reads from an already exhausted stream.
    with sr.AudioFile(wav_file) as source:
        audio = r.record(source)

    # transcribe the speech in the audio file
    try:
        result = r.recognize_google(audio, show_all=True)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
        return
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))
        return

    # With show_all=True the raw response is returned, and an unrecognised
    # clip yields an empty result rather than raising UnknownValueError.
    # Report that case explicitly instead of printing nothing.
    alternatives = result.get('alternative') if isinstance(result, dict) else None
    if not alternatives:
        print("Google Speech Recognition could not understand audio")
        return

    for i, alternative in enumerate(alternatives, start=1):
        transcript = alternative['transcript']
        if 'confidence' in alternative:
            confidence = alternative['confidence']
            print(f"Alternative {i}: {transcript} (confidence: {confidence})")
        else:
            print(f"Alternative {i}: {transcript}")


**Speech to Text Conversion**

In [None]:
# Transcribe sample 0005, the same file the regression model above was fitted on
transcribe_audio("/content/M01_Session1_0005.wav")

Alternative 1: Wendy Bank in mind a bit crack and go over and triple (confidence: 0.77680421)
Alternative 2: Wendy Bank in mind a bit crack and Coopers I drew for
Alternative 3: Wendy Bank in mind a bit crack and Coopers I dreamed
Alternative 4: Wendy Bank in mind a bit crack and Coopers I dream for
Alternative 5: Wendy Bank in mind a bit crack and Cooper I drew for


In [None]:
# Transcribe sample 0031; transcribe_audio prints the alternatives it receives
transcribe_audio("/content/M01_Session1_0031.wav")

Alternative 1: dog (confidence: 0.60164523)
Alternative 2: guard
Alternative 3: Doug
Alternative 4: Dogg


In [None]:
# Transcribe sample 0092; transcribe_audio prints the alternatives it receives
transcribe_audio("/content/M01_Session1_0092.wav")

Alternative 1: really I love you more and broken (confidence: 0.92686945)
Alternative 2: really I love you more and smoking
Alternative 3: really I love you do like more and smoking
Alternative 4: really I love you more and smoke it
Alternative 5: really I love you more and smoking in


In [None]:
# Transcribe sample 0085; transcribe_audio prints the alternatives it receives
transcribe_audio("/content/M01_Session1_0085.wav")

Alternative 1: YouTube (confidence: 0.8947466)
Alternative 2: it's you
Alternative 3: you see
Alternative 4: you too


In [None]:
# Transcribe sample 0040; transcribe_audio prints the alternatives it receives
transcribe_audio("/content/M01_Session1_0040.wav")

Alternative 1: break it out games (confidence: 0.78133756)
Alternative 2: bring it out games
Alternative 3: bring it on games
Alternative 4: bring it all games
Alternative 5: break it all games
