In [138]:
import pandas as pd
import torchaudio 
import torch

print("Number of GPU: ", torch.cuda.device_count())
print("GPU Name: ", torch.cuda.get_device_name())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 4050 Laptop GPU
Using device: cuda


In [140]:
from transformers import Wav2Vec2ForCTC,Wav2Vec2Processor,Wav2Vec2FeatureExtractor,Wav2Vec2CTCTokenizer

In [142]:
model = Wav2Vec2ForCTC.from_pretrained("E:/lipyantrak/trained_models/wav2vec0.8dropout").to("cuda")
processor = Wav2Vec2Processor.from_pretrained("E:/lipyantrak/processor/wav2vec0.8dropout")

In [143]:
def segmentLargeArray(inputTensor,chunksize=200000):
    # print(inputTensor)
    list_of_segments = []
    tensor_length = inputTensor.shape[1]
    for i in range(0,tensor_length+1,chunksize):
        list_of_segments.append(inputTensor[:,i:i+chunksize])
    return list_of_segments 

In [144]:
csv_file_path = "E:/lipyantrak/csv_files/test.csv"
test_data=pd.read_csv(path)

In [145]:
import csv
import random

def count_lines_and_generate_one_random_number(csv_file_path):
    """
    Count the number of lines in a CSV file and generate one random number within that range.

    Args:
    - csv_file_path (str): Path to the CSV file.

    Returns:
    - tuple: Total line count and a single random number (or None if the file is empty).
    """
    try:
        # Initialize line count
        line_count = 0

        # Open the file and count the lines
        with open(csv_file_path, 'r', encoding='utf-8') as file:
            reader = csv.reader(file)
            for _ in reader:
                line_count += 1

        print(f"Total number of lines in the CSV file: {line_count}")

        # Generate a single random number within the range of lines
        if line_count > 0:
            random_line_number = random.randint(1, line_count)
            print(f"Random line number: {random_line_number}")
            return line_count, random_line_number
        else:
            print("The file is empty. Cannot generate a random number.")
            return line_count, None
    except FileNotFoundError:
        print(f"Error: The file at '{csv_file_path}' was not found.")
        return 0, None

# Example Usage


In [146]:
random_number = count_lines_and_generate_one_random_number(csv_file_path)
random_number = random_number[1]

Total number of lines in the CSV file: 272
Random line number: 41


In [147]:
test1 = test_data['path'][random_number] 

In [174]:
test1 = "./datasets/test_ne_np_female/Voice15.wav"

In [176]:
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size = 1, 
    sampling_rate = 16000, 
    padding_value = 0.0, 
    do_normalize = True, 
    return_attention_mask = True
)

In [178]:
vocab_path = './input/cleaned-asr-data/data/vocabulary/vocab.json'
tokenizer = Wav2Vec2CTCTokenizer(
    vocab_path, 
    unk_token = "[UNK]", 
    pad_token = "[PAD]", 
    word_delimiter_token = "|"
)

In [180]:
processor = Wav2Vec2Processor(
    feature_extractor = feature_extractor, 
    tokenizer = tokenizer
)

In [182]:
def predict_from_speech(file):
    speech_array, sampling_rate = torchaudio.load(file)
    # print(speech_array,sampling_rate)
    resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
    resampled_array = resampler(speech_array).squeeze()
    if len(resampled_array.shape) == 1:
        resampled_array = resampled_array.reshape([1,resampled_array.shape[0]])
    # print(resampled_array.shape[1])
    if resampled_array.shape[1] >= 200000:
        # print('The input file is longer than 10 seconds')
        list_of_segments = segmentLargeArray(resampled_array)
        # print(list_of_segments)
        output = ''
        for segment in list_of_segments:
            logits = model(segment.to("cuda")).logits
            pred_ids = torch.argmax(logits,dim=-1)[0]
            output += processor.decode(pred_ids)
        print(f"Prediction:\n{output}")
    else:
        # print('The input file is less than 10 seconds')
        logits = model(resampled_array.to("cuda")).logits
        # print(logits)
        pred_ids = torch.argmax(logits, dim = -1)[0]
        print("Prediction:")
        # print(processor.decode(pred_ids))
    rdecoded_text = processor.decode(pred_ids)
    return rdecoded_text

In [184]:
from indicnlp.tokenize import indic_tokenize
# predict_from_speech(test_audio_ip1)
predicted_text = predict_from_speech(test1)
predicted_text = predicted_text.replace("[UNK]", "")

def segment_text(text):
    return ' '.join(indic_tokenize.trivial_tokenize(text))

segmented_prediction = segment_text(predicted_text)
print(segmented_prediction)

Prediction:
लागिसञ्चारकामाद्यमलेऔहमभूमीकाखेलेकोछपत्रपत्रिकाकोअभावमाव्यक्तितथाराष्ट्रलेराजनीतिखेलकुदकृषिरअन्यकुराहरूथापाउनसक्दैन[UNK][UNK]



In [166]:
def get_label_from_csv(csv_filename, path_to_find):
    df = pd.read_csv(csv_filename)
    label = df.loc[df['path'] == path_to_find, 'labels'].values[0]
    return label

In [168]:
ref = test_data['labels'][random_number]

In [170]:
ref

'पगनाथ दैलेख जिल्लाको एउटा गाविसको नाम'

In [172]:
ref = pd.read_csv("E:/lipyantrak/datasets/archive/Nepali_Speech_To_Text_Dataset/transcripts/audio_transcript.csv")
ref = "E:/lipyantrak/datasets/archive/Nepali_Speech_To_Text_Dataset/transcripts/audio_transcript.csv/2079-11-21_15.wav"

In [136]:
# ref = get_label_from_csv(csv_file, path_to_find)
print(ref)

E:/lipyantrak/datasets/archive/Nepali_Speech_To_Text_Dataset/transcripts/audio_transcript.csv/2079-11-21_15.wav


In [114]:
from evaluate import load

# Load the WER and CER metrics
wer_metric = load("wer")
cer_metric = load("cer")

def compute_metrics(pred_text, ref_text):
    """
    Computes WER and CER given predicted and reference texts.
    
    Args:
        pred_text (str): The predicted transcription.
        ref_text (str): The ground truth transcription.

    Returns:
        dict: A dictionary with WER, CER, and adjusted CER ("cer_best").
    """
    # Predictions and references should be lists of strings for the metrics
    pred_str = [pred_text]  # Wrap in a list as required by the metric
    ref_str = [ref_text]    # Wrap in a list as required by the metric

    # Compute WER and CER
    wer = wer_metric.compute(predictions=pred_str, references=ref_str)
    cer = cer_metric.compute(predictions=pred_str, references=ref_str)

    # Adjust CER for best model selection (optional logic)
    # cer_best = 1 - cer

    return {"wer": wer, "cer": cer}

In [115]:
pred_text = test1
ref_text = ref
# Compute WER and CER
metrics = compute_metrics(pred_text, ref_text)
print(metrics)


{'wer': 1.0, 'cer': 1.0}
