### Imports

In [75]:
import librosa
import pandas as pd
import numpy as np
import noisereduce as nr
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import re
import string
from num2words import num2words
import re
import soundfile as sf
import torch


### Reading dataset

In [76]:
df=pd.read_csv('Dataset/index.csv')
df

Unnamed: 0,audio_file,text,gender
0,D6x81u_1.mp3,خليني ابدا الفيديو ده وانا بقول ان من,male
1,6f5WmH_2.mp3,ااا ملاحظتي الشخصيه استنتجت ان الستات ما,male
2,abSOwC_3.mp3,بقاش عندها دم,male
3,qqiArB_4.mp3,وان الستات بداوا يستغلوا الرجاله وان,male
4,afzWCf_5.mp3,الستات دول,male
...,...,...,...
408054,0VO9mA_255.mp3,الامراض في كل,female
408055,22yVw5_256.mp3,[موسيقى],female
408056,OrpHs8_257.mp3,مكان,female
408057,HndKfM_258.mp3,[موسيقى],female


### Preprocessing

In [77]:
def transcription_preprocessing(sentence):
    sentence = sentence.lower()
    sentence = sentence.replace("\n", " ")
    sentence = re.sub(r'[إأآ]', 'ا', sentence)
    sentence = re.sub(r'[^a-zA-Zء-ي\s\d]', '', sentence)
    pattern = r'[\u0617-\u061A\u064B-\u065F]'
    sentence = re.sub(pattern, '', sentence)
    pattern = r'([a-zA-Z])([ء-ي])|([ء-ي])([a-zA-Z])'
    sentence = re.sub(pattern, r'\1\3 \2\4', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    sentence = re.sub(r'\d+', lambda x: num2words(int(x.group()), lang='ar'), sentence)
    return sentence

In [78]:
audio_files = set(os.listdir( 'Dataset/data/')) 
csv_audio_ids = set(df['audio_file'].values)
audio_without_transcription = audio_files - csv_audio_ids
print("Audio files without transcriptions:", audio_without_transcription)
transcription_without_audio = csv_audio_ids - audio_files
print("Transcriptions without audio files:", transcription_without_audio)
df.drop(df[df['audio_file']=='لشخصك ولأفكارك الـProgressive،"'].index,inplace = True)
df.drop(df[df['text'] == '[موسيقى]'].index,inplace = True)
df = df[~df['audio_file'].isin(transcription_without_audio)]

Audio files without transcriptions: set()
Transcriptions without audio files: {'xSxG6H_855.mp3', 'rr2jzp_582.mp3', 'ATbwsb_587.mp3', 'UiA9J5_505.mp3', 'xQwTr8_1125.mp3', 'TUMiFk_273.mp3', 'YFErAB_797.mp3', 'jPCdDU_771.mp3', '0c7Edr_515.mp3', 'SL36M8_80.mp3', 'yIsrKI_164.mp3', 'ARS8O6_231.mp3', 'L5uhyH_116.mp3', 'CROdee_544.mp3', 'mQP5jP_1031.mp3', 'gns12l_1017.mp3', 'EaBPS5_358.mp3', 'xQhmtZ_128.mp3', 'Q7hMBO_818.mp3', 'dN1g81_846.mp3', 'IHFQm9_1072.mp3', 'QRJduX_1176.mp3', 'SqAIwn_264.mp3', 'XWMiNO_184.mp3', 'GPuR2P_961.mp3', 'YyUn1z_563.mp3', 'cd4hAV_91.mp3', 'zruuaM_911.mp3', 'F4OGr5_287.mp3', 'ILVnZG_508.mp3', 'dRLOu0_228.mp3', 'gc6nMN_716.mp3', 'NYnCbf_418.mp3', 'gEICeE_568.mp3', 'xBeiMc_978.mp3', 'wieXu4_522.mp3', 'nbkKgO_86.mp3', 'PTkxkk_599.mp3', 'qf6q8H_639.mp3', 'fK3q1m_942.mp3', '0wA8bx_190.mp3', 'v0kwEq_165.mp3', 'wfkSbB_1128.mp3', 'fmCmEC_1173.mp3', 'fZDB7h_99.mp3', 'EV7F6U_665.mp3', 'bKFggJ_59.mp3', 'QV4Y4c_265.mp3', 'LGvSa7_624.mp3', '5C74u2_861.mp3', 'v63k5c_919.mp3', '

In [88]:
data=os.listdir('Dataset/data')
processed=os.listdir('processed_audio')
# processed = [word.replace('processed_', '') for word in processed]
unprocessed_audios = list(set(data) - set(processed))
len(unprocessed_audios)


403754

In [80]:
len(data)-len(unprocessed_audios)

314069

In [81]:
len(processed)

314069

In [82]:
unprocessed_audios

['xhZaox_331.mp3',
 'ZWM7GF_804.mp3',
 '3qZucC_1352.mp3',
 'Aw5R4c_301.mp3',
 'dT7A4a_115.mp3',
 'Mv8GaY_432.mp3',
 'o365zv_150.mp3',
 'z2Sl4y_125.mp3',
 'pkMvDa_382.mp3',
 '69NrKu_83.mp3',
 '2uIYgT_200.mp3',
 'xuKzCM_270.mp3',
 'fsgoHV_819.mp3',
 'M1ABpj_1054.mp3',
 'WOAVz2_272.mp3',
 'C29cGx_2460.mp3',
 'mNYd0f_238.mp3',
 'WrH0KN_9.mp3',
 'bIa51R_139.mp3',
 'cItLb0_323.mp3',
 'SY0Kwk_613.mp3',
 'stpBQc_454.mp3',
 'e2IRLs_114.mp3',
 '1FSZdj_395.mp3',
 'MAsLne_445.mp3',
 'UAm73q_215.mp3',
 'lq07by_391.mp3',
 'CLlhRa_440.mp3',
 'Pp9KnE_378.mp3',
 'iK848T_322.mp3',
 'FZPvZg_181.mp3',
 'ujEQyv_534.mp3',
 '1CFGdj_99.mp3',
 'jOInU2_123.mp3',
 'u3Wh1P_368.mp3',
 'z5g4Zf_1107.mp3',
 'juX2dP_86.mp3',
 'XaGvqJ_260.mp3',
 'u8jKiT_385.mp3',
 '7noixD_137.mp3',
 'anaiba_234.mp3',
 'Ukgmo5_525.mp3',
 'H7TC6A_24.mp3',
 'gQHPep_154.mp3',
 'Xg2lbQ_732.mp3',
 'c8pXh1_2710.mp3',
 'x0l7QW_489.mp3',
 'MSmK26_2437.mp3',
 'ew4i1y_537.mp3',
 'VsRnsQ_221.mp3',
 'svleDL_235.mp3',
 'J1HODx_374.mp3',
 'z5Qkc4_195

In [None]:
output_audio_dir = "processed_audio"
os.makedirs(output_audio_dir, exist_ok=True)

df.set_index('audio_file', inplace=True)
df['mfccs'] = None
df['cleaned_text'] = None
df['normalized_text'] = None
error_files = [] 
print(df.shape)
for audio in tqdm(df.index, desc="Processing Audio Files"):
    audio_path = 'Dataset/data/' + audio
    try:
        signal, rate = librosa.load(audio_path, sr=16000)
        length = len(signal) / rate
        df.at[audio, 'length'] = length
        
        if (length >=3  and length <= 10):
            cleaned_audio = nr.reduce_noise(signal, rate)
            resampled_audio = librosa.resample(cleaned_audio, orig_sr=rate, target_sr=16000)
            normalized_audio = librosa.util.normalize(resampled_audio)
            
        
            output_path = os.path.join(output_audio_dir, f"processed_{audio}")
            sf.write(output_path, normalized_audio, 16000)
            mfccs = librosa.feature.mfcc(y=normalized_audio, sr=16000, n_mfcc=13)
            df.at[audio, 'mfccs'] = mfccs.tolist()

            raw_text = df.at[audio, 'text']
            cleaned_text = transcription_preprocessing(raw_text)
            df.at[audio, 'cleaned_text'] = cleaned_text
        else:
            df.drop(audio, axis=0, inplace=True)
    except Exception as e:
        error_message = str(e)
        tqdm.write(f"Error processing {audio}: {error_message}")
        if os.path.exists(audio_path):
            os.remove(audio_path)
        df.drop(audio, axis=0, inplace=True)
        error_files.append((audio, error_message))

if error_files:
    print("\nFiles with errors:")
    for file, error in error_files:
        print(f"- {file}: {error}")
    print(f"Total files with errors: {len(error_files)}/{len(df.index)}")
    else:
    print("\nAll files processed successfully!")

print(f"\nTotal files after length filtering (5-10 seconds): {len(df)}")

In [None]:
# If prev cell stops or need to relief gpu/mps
import torch
output_audio_dir = "processed_audio"
# Ensure the DataFrame is indexed by 'audio_file'
df.set_index('audio_file', inplace=True)
df['mfccs'] = None
df['cleaned_text'] = None
df['normalized_text'] = None

error_files = [] 
print(df.shape)

# Set device to MPS if available, otherwise fallback to CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Process only unprocessed audios
for audio in tqdm(unprocessed_audios, desc="Processing Unprocessed Audio Files"):
    if audio not in df.index:
        tqdm.write(f"Skipping {audio}: not found in DataFrame")
        continue

    audio_path = os.path.join('Dataset/data', audio)
    try:
        signal, rate = librosa.load(audio_path, sr=16000)
        length = len(signal) / rate
        df.at[audio, 'length'] = length

        if 3 <= length <= 10:
            # Convert signal to a PyTorch tensor and move it to the MPS device
            signal_tensor = torch.tensor(signal, device=device)

            # Perform noise reduction (noisereduce does not support MPS, so keep it on CPU)
            signal_cpu = signal_tensor.cpu().numpy()
            cleaned_audio = nr.reduce_noise(signal_cpu, rate)

            # Resample and normalize using PyTorch
            cleaned_audio_tensor = torch.tensor(cleaned_audio, device=device)
            resampled_audio = librosa.resample(cleaned_audio_tensor.cpu().numpy(), orig_sr=rate, target_sr=16000)
            normalized_audio = librosa.util.normalize(resampled_audio)

            # Save processed audio
            output_path = os.path.join(output_audio_dir, f"processed_{audio}")
            sf.write(output_path, normalized_audio, 16000)

            # Compute MFCCs
            mfccs = librosa.feature.mfcc(y=normalized_audio, sr=16000, n_mfcc=13)
            df.at[audio, 'mfccs'] = mfccs.tolist()
    except Exception as e:
        error_message = str(e)
        tqdm.write(f"Error processing {audio}: {error_message}")
        if os.path.exists(audio_path):
            os.remove(audio_path)
        df.drop(audio, axis=0, inplace=True)
        error_files.append((audio, error_message))

# Report
if error_files:
    print("\nFiles with errors:")
    for file, error in error_files:
        print(f"- {file}: {error}")
    print(f"Total files with errors: {len(error_files)}/{len(df.index)}")
else:
    print("\nAll files processed successfully!")

print(f"\nTotal files after length filtering (3-10 seconds): {len(df)}")


In [122]:
new_df

Unnamed: 0_level_0,length,mfccs
audio_file,Unnamed: 1_level_1,Unnamed: 2_level_1
processed_33rH0T_53.mp3,8.119,
processed_X0SQy5_48.mp3,,
processed_V45UFZ_254.mp3,,
processed_itoiey_1049.mp3,,
processed_n019JV_781.mp3,,
...,...,...
processed_Sz0ANP_407.mp3,,
processed_osOZLr_1418.mp3,,
processed_dJUPwD_165.mp3,,
processed_9dpOzP_368.mp3,,


In [None]:

new_df = pd.DataFrame(processed, columns=['audio_file'])
new_df.set_index('audio_file', inplace=True)

new_df.drop(['length','mfccs'],inplace=True,axis=1)
new_df

processed_33rH0T_53.mp3
processed_X0SQy5_48.mp3
processed_V45UFZ_254.mp3
processed_itoiey_1049.mp3
processed_n019JV_781.mp3
...
processed_Sz0ANP_407.mp3
processed_osOZLr_1418.mp3
processed_dJUPwD_165.mp3
processed_9dpOzP_368.mp3
processed_0CdYXs_354.mp3


In [124]:
print(df.head())
print(new_df.head())

       audio_file                                              text gender
494  T0jbfN_1.mp3                 - اطلع بينا بسرعة!\n- حاضر، حاضر.   male
495  Vb9W33_2.mp3    - بعد إذنك، ممكن تربط الحزام؟\n- حاضر يا سيدي.   male
496  bMHHrj_3.mp3                     - أهو، اتفضل بسرعة.\n- شكرًا.   male
497  lacSQF_4.mp3  - حافظ دعاء الركوب؟\n- دعاء الركوب إيه يا حبيبي؟   male
498  AsW9Mm_5.mp3          احنا كنا بنسرق بنك، مش جايين من العُمرة!   male
Empty DataFrame
Columns: []
Index: [processed_33rH0T_53.mp3, processed_X0SQy5_48.mp3, processed_V45UFZ_254.mp3, processed_itoiey_1049.mp3, processed_n019JV_781.mp3]


In [125]:
processed=pd.merge(new_df,df,how='left',on='audio_file')
print(processed.shape)

(314069, 3)


In [127]:
processed

processed['original_audio_file'] = processed['audio_file'].str.replace('processed_', '')

processed['text'] = processed['original_audio_file'].map(df.set_index('audio_file')['text'])
processed

Unnamed: 0,audio_file,text,gender,original_audio_file
0,processed_33rH0T_53.mp3,الكومنتات واحده بتعيط عشان ايه حامل,,33rH0T_53.mp3
1,processed_X0SQy5_48.mp3,الشخص ده العقل المدبر بحاول جدا ثانكيو,,X0SQy5_48.mp3
2,processed_V45UFZ_254.mp3,where mostly the aim of\nthe first few minutes...,,V45UFZ_254.mp3
3,processed_itoiey_1049.mp3,‫ما دي قلة أدب لأنه دي\n‫السوشيال ميديا ما لها...,,itoiey_1049.mp3
4,processed_n019JV_781.mp3,كانت نفسي اروح بورسعيد الجباري تجيب حاجه,,n019JV_781.mp3
...,...,...,...,...
314064,processed_Sz0ANP_407.mp3,عنه هو بقى المشكله وقتها في هولندا هو,,Sz0ANP_407.mp3
314065,processed_osOZLr_1418.mp3,في ابن كثير وغيره يعني ماشي كتير من,,osOZLr_1418.mp3
314066,processed_dJUPwD_165.mp3,مثلا لو جبت له عربيه لعبه هيسيب كل حاجه,,dJUPwD_165.mp3
314067,processed_9dpOzP_368.mp3,ده ندم الحسره ان هو خلاص الطياره فاتته,,9dpOzP_368.mp3


In [136]:
processed.to_csv('processed.csv')

In [135]:

def compute_audio_features(audio_path):
    signal, rate = librosa.load(audio_path, sr=16000)
    length = len(signal) / rate

    return length

def compute_additional_features(audio_path, audio_filename, df):
    length, mfccs = compute_audio_features(audio_path)

    df.at[audio_filename, 'length'] = length


    df.at[audio_filename, 'cleaned_text'] = transcription_preprocessing(df.at[audio_filename, 'text'])


    return df



for audio_filename in new_df.index:
    audio_path = f"processed_audio/{audio_filename}"
    new_df = compute_additional_features(audio_path, audio_filename, new_df)


TypeError: cannot unpack non-iterable float object

In [101]:
# new_df.rename(columns={0: 'audio_file'}, inplace=True)
new_df

processed_33rH0T_53.mp3
processed_X0SQy5_48.mp3
processed_V45UFZ_254.mp3
processed_itoiey_1049.mp3
processed_n019JV_781.mp3
...
processed_Sz0ANP_407.mp3
processed_osOZLr_1418.mp3
processed_dJUPwD_165.mp3
processed_9dpOzP_368.mp3
processed_0CdYXs_354.mp3


#### Text Preprocessing

In [31]:
special_tokens = ['<PAD>', '<UNK>', '<SOS>', '<EOS>']  
english_characters = list(string.ascii_lowercase + ' ')  
arabic_characters = list("ابتثجحخدذرزسشصضطظعغفقكلمنهويئءىةؤ")  

characters = english_characters + arabic_characters
vocab = special_tokens + characters 

char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = {idx: char for idx, char in enumerate(vocab)}

vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

print(vocab)

Vocabulary size: 64
['<PAD>', '<UNK>', '<SOS>', '<EOS>', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', ' ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'ئ', 'ء', 'ى', 'ة', 'ؤ']


In [36]:
def tokenize_text(text, char2idx, max_len=100, start_token=True, end_token=True):
    tokens = [char2idx.get(char, char2idx['<UNK>']) for char in text]

    if start_token:
        tokens.insert(0, char2idx['<SOS>'])
    if end_token:
        tokens.append(char2idx['<EOS>'])

    if max_len is not None:
        tokens = tokens[:max_len]  
        tokens += [char2idx['<PAD>']] * (max_len - len(tokens))  

    return tokens

df['tokenized_text'] = df['cleaned_text'].apply(lambda x: tokenize_text(x, char2idx, max_len=120))

In [37]:
import tensorflow as tf
def TextDecoder(sentence):
    out = ''
    for token in sentence:
        if isinstance(token, tf.Tensor):
            token = token.numpy().item()
        char = idx2char[token]
        if char == '<EOS>':
            return out
        if not (char in special_tokens):
            out += char
    return out




In [2]:
!pip3 install hf_xet

Defaulting to user installation because normal site-packages is not writeable
Collecting hf_xet
  Downloading hf_xet-1.0.5-cp37-abi3-macosx_11_0_arm64.whl.metadata (494 bytes)
Downloading hf_xet-1.0.5-cp37-abi3-macosx_11_0_arm64.whl (4.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: hf_xet
Successfully installed hf_xet-1.0.5


In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device='mps',cache_folder='embedding_model_cache')  


In [4]:
df['text_embedding'] = None
for idx in tqdm(df.index, desc="Generating embeddings"):

    text = df.at[idx, 'cleaned_text']
    embedding = model.encode(text)
    df.at[idx, 'text_embedding'] = embedding.tolist()


print(f"Embedding shape: {np.array(df['text_embedding'].iloc[0]).shape}")


NameError: name 'df' is not defined

In [None]:
df.to_csv("Preprocessed_with_embeddings.csv")

### Loss