## Remove Bismillah (drop the first aayah of every surah)

In [50]:
import pandas as pd

In [51]:
# The source file is pipe-separated and has no header row, so the column
# names are supplied explicitly.
column_names = ['surah_number', 'aayah_number', 'transcription']
df = pd.read_csv(
    'Arabic-Original.csv',
    sep='|',
    header=None,
    names=column_names,
)

df.head(10)

Unnamed: 0,surah_number,aayah_number,transcription
0,1,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ
1,1,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ
2,1,3,الرَّحْمَٰنِ الرَّحِيمِ
3,1,4,مَالِكِ يَوْمِ الدِّينِ
4,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ
5,1,6,اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ
6,1,7,صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ...
7,2,1,بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ الم
8,2,2,ذَٰلِكَ الْكِتَابُ لَا رَيْبَ ۛ فِيهِ ۛ هُدًى ...
9,2,3,الَّذِينَ يُؤْمِنُونَ بِالْغَيْبِ وَيُقِيمُونَ...


In [52]:
# Table size before any rows are dropped (the recorded run shows (6236, 3)).
df.shape

(6236, 3)

In [53]:
# Keep only rows whose aayah_number is not 1. In the loaded file the Bismillah
# is prepended to the first aayah's own text (see row 2:1 in the preview), so
# this removes the whole first aayah of each surah, not just the Bismillah.
df = df[df['aayah_number'] != 1]

In [54]:
# Table size after the filter (the recorded run shows (6122, 3), i.e. 114 rows dropped).
df.shape

(6122, 3)

In [55]:
# Persist the filtered table; index=False keeps the pandas row index out of
# the file so it reloads with the same three columns.
output_csv = 'Arabic-Original-no-bismillah.csv'
df.to_csv(output_csv, index=False)

## Create transcription dataset file

In [56]:
import pandas as pd
import os
import glob
import librosa

In [57]:
# Reload the filtered transcription table (this file has a header row, so the
# column names come from the file itself).
transcription_csv = 'Arabic-Original-no-bismillah.csv'
df = pd.read_csv(transcription_csv)
df.head(10)

Unnamed: 0,surah_number,aayah_number,transcription
0,1,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ
1,1,3,الرَّحْمَٰنِ الرَّحِيمِ
2,1,4,مَالِكِ يَوْمِ الدِّينِ
3,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ
4,1,6,اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ
5,1,7,صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ...
6,2,2,ذَٰلِكَ الْكِتَابُ لَا رَيْبَ ۛ فِيهِ ۛ هُدًى ...
7,2,3,الَّذِينَ يُؤْمِنُونَ بِالْغَيْبِ وَيُقِيمُونَ...
8,2,4,وَالَّذِينَ يُؤْمِنُونَ بِمَا أُنْزِلَ إِلَيْك...
9,2,5,أُولَٰئِكَ عَلَىٰ هُدًى مِنْ رَبِّهِمْ ۖ وَأُو...


In [58]:
# Sanity check on the reloaded table (the recorded run shows (6122, 3)).
df.shape

(6122, 3)

In [63]:
# Root folder that holds one sub-directory of .mp3 files per imam.
# Set the FURQAN_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
base_dir = os.environ.get(
    'FURQAN_DATA_DIR',
    'F:\\Productivity\\Work\\Furqan\\data\\bismillah',
)

In [None]:
# Every sub-directory of base_dir is treated as one imam's recordings.
imam_items = os.listdir(base_dir)
# os.listdir returns entries in arbitrary order; sorting makes the order in
# which imams are processed (and therefore the output row order) reproducible.
imam_dirs = sorted(
    item for item in imam_items
    if os.path.isdir(os.path.join(base_dir, item))
)
imam_dirs

In [84]:
def get_aayah_df(imam_dir, root_dir=None):
    """Index the ``.mp3`` files of one imam directory by surah and aayah number.

    File names are expected to look like ``<surah>_<aayah>.mp3``. The numbers
    are taken from the file name itself (not from a fixed position in the
    path), so the result does not depend on how deep ``root_dir`` is or on the
    platform's path separator.

    Args:
        imam_dir: Name of the sub-directory to scan.
        root_dir: Directory that contains ``imam_dir``. Defaults to the
            notebook-level ``base_dir``.

    Returns:
        DataFrame with integer ``surah_number`` and ``aayah_number`` columns
        and a ``file_path`` column, one row per ``.mp3`` file, ordered by
        path. An empty directory yields an empty frame with these columns.

    Raises:
        ValueError: If a file name does not start with two ``_``-separated
            integers.
    """
    if root_dir is None:
        root_dir = base_dir

    # glob order is filesystem-dependent; sort for a reproducible result.
    aayah_files = sorted(glob.glob(os.path.join(root_dir, imam_dir, '*.mp3')))

    records = []
    for file_path in aayah_files:
        # splitext strips the extension literally (no regex, any letter case).
        stem = os.path.splitext(os.path.basename(file_path))[0]
        parts = stem.split('_')
        if len(parts) < 2:
            raise ValueError(
                f"Unexpected audio file name (want <surah>_<aayah>.mp3): {file_path}"
            )
        records.append({
            'surah_number': int(parts[0]),
            'aayah_number': int(parts[1]),
            'file_path': file_path,
        })

    columns = ['surah_number', 'aayah_number', 'file_path']
    # Explicit columns + astype keep the schema stable even when no file matched.
    return pd.DataFrame(records, columns=columns).astype(
        {'surah_number': int, 'aayah_number': int}
    )

def get_voice_length(file_path):
    """Return the duration of an audio file in seconds.

    Args:
        file_path: Path of an audio file that ``librosa.load`` can decode.

    Returns:
        Duration in seconds, as computed by ``librosa.get_duration`` from the
        decoded samples and their sampling rate.
    """
    # sr=None keeps the file's native sampling rate. The default would first
    # resample the whole signal to 22050 Hz, which is costly and unnecessary
    # when only the duration is needed.
    samples, sample_rate = librosa.load(file_path, sr=None)
    return librosa.get_duration(y=samples, sr=sample_rate)

In [91]:
# Pair every transcription row with the matching audio file of each imam
# (inner merge on surah/aayah number). The per-imam frames are collected in a
# list and concatenated once; concatenating inside the loop would re-copy the
# accumulated frame on every iteration.
imam_frames = [
    pd.merge(df, get_aayah_df(imam_dir), on=['surah_number', 'aayah_number'])
    for imam_dir in imam_dirs
]

# Stays None when there are no imam directories, as before.
combined_df = pd.concat(imam_frames, ignore_index=True) if imam_frames else None

In [92]:
# Preview the merged table: transcription columns plus the matching file_path.
combined_df.head(10)

Unnamed: 0,surah_number,aayah_number,transcription,file_path
0,1,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,F:\Productivity\Work\Furqan\data\bismillah\Abd...
1,1,3,الرَّحْمَٰنِ الرَّحِيمِ,F:\Productivity\Work\Furqan\data\bismillah\Abd...
2,1,4,مَالِكِ يَوْمِ الدِّينِ,F:\Productivity\Work\Furqan\data\bismillah\Abd...
3,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,F:\Productivity\Work\Furqan\data\bismillah\Abd...
4,1,6,اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ,F:\Productivity\Work\Furqan\data\bismillah\Abd...
5,1,7,صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ...,F:\Productivity\Work\Furqan\data\bismillah\Abd...
6,2,2,ذَٰلِكَ الْكِتَابُ لَا رَيْبَ ۛ فِيهِ ۛ هُدًى ...,F:\Productivity\Work\Furqan\data\bismillah\Abd...
7,2,3,الَّذِينَ يُؤْمِنُونَ بِالْغَيْبِ وَيُقِيمُونَ...,F:\Productivity\Work\Furqan\data\bismillah\Abd...
8,2,4,وَالَّذِينَ يُؤْمِنُونَ بِمَا أُنْزِلَ إِلَيْك...,F:\Productivity\Work\Furqan\data\bismillah\Abd...
9,2,5,أُولَٰئِكَ عَلَىٰ هُدًى مِنْ رَبِّهِمْ ۖ وَأُو...,F:\Productivity\Work\Furqan\data\bismillah\Abd...


In [96]:
# Number of transcription/audio pairs across all imam directories
# (the recorded run shows (147804, 4)).
combined_df.shape

(147804, 4)

In [None]:
# NOTE(review): `final_df` is not referenced by any other cell visible here;
# it looks like a leftover — confirm before removing.
final_df = None

In [98]:
# Placeholder for the per-file duration in seconds. NaN (rather than None)
# keeps the column float-typed, so rows that are never filled in are still
# recognised as missing by numeric operations and are written to CSV as empty.
combined_df['voice_length'] = float('nan')

In [None]:
# Measure every audio file. A file that cannot be read is reported and left
# unfilled so that a single bad file does not abort the whole pass.
# Iterating the column's (label, value) pairs keeps the lookup and the `.at`
# write on the same index label and avoids building a row object per file.
for row_label, file_path in combined_df['file_path'].items():
    try:
        combined_df.at[row_label, 'voice_length'] = get_voice_length(file_path)
    except Exception as e:
        # Best effort: keep going, but report why the file failed.
        print(f"Error in file: {file_path} ({e})")

In [None]:
# 'audio_length' is the same measurement as 'voice_length' (both come from
# get_voice_length on file_path). When 'voice_length' is already populated for
# every row, reuse it instead of decoding every audio file a second time;
# otherwise measure from the files as before.
if 'voice_length' in combined_df.columns and combined_df['voice_length'].notna().all():
    combined_df['audio_length'] = pd.to_numeric(combined_df['voice_length'])
else:
    combined_df['audio_length'] = combined_df['file_path'].apply(get_voice_length)

In [None]:
# Preview the table now that the duration columns have been added.
combined_df.head(10)

In [None]:
# Write the final transcription/audio table; index=False keeps the pandas row
# index out of the file.
audio_length_csv = 'Arabic-Original-no-bismillah-audio-length.csv'
combined_df.to_csv(audio_length_csv, index=False)