In [3]:
pip install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[K     |████████████████████████████████| 13.1 MB 15.5 MB/s eta 0:00:01
[?25hCollecting numpy>=1.22.4
  Downloading numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[K     |████████████████████████████████| 18.2 MB 116.5 MB/s eta 0:00:01
[?25hCollecting pytz>=2020.1
  Downloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
[K     |████████████████████████████████| 505 kB 102.8 MB/s eta 0:00:01
[?25hCollecting tzdata>=2022.7
  Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[K     |████████████████████████████████| 345 kB 106.9 MB/s eta 0:00:01
Installing collected packages: tzdata, pytz, numpy, pandas
Successfully installed numpy-1.26.4 pandas-2.2.2 pytz-2024.1 tzdata-2024.1
Note: you may need to restart the kernel to use updated packages.


In [9]:
import os
import csv
import pandas as pd

# Directory paths (relative to the notebook's working directory)
small_talk_wav_dir = 'dataset_amazigh/wav/conv_wav'
annotations_dir = 'dataset_amazigh/annotations'

# Column names shared by the CSV header and the DataFrame, kept in one place
# so the two outputs cannot drift apart.
annotation_columns = ["filename", "translation_arabic", "translation_english"]

# Create the annotations directory if it doesn't exist
os.makedirs(annotations_dir, exist_ok=True)

# Translations for general small talk phrases.
# Each entry is (base wav filename, Arabic text, English text).
# Bracketed words such as [Name] / [Country] are stored verbatim.
small_talk_translations = [
    ("conv_1.wav", "مرحبًا! كيف حالك؟", "Hello! How are you?"),
    ("conv_2.wav", "أنا بخير، شكرًا. وأنت؟", "I'm fine, thank you. And you?"),
    ("conv_3.wav", "ما اسمك؟", "What's your name?"),
    ("conv_4.wav", "اسمي [اسم]. سعيد بلقائك.", "My name is [Name]. Nice to meet you."),
    ("conv_5.wav", "من أين أنت؟", "Where are you from?"),
    ("conv_6.wav", "أنا من [البلد].", "I'm from [Country]."),
    ("conv_7.wav", "ماذا تعمل؟", "What do you do for a living?"),
    ("conv_8.wav", "أعمل كـ [مهنة].", "I work as a [Profession]."),
    ("conv_9.wav", "هل تحب العيش هنا؟", "Do you like living here?"),
    ("conv_10.wav", "نعم، إنه مكان جميل.", "Yes, it's a beautiful place."),
    ("conv_11.wav", "ما هي هواياتك؟", "What are your hobbies?"),
    ("conv_12.wav", "أستمتع بالقراءة والسفر.", "I enjoy reading and traveling."),
    ("conv_13.wav", "ما هو طعامك المفضل؟", "What's your favorite food?"),
    ("conv_14.wav", "أحب [الطعام].", "I love [Food]."),
    ("conv_15.wav", "هل لديك حيوانات أليفة؟", "Do you have any pets?"),
    ("conv_16.wav", "نعم، لدي كلب.", "Yes, I have a dog."),
    ("conv_17.wav", "كيف كان يومك؟", "How was your day?"),
    ("conv_18.wav", "كان جيدًا، شكرًا.", "It was good, thank you."),
    ("conv_19.wav", "ما هي خططك لعطلة نهاية الأسبوع؟", "What are your plans for the weekend?"),
    ("conv_20.wav", "أخطط لزيارة عائلتي.", "I'm planning to visit my family."),
    ("conv_21.wav", "هل زرت هذا المكان من قبل؟", "Have you visited this place before?"),
    ("conv_22.wav", "نعم، زرته العام الماضي.", "Yes, I visited it last year."),
    ("conv_23.wav", "ما هو فيلمك المفضل؟", "What's your favorite movie?"),
    ("conv_24.wav", "أحب فيلم [الفيلم].", "I love the movie [Movie]."),
    ("conv_25.wav", "ما هو كتابك المفضل؟", "What's your favorite book?"),
    ("conv_26.wav", "أحب كتاب [الكتاب].", "I love the book [Book]."),
    ("conv_27.wav", "كيف هو الطقس اليوم؟", "How is the weather today?"),
    ("conv_28.wav", "الطقس جميل ومشمس.", "The weather is beautiful and sunny."),
    ("conv_29.wav", "هل تحب الرياضة؟", "Do you like sports?"),
    ("conv_30.wav", "نعم، أحب كرة القدم.", "Yes, I love football."),
    ("conv_31.wav", "ما هي مدينتك المفضلة؟", "What's your favorite city?"),
    ("conv_32.wav", "أحب مدينة [المدينة].", "I love the city [City]."),
    ("conv_33.wav", "هل تحب السفر؟", "Do you like traveling?"),
    ("conv_34.wav", "نعم، أحب استكشاف أماكن جديدة.", "Yes, I love exploring new places."),
    ("conv_35.wav", "ما هو لونك المفضل؟", "What's your favorite color?"),
    ("conv_36.wav", "أحب اللون الأزرق.", "I love the color blue."),
    ("conv_37.wav", "هل تعرف أي مطاعم جيدة هنا؟", "Do you know any good restaurants around here?"),
    ("conv_38.wav", "نعم، يوجد مطعم رائع في هذا الشارع.", "Yes, there's a great restaurant down this street."),
    ("conv_39.wav", "هل لديك إخوة أو أخوات؟", "Do you have any siblings?"),
    ("conv_40.wav", "نعم، لدي أخت وأخ.", "Yes, I have a sister and a brother."),
    ("conv_41.wav", "ما هو نوع الموسيقى الذي تفضله؟", "What kind of music do you like?"),
    ("conv_42.wav", "أحب الموسيقى الكلاسيكية.", "I love classical music."),
]

# Filename prefixes; every phrase is expected once per prefix
# (presumably one recording set or speaker each -- TODO confirm).
sets = ["S1", "S2", "S3"]


def expand_across_sets(translations, set_prefixes):
    """Return one entry per (prefix, translation) pair.

    Args:
        translations: iterable of (base_filename, arabic, english) 3-tuples.
        set_prefixes: iterable of string prefixes.

    Returns:
        A list of (f"{prefix}_{base_filename}", arabic, english) tuples,
        ordered by prefix first and then by the order of ``translations``.
    """
    return [
        (f"{prefix}_{base_name}", arabic, english)
        for prefix in set_prefixes
        for base_name, arabic, english in translations
    ]


def partition_by_file_presence(entries, wav_dir):
    """Split entries by whether their wav file exists inside ``wav_dir``.

    Args:
        entries: iterable of tuples whose first element is a filename.
        wav_dir: directory the filenames are resolved against.

    Returns:
        ``(present, missing_paths)``: the entries whose file exists (input
        order preserved) and the full paths of the files that do not.
    """
    present, missing_paths = [], []
    for entry in entries:
        path = os.path.join(wav_dir, entry[0])
        # isfile rather than exists: a directory that happens to carry the
        # expected name is not a usable audio file.
        if os.path.isfile(path):
            present.append(entry)
        else:
            missing_paths.append(path)
    return present, missing_paths


# Adding S1, S2, and S3 variations
all_small_talk_translations = expand_across_sets(small_talk_translations, sets)

# Filter out files that do not exist, reporting each missing path
valid_small_talk_entries, missing_wav_paths = partition_by_file_presence(
    all_small_talk_translations, small_talk_wav_dir
)
for missing_path in missing_wav_paths:
    print(f"File not found: {missing_path}")

# Write the valid entries to the CSV file.
# newline='' lets the csv module control line endings itself.
csv_path = os.path.join(annotations_dir, 'small_talk.csv')
with open(csv_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(annotation_columns)
    writer.writerows(valid_small_talk_entries)

# Output the table as a pandas DataFrame
df = pd.DataFrame(valid_small_talk_entries, columns=annotation_columns)
print(df)

print(f"\nCSV file created at: {csv_path}")

File not found: dataset_amazigh/wav/conv_wav/S1_conv_12.wav
File not found: dataset_amazigh/wav/conv_wav/S1_conv_13.wav
File not found: dataset_amazigh/wav/conv_wav/S1_conv_14.wav
File not found: dataset_amazigh/wav/conv_wav/S1_conv_27.wav
File not found: dataset_amazigh/wav/conv_wav/S1_conv_34.wav
File not found: dataset_amazigh/wav/conv_wav/S1_conv_36.wav
File not found: dataset_amazigh/wav/conv_wav/S1_conv_42.wav
File not found: dataset_amazigh/wav/conv_wav/S2_conv_11.wav
File not found: dataset_amazigh/wav/conv_wav/S2_conv_12.wav
File not found: dataset_amazigh/wav/conv_wav/S2_conv_14.wav
File not found: dataset_amazigh/wav/conv_wav/S2_conv_15.wav
File not found: dataset_amazigh/wav/conv_wav/S2_conv_18.wav
File not found: dataset_amazigh/wav/conv_wav/S2_conv_19.wav
File not found: dataset_amazigh/wav/conv_wav/S2_conv_20.wav
File not found: dataset_amazigh/wav/conv_wav/S2_conv_21.wav
File not found: dataset_amazigh/wav/conv_wav/S2_conv_22.wav
File not found: dataset_amazigh/wav/conv