# Data Formatting
This notebook loads the raw TEACH evaluation CSV files and standardizes them according to the column requirements described in the Montesa research plan.

In [9]:
import pandas as pd
from pathlib import Path

In [10]:
# Use paths relative to this notebook's directory
raw_dir = Path('.')
output_dir = Path('..') / 'formattedData'
output_dir.mkdir(exist_ok=True)

def load_dataset(path):
    with path.open('r', encoding='latin-1') as f:
        lines = f.readlines()
    header1 = [h.strip() for h in lines[0].split(',')]
    header2 = [h.strip() for h in lines[1].split(',')]
    base_cols = header1[:3] + header2[3:]
    cols = []
    counts = {}
    for col in base_cols:
        col = col or 'Unnamed'
        if col in counts:
            counts[col] += 1
            cols.append(f'{col}_{counts[col]}')
        else:
            counts[col] = 0
            cols.append(col)
    df = pd.read_csv(path, header=None, skiprows=[0,2], names=cols, encoding='latin-1')
    return df


In [11]:
peru = load_dataset(raw_dir / 'Peru' / 'TEACH_Final_Scores_4 - Peru.csv')
peru['Country of Origin'] = 'Peru'

rwanda = load_dataset(raw_dir / 'Rwanda' / 'Teach_Final_Scores_v1(ALL_Scores).csv')
rwanda['Country of Origin'] = 'Rwanda'

df = pd.concat([peru, rwanda], ignore_index=True)

for col in ['Identifier', 'Audio File 1', 'Audio File 2', 'Transcription 1', 'Transcription 2', 'Language', 'Context']:
    if col not in df.columns:
        df[col] = ''
df['Identifier'] = df.index.map(lambda i: f'observation_{i:05d}')

df.to_csv(output_dir / 'montesa_formatted.csv', index=False)
df.head()


ValueError: Duplicate names are not allowed.