# Data Formatting
This notebook loads the raw TEACH evaluation CSV files and standardizes them according to the column requirements described in the Montesa research plan.

In [15]:
import pandas as pd
from pathlib import Path

In [16]:
# Use paths relative to this notebook's directory
raw_dir = Path('.')
output_dir = Path('..') / 'formattedData'
output_dir.mkdir(exist_ok=True)

def load_dataset(path):
    with path.open('r', encoding='latin-1') as f:
        lines = f.readlines()
    header1 = [h.strip() for h in lines[0].split(',')]
    header2 = [h.strip() for h in lines[1].split(',')]
    base_cols = header1[:3] + header2[3:]
    cols = []
    counts = {}
    for col in base_cols:
        col = col or 'Unnamed'
        if col in counts:
            counts[col] += 1
            cols.append(f'{col}_{counts[col]}')
        else:
            counts[col] = 0
            cols.append(col)
    df = pd.read_csv(path, header=None, skiprows=[0,2], names=cols, encoding='latin-1')
    return df


In [17]:
peru = load_dataset(raw_dir / 'Peru' / 'TEACH_Final_Scores_4 - Peru.csv')
peru['Source Table'] = 'Peru'

rwanda = load_dataset(raw_dir / 'Rwanda' / 'Teach_Final_Scores_v1(ALL_Scores).csv')
rwanda['Source Table'] = 'Rwanda'

df = pd.concat([peru, rwanda], ignore_index=True)

for col in ['Identifier', 'Audio File 1', 'Audio File 2', 'Transcription 1', 'Transcription 2', 'Language', 'Context']:
    if col not in df.columns:
        df[col] = ''
df['Identifier'] = df.index.map(lambda i: f'observation_{i:05d}')

df.to_csv(output_dir / 'montesa_formatted.csv', index=False)
df.head()


Unnamed: 0,Route,School_Clip,Person to Score,Teachear provides learning activity to most students - 1st Snapshot,Students are on task - 1st snapshot,Teachear provides learning activity to most students - 2nd Snapshot,Students are on task - 2nd snapshot,Teachear provides learning activity to most students - 3rd Snapshot,Students are on task - 3rd snapshot,1. Supportive Learning Environment,...,9.3 Students collaborate with one another.,Comments,Unnamed_3,Identifier,Audio File 1,Audio File 2,Transcription 1,Transcription 2,Language,Context
0,,,,Teachear provides learning activity to most st...,Students are on task - 1st snapshot,Teachear provides learning activity to most st...,Students are on task - 2nd snapshot,Teachear provides learning activity to most st...,Students are on task - 3rd snapshot,1. Supportive Learning Environment,...,,,,observation_00000,,,,,,
1,1.0,256305 Clip 1,Carolina,Y,H,Y,H,Y,H,4,...,,,,observation_00001,,,,,,
2,1.0,256305 Clip 2,Carolina,Y,H,N,,Y,M,4,...,,,,observation_00002,,,,,,
3,1.0,1104488 Clip 1,Carolina,Y,H,Y,H,Y,H,4,...,,,,observation_00003,,,,,,
4,1.0,1104488 Clip 2,Carolina,Y,L,N,,Y,M,4,...,,,,observation_00004,,,,,,
