# Data Formatting – Rwanda (Manual Clipping with moviepy)

This notebook loads the **TEACH Final Scores** CSV for *Rwanda*, discovers classroom videos stored on SharePoint, manually splits each full-length video into first and last 15-minute clips using `moviepy`, and saves only these clips to Google Drive. It also attaches clip links to the dataset and exports a final formatted CSV.


### Workflow
1. Detect runtime, install missing packages.
2. Mount / locate Google Drive container.
3. Authenticate to SharePoint using browser cookies.
4. Discover every video in the SharePoint *Rwanda 2020* folder.
5. Load TEACH CSV dataset and prepare output columns.
6. Manually split each video into first and last 15-minute clips in parallel using `ThreadPoolExecutor`.
7. Attach clip links to the dataset and save final CSV.
8. Throughout step 6, log errors, retry failed videos (up to 3 attempts each), and skip videos shorter than 30 minutes.


## Dependencies & Environment Setup

Install required Python packages and configure the environment.

In [None]:
!pip install -q python-dotenv requests pandas moviepy
!pip install -q google-auth google-auth-oauthlib google-auth-httplib2

In [None]:
# Environment detection & dependency install
import importlib.util
import subprocess, sys, os
from pathlib import Path

def _detect_colab():
    """Return True when the ``google.colab`` module is importable."""
    try:
        return importlib.util.find_spec("google.colab") is not None
    except (ImportError, ValueError):
        # find_spec() imports the parent package of a dotted name, so it
        # raises ModuleNotFoundError (an ImportError) when no `google`
        # package is installed at all. That simply means "not Colab".
        return False


IN_COLAB = _detect_colab()

def _ensure(pkgs):
    missing = [p for p in pkgs if importlib.util.find_spec(p.replace('-', '_')) is None]
    if missing:
        print("Installing:", missing)
        subprocess.check_call([sys.executable, "-m", "pip", "install", *missing])

# Core packages are checked first; the Google auth stack is only checked
# when the notebook runs inside Colab.
_package_groups = [["python-dotenv", "requests", "pandas", "moviepy"]]
if IN_COLAB:
    _package_groups.append(["google-auth", "google-auth-oauthlib", "google-auth-httplib2"])
for _group in _package_groups:
    _ensure(_group)

## Paths & Google Drive Mount

Configure paths for raw data, outputs, and mount Google Drive if running in Colab.

In [None]:
# Choose the data root: the working directory locally, the mounted Drive
# folder when running inside Colab.
if not IN_COLAB:
    RAW_DIR = Path.cwd()
else:
    from google.colab import drive
    drive.mount('/content/drive')
    RAW_DIR = Path('/content/drive/My Drive/world bank/data/Rwanda')

# Input CSV plus the two output locations, all relative to the data root.
RAW_CSV = RAW_DIR / 'evals/Teach_Final_Scores_v1(ALL_Scores).csv'
OUTPUT_DIR = RAW_DIR / 'evals/formattedData'
VIDEO_OUTPUT_DIR = RAW_DIR / 'videos'

# Create both output folders up front so later cells can write into them.
for _out_dir in (OUTPUT_DIR, VIDEO_OUTPUT_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)
print(f"RAW_CSV: {RAW_CSV}\nOUTPUT_DIR: {OUTPUT_DIR}\nVIDEO_OUTPUT_DIR: {VIDEO_OUTPUT_DIR}")

## SharePoint Authentication

Authenticate to SharePoint using browser cookies stored in the `cookie` environment variable.

In [None]:
from dotenv import load_dotenv
import requests, os

load_dotenv()
cookie_string = os.getenv('cookie')
if not cookie_string:
    raise RuntimeError("Set 'cookie' environment variable with your SharePoint cookies.")

# Parse the browser "Cookie" header into a dict for requests.
# - Split on the FIRST '=' only: cookie values are often base64 and may
#   themselves contain '=' padding, which must not be cut off.
# - Strip each pair: browsers separate cookies with "; ", so without the
#   strip every key after the first would carry a leading space.
cookies = {}
for _pair in cookie_string.split(';'):
    _key, _sep, _value = _pair.strip().partition('=')
    if _sep and _key:
        cookies[_key] = _value

SP_BASE_URL = 'https://worldbankgroup.sharepoint.com/teams/TeachDashboardVideoLibrary-WBGroup'
SP_FOLDER_PATH = '/teams/TeachDashboardVideoLibrary-WBGroup/Shared Documents/General/Rwanda 2020'
SP_HEADERS = {'Accept': 'application/json;odata=verbose','User-Agent':'Mozilla/5.0','Referer':'https://worldbankgroup.sharepoint.com/'}

# Smoke-test the session; a timeout keeps the cell from hanging forever.
r = requests.get(f"{SP_BASE_URL}/_api/web", cookies=cookies, headers=SP_HEADERS, timeout=30)
print(f"SharePoint connection status: {r.status_code}")

## SharePoint Video Discovery

Discover all video files in the specified SharePoint folder.

In [None]:
import time
from pathlib import Path

# Accepted video suffixes, each in its lower-case and upper-case spelling.
VIDEO_EXTS = {
    variant
    for suffix in ('.mp4', '.mov', '.avi', '.mts')
    for variant in (suffix, suffix.upper())
}

def get_file_metadata(server_relative_url):
    """Fetch creation time and size for one SharePoint file.

    Args:
        server_relative_url: Server-relative path of the file.

    Returns:
        ``{'TimeCreated': ..., 'Length': ..., 'success': True}`` on HTTP 200,
        otherwise ``{'success': False}``.
    """
    # The path is embedded in an OData string literal, where an apostrophe
    # must be escaped by doubling it; otherwise names like "Teacher's.mp4"
    # terminate the literal early and the request fails.
    escaped = server_relative_url.replace("'", "''")
    url = f"{SP_BASE_URL}/_api/web/GetFileByServerRelativeUrl('{escaped}')"
    r = requests.get(url, cookies=cookies, headers=SP_HEADERS, timeout=30)
    if r.status_code != 200:
        return {'success': False}
    data = r.json()['d']
    return {'TimeCreated': data.get('TimeCreated'), 'Length': data.get('Length', 0), 'success': True}

def get_folder_contents(folder_path):
    """List the files directly inside a SharePoint folder.

    Args:
        folder_path: Server-relative path of the folder.

    Returns:
        The list of file records from the REST response, or ``[]`` when the
        request does not return HTTP 200 (a warning is printed in that case
        so an expired cookie is not mistaken for an empty folder).
    """
    # Apostrophes must be doubled inside the OData string literal.
    escaped = folder_path.replace("'", "''")
    url = f"{SP_BASE_URL}/_api/web/GetFolderByServerRelativeUrl('{escaped}')/Files"
    r = requests.get(url, cookies=cookies, headers=SP_HEADERS, timeout=30)
    if r.status_code != 200:
        print(f"Warning: could not list '{folder_path}' (HTTP {r.status_code}); treating it as empty.")
        return []
    return r.json().get('d', {}).get('results', [])

def discover_rwanda_videos():
    """Collect every video file found in ``SP_FOLDER_PATH``.

    Returns:
        A list of dicts with keys ``'name'`` (file name), ``'url'``
        (server-relative URL) and ``'metadata'`` (the result of
        ``get_file_metadata`` for that file).
    """
    vids = []
    print("Scanning SharePoint folder for videos...")
    for f in get_folder_contents(SP_FOLDER_PATH):
        ext = Path(f['Name']).suffix
        # Also compare the lower-cased suffix so mixed-case names such as
        # "lesson.Mp4" are not silently dropped.
        if ext in VIDEO_EXTS or ext.lower() in VIDEO_EXTS:
            vids.append({
                'name': f['Name'],
                'url': f['ServerRelativeUrl'],
                'metadata': get_file_metadata(f['ServerRelativeUrl']),
            })
    print(f"Found {len(vids)} videos.")
    return vids

# Run discovery once when this cell executes; the clipping cell later
# iterates over this list.
discovered_videos = discover_rwanda_videos()

## Load TEACH Dataset & Prepare Clip Columns

Load the CSV and ensure `First Video Clip` and `Last Video Clip` columns exist.

In [None]:
import pandas as pd

def load_dataset(path):
    """Load the TEACH scores CSV, merging its two header rows into one.

    Column names come from the first header row for the first three columns
    and from the second header row for the rest. Empty names become
    ``'Unnamed'`` and repeated names get a numeric suffix (``X``, ``X_1``,
    ``X_2``, ...).

    Args:
        path: Location of the CSV file (latin-1 encoded).

    Returns:
        A ``pandas.DataFrame`` with the merged column names.

    Raises:
        ValueError: If the file has fewer than two rows.
    """
    import csv

    # Parse only the two header rows, and parse them as CSV: a plain
    # split(',') breaks on quoted header cells that contain a comma.
    with open(path, encoding='latin-1', newline='') as fh:
        reader = csv.reader(fh)
        try:
            top = [cell.strip() for cell in next(reader)]
            sub = [cell.strip() for cell in next(reader)]
        except StopIteration:
            raise ValueError(f"{path} must contain at least two header rows") from None

    base = top[:3] + sub[3:]
    cols, seen = [], {}
    for c in base:
        c = c or 'Unnamed'
        count = seen.get(c, 0)
        cols.append(c if count == 0 else f"{c}_{count}")
        seen[c] = count + 1

    # NOTE(review): skiprows=[0, 2] drops the first and THIRD lines, so the
    # second header line is read as a data row. Kept as-is because the file
    # layout is not visible here - confirm whether [0, 1] was intended.
    return pd.read_csv(path, header=None, skiprows=[0, 2], names=cols, encoding='latin-1')

print(f"Loading dataset from {RAW_CSV}")
df = load_dataset(RAW_CSV)
# Add an empty placeholder for whichever clip column the CSV lacks.
for col in ('First Video Clip', 'Last Video Clip'):
    if col in df.columns:
        continue
    df[col] = ''
print('Dataset prepared.')

## Manual Clipping Function with Retry and Parallel Execution

Define a function to download, split, and save clips, making up to 3 attempts per video and skipping videos shorter than 30 minutes.

In [None]:
import re, tempfile, os
from moviepy.editor import VideoFileClip
from concurrent.futures import ThreadPoolExecutor, as_completed

# First run of 6-7 consecutive digits, used as the clip identifier.
# In a raw string a single backslash is needed: r"\\d" would match a literal
# backslash followed by the letter "d", so no file name would ever match.
id_re = re.compile(r"(\d{6,7})")
# (video name, message) pairs recorded by process_video for failures.
errors = []
# Names of videos skipped for being shorter than 30 minutes.
skipped = []

def download_video(srv_rel_url, target):
    """Stream one SharePoint file to a local path.

    Args:
        srv_rel_url: Server-relative URL of the file on SharePoint.
        target: Local file path to write to (overwritten if it exists).

    Raises:
        requests.HTTPError: If SharePoint answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    from urllib.parse import quote

    # Escape characters such as '#' and '?' that would otherwise be read as
    # URL fragment/query delimiters. '%' is left alone so a path that is
    # already percent-encoded is not encoded twice.
    path = quote(srv_rel_url, safe="/%")
    url = f"https://worldbankgroup.sharepoint.com{path}?download=1"
    # (connect, read) timeouts: without them a stalled connection would block
    # a worker thread indefinitely.
    with requests.get(url, cookies=cookies, headers=SP_HEADERS, stream=True, timeout=(10, 120)) as r:
        r.raise_for_status()
        with open(target, 'wb') as f:
            # 1 MiB chunks keep per-chunk overhead low for large videos.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)

def process_video(v):
    """Download one video and cut its first and last 15 minutes.

    Each video gets up to three attempts. Every failed attempt is appended
    to ``errors`` as ``(name, "<attempt>: <message>")``.

    Args:
        v: Dict with ``'name'`` (file name) and ``'url'`` (server-relative
            URL) of the video.

    Returns:
        ``(ident, first_clip_path, last_clip_path)`` on success;
        ``(ident, None, None)`` when the video is shorter than 30 minutes
        (its name is appended to ``skipped``);
        ``None`` when the name holds no ID or all three attempts failed.
    """
    name, url = v['name'], v['url']
    m = id_re.search(name)
    if not m:
        errors.append((name, 'No ID'))
        return None
    ident = m.group(1)
    suffix = os.path.splitext(name)[1]
    out1 = VIDEO_OUTPUT_DIR / f"{ident} Clip 1.mp4"
    out2 = VIDEO_OUTPUT_DIR / f"{ident} Clip 2.mp4"

    for attempt in range(1, 4):
        tmp = None
        opened = []  # clips to close, in the order they were created
        try:
            try:
                # Only the path is needed; close the handle right away so it
                # is not leaked and the file can be reopened for writing.
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
                    tmp = handle.name
                download_video(url, tmp)
                clip = VideoFileClip(tmp)
                opened.append(clip)
                dur = clip.duration
                if dur < 1800:
                    skipped.append(name)
                    return (ident, None, None)
                c1 = clip.subclip(0, 900)
                opened.append(c1)
                c2 = clip.subclip(dur - 900, dur)
                opened.append(c2)
                c1.write_videofile(str(out1), codec='libx264', audio_codec='aac', verbose=False, logger=None)
                c2.write_videofile(str(out2), codec='libx264', audio_codec='aac', verbose=False, logger=None)
                return (ident, str(out1), str(out2))
            finally:
                # Runs on success, skip and failure alike, so a failed
                # attempt no longer leaves readers open or a full-length
                # download behind in the temp directory.
                for c in opened:
                    c.close()
                if tmp is not None and os.path.exists(tmp):
                    os.remove(tmp)
        except Exception as e:
            # Deliberately broad: any failure is logged and retried.
            errors.append((name, f"{attempt}: {e}"))
    return None

# Maps clip ID -> {'first': path or None, 'last': path or None}.
# Skipped (too short) videos are stored with both values set to None.
results = {}
with ThreadPoolExecutor(max_workers=os.cpu_count()) as ex:
    futures = {ex.submit(process_video, v): v for v in discovered_videos}
    for f in as_completed(futures):
        r = f.result()
        if r:
            i, o1, o2 = r
            results[i] = {'first': o1, 'last': o2}

# Count only entries that actually produced clips; skipped videos are also
# stored in `results` and would otherwise be reported twice.
clipped = sum(1 for entry in results.values() if entry['first'])
print(f"Done. Clips: {clipped}, Skipped: {len(skipped)}, Errors: {len(errors)}")

## Attach Clip Links & Save Final CSV

Populate the DataFrame with clip paths and export the final CSV.

In [None]:
# Output column paired with the key it is filled from in `results`.
_CLIP_COLUMNS = (('First Video Clip', 'first'), ('Last Video Clip', 'last'))

for idx, row in df.iterrows():
    found = id_re.search(str(row.get('School_Clip', '')))
    if found is None:
        continue
    clip_paths = results.get(found.group(1))
    if not clip_paths:
        continue
    # Leave the cell untouched when no clip exists (e.g. skipped videos).
    for column, key in _CLIP_COLUMNS:
        if clip_paths[key]:
            df.at[idx, column] = clip_paths[key]

out_csv = OUTPUT_DIR / 'rwanda_manual_clips.csv'
df.to_csv(out_csv, index=False)
print(f"Saved CSV at {out_csv}")