# MTA Direction Auto-Labeler for NER Training

This notebook auto-labels directions in MTA bus and subway alerts using rule-based pattern matching.

## Label Types:
- **COMPASS**: NORTHBOUND, SOUTHBOUND, EASTBOUND, WESTBOUND
- **BOROUGH**: MANHATTAN_BOUND, QUEENS_BOUND, BRONX_BOUND, BROOKLYN_BOUND, STATENISLAND_BOUND
- **LOCAL**: UPTOWN, DOWNTOWN
- **PLACE_BOUND**: Any other location-based direction
- **BOTH_DIRECTIONS**: Both/either direction phrases
- **UNSPECIFIED**: No direction detected


In [17]:
import re
import csv
import json
import pandas as pd
from collections import defaultdict


## 1. Direction Classifier

Classifies detected direction targets into compass, borough, local (uptown/downtown), or place-bound labels.


In [18]:
class DirectionClassifier:
    """Maps an extracted direction target (e.g. "north", "Queens") to a label.

    Lookup precedence is compass > borough > local; any target that is not a
    known term is labelled 'PLACE_BOUND'.
    """

    # Accepted (lower-case) spellings for each category.
    COMPASS_TERMS = {'north', 'south', 'east', 'west'}
    BOROUGH_TERMS = {'manhattan', 'queens', 'bronx', 'brooklyn', 'staten island'}
    LOCAL_TERMS = {'uptown', 'downtown'}

    # Normalized term -> output label, one table per category.
    COMPASS_LABELS = {
        'north': 'NORTHBOUND',
        'south': 'SOUTHBOUND',
        'east': 'EASTBOUND',
        'west': 'WESTBOUND'
    }

    BOROUGH_LABELS = {
        'manhattan': 'MANHATTAN_BOUND',
        'queens': 'QUEENS_BOUND',
        'bronx': 'BRONX_BOUND',
        'brooklyn': 'BROOKLYN_BOUND',
        'staten island': 'STATENISLAND_BOUND'
    }

    LOCAL_LABELS = {
        'uptown': 'UPTOWN',
        'downtown': 'DOWNTOWN'
    }

    @classmethod
    def classify(cls, target_text):
        """
        Classify a direction target into COMPASS, BOROUGH, LOCAL, or PLACE_BOUND.

        The target is lower-cased, stripped, and runs of internal whitespace
        are collapsed to a single space before lookup, so "Staten   Island"
        and "staten\\nisland" both resolve to the borough label.

        Args:
            target_text: The extracted target string

        Returns:
            str: The normalized direction label
        """
        # split()/join removes leading/trailing whitespace AND collapses
        # internal runs, which a plain strip() does not.
        target_normalized = ' '.join(target_text.split()).lower()

        # Tables are listed in precedence order; the first hit wins.
        precedence = (
            (cls.COMPASS_TERMS, cls.COMPASS_LABELS),
            (cls.BOROUGH_TERMS, cls.BOROUGH_LABELS),
            (cls.LOCAL_TERMS, cls.LOCAL_LABELS),
        )
        for terms, labels in precedence:
            if target_normalized in terms:
                return labels[target_normalized]

        # Anything that is not a known term is treated as a place name.
        return 'PLACE_BOUND'


## 2. Direction Detector

Implements pattern matching rules for detecting direction mentions.


In [19]:
class DirectionDetector:
    """Detects direction mentions in MTA alert headers.

    Two rule families are applied:

    * R1 (``detect_bound_directions``): "X-bound" / "X bound" / "Xbound"
      phrases for boroughs and compass points, the bare words
      Uptown / Downtown, and generic "<word>-bound" place phrases.
    * R2 (``detect_both_either``): "both direction(s)" / "either direction(s)".

    Every detection is a dict with keys 'text', 'start', 'end' and 'label';
    'start'/'end' are offsets into the analysed text (end exclusive).
    """

    # Rejection patterns - be very conservative to avoid false rejections
    # Only reject clearly invalid patterns
    # NOTE(review): this list is only consulted by detect_bound_directions,
    # on a window of 10 chars before / 5 chars after a bound match. The single
    # pattern below needs at least 20 characters ("in both direction of"), so
    # it does not appear able to fire there, and detect_both_either never
    # consults it. Confirm whether R2 matches were meant to be filtered.
    REJECTION_PATTERNS = [
        r'\bin\s+both\s+directions?\s+of\b',  # "in both direction of X" (invalid)
        # Note: "Westbound to Manhattan" is VALID - the direction IS "Westbound"
        # Note: "due to", "because of" appearing AFTER direction are OK
    ]

    @staticmethod
    def _compile_bound(stem):
        """Compile the shared "<stem>bound" regex for a borough/compass stem.

        The prefix allows a match at the start of a line or after any
        non-letter; the suffix is meant to allow a route code glued to the
        word (e.g. "westboundQ42").

        NOTE(review): under re.IGNORECASE the class [A-Z0-9] also matches
        lower-case letters, so the trailing guard accepts any letter or
        digit (e.g. "Queens boundary" matches as "Queens bound").
        Left unchanged to preserve existing labels.
        """
        return re.compile(
            r'(?:^|(?<=[^a-zA-Z]))' + stem + r'[-\s]?bound(?:\b|(?=[A-Z0-9]))',
            re.IGNORECASE | re.MULTILINE,
        )

    def __init__(self):
        """Initialize the detector with compiled patterns."""
        self.classifier = DirectionClassifier()

        # Borough patterns (highest priority). `stem` is a regex fragment.
        self.borough_patterns = [
            (self._compile_bound(stem), label)
            for stem, label in (
                ('Manhattan', 'MANHATTAN_BOUND'),
                ('Queens', 'QUEENS_BOUND'),
                ('Bronx', 'BRONX_BOUND'),
                ('Brooklyn', 'BROOKLYN_BOUND'),
                (r'Staten\s+Island', 'STATENISLAND_BOUND'),
            )
        ]

        # Local directions: the bare words Uptown / Downtown.
        self.local_patterns = [
            (re.compile(r'\bUptown\b', re.IGNORECASE), 'UPTOWN'),
            (re.compile(r'\bDowntown\b', re.IGNORECASE), 'DOWNTOWN'),
        ]

        # Compass directions, same scaffolding as the borough patterns.
        self.compass_patterns = [
            (self._compile_bound(stem), label)
            for stem, label in (
                ('North', 'NORTHBOUND'),
                ('South', 'SOUTHBOUND'),
                ('East', 'EASTBOUND'),
                ('West', 'WESTBOUND'),
            )
        ]

        # Place-bound pattern: a single alphabetic word, then a hyphen or one
        # whitespace character, then "bound" (e.g. "Av-bound", "Park bound").
        self.place_bound_pattern = re.compile(
            r'\b([A-Za-z]+)[-\s]bound\b',
            re.IGNORECASE
        )

        # R2: Both/either direction(s)
        self.both_either_pattern = re.compile(
            r'\b(?:both|either)\s+directions?\b',
            re.IGNORECASE
        )

    def _check_rejection_context(self, text, match_start, match_end):
        """
        Check if a match appears in a rejected context.

        The context window runs from 10 characters before the match to
        5 characters after it (clamped at the start of the text), so words
        further along the sentence such as "... due to ..." are not inspected.

        Args:
            text: Full header text
            match_start: Start position of match
            match_end: End position of match

        Returns:
            bool: True if should reject, False otherwise
        """
        window = text[max(0, match_start - 10):match_end + 5]
        return any(
            re.search(pattern, window, re.IGNORECASE)
            for pattern in self.REJECTION_PATTERNS
        )

    def _has_overlap(self, start, end, detected_ranges):
        """Check if range [start, end) overlaps with any detected range."""
        # Half-open intervals overlap iff each starts before the other ends.
        return any(start < de and end > ds for ds, de in detected_ranges)

    def _collect_matches(self, pattern, label, text, detections, detected_ranges):
        """Append every acceptable match of `pattern` in `text` to `detections`.

        A match is skipped when it overlaps a range already recorded in
        `detected_ranges` or when its context is rejected. Both lists are
        mutated in place so that later, lower-priority patterns see the
        ranges claimed by earlier ones.
        """
        for match in pattern.finditer(text):
            start_pos, end_pos = match.start(), match.end()

            if self._has_overlap(start_pos, end_pos, detected_ranges):
                continue
            if self._check_rejection_context(text, start_pos, end_pos):
                continue

            detected_ranges.append((start_pos, end_pos))
            detections.append({
                'text': match.group(0),
                'start': start_pos,
                'end': end_pos,
                'label': label
            })

    def detect_bound_directions(self, text):
        """
        Detect X-bound / X bound patterns.

        Uses priority order: Borough > Local > Compass > Place-bound.
        Detections are returned in that priority order (not sorted by
        position in the text).

        Args:
            text: Header text to analyze

        Returns:
            list: List of detection dicts
        """
        detections = []
        detected_ranges = []  # spans already claimed by a higher priority

        prioritized = (
            list(self.borough_patterns)
            + list(self.local_patterns)
            + list(self.compass_patterns)
            + [(self.place_bound_pattern, 'PLACE_BOUND')]
        )
        for pattern, label in prioritized:
            self._collect_matches(pattern, label, text, detections, detected_ranges)

        return detections

    def detect_both_either(self, text):
        """
        Detect "both/either direction(s)" patterns (R2).

        Args:
            text: Header text to analyze

        Returns:
            list: List of detection dicts, all labelled 'BOTH_DIRECTIONS'
        """
        return [
            {
                'text': match.group(0),
                'start': match.start(),
                'end': match.end(),
                'label': 'BOTH_DIRECTIONS'
            }
            for match in self.both_either_pattern.finditer(text)
        ]

    def detect_all(self, text):
        """
        Run all detection rules on text.

        Args:
            text: Header text to analyze

        Returns:
            dict: {'detections': [...]} with R1 (bound) detections first,
                followed by R2 (both/either) detections
        """
        all_detections = []
        all_detections.extend(self.detect_bound_directions(text))  # R1
        all_detections.extend(self.detect_both_either(text))       # R2
        return {'detections': all_detections}


## 3. Process Dataset

Load the input data and apply direction detection.


# Input/output locations for the labeling run.
# NOTE(review): the following cell assigns INPUT_FILE, OUTPUT_FILE and
# `detector` again with the same values; one of the two cells looks redundant.
INPUT_FILE, OUTPUT_FILE = (
    'Preprocessed/MTA_Data_preprocessed.csv',
    'Preprocessed/MTA_Data_silver_directions.csv',
)

# Rule-based detector shared by the processing cells below
detector = DirectionDetector()


In [20]:
# Input/output locations for the labeling run
INPUT_FILE, OUTPUT_FILE = (
    'Preprocessed/MTA_Data_preprocessed.csv',
    'Preprocessed/MTA_Data_silver_directions.csv',
)

# Read the preprocessed alerts into memory and report how many rows arrived
print(f"Reading {INPUT_FILE}...")
df_input = pd.read_csv(INPUT_FILE)
print(f"Loaded {len(df_input):,} records")

# Rule-based detector used by the processing cell
detector = DirectionDetector()


Reading Preprocessed/MTA_Data_preprocessed.csv...
Loaded 230,012 records


In [21]:
# Process all headers
print("Processing headers...")


def label_header(header_text, direction_detector):
    """Return (spans_json, labels_json) for a single header value.

    Missing headers (NaN/None) and headers with no detection both yield an
    empty span list and the single label 'UNSPECIFIED'. Otherwise spans and
    labels are parallel lists in the order returned by `detect_all`.
    """
    if pd.isna(header_text):
        return json.dumps([]), json.dumps(['UNSPECIFIED'])

    # Non-string values are stringified before matching.
    detections = direction_detector.detect_all(str(header_text))['detections']
    if not detections:
        return json.dumps([]), json.dumps(['UNSPECIFIED'])

    spans = [
        {'text': d['text'], 'start': d['start'], 'end': d['end']}
        for d in detections
    ]
    labels = [d['label'] for d in detections]
    return json.dumps(spans), json.dumps(labels)


# One JSON string per record, in the same order as df_input
direction_spans_list = []
direction_labels_list = []

# Iterate the Header column directly: the progress counter then follows the
# row position rather than the index label (iterrows() yields index labels,
# which only equal positions for a default RangeIndex), and no per-row Series
# has to be built.
for position, header_text in enumerate(df_input['Header'], start=1):
    if position % 10000 == 0:
        print(f"  Processed {position:,} / {len(df_input):,} records...")

    spans_json, labels_json = label_header(header_text, detector)
    direction_spans_list.append(spans_json)
    direction_labels_list.append(labels_json)

# Add new columns to the dataframe
df_input['direction_spans'] = direction_spans_list
df_input['direction_labels'] = direction_labels_list

print(f"Completed processing {len(df_input):,} records")


Processing headers...
  Processed 10,000 / 230,012 records...
  Processed 20,000 / 230,012 records...
  Processed 30,000 / 230,012 records...
  Processed 40,000 / 230,012 records...
  Processed 50,000 / 230,012 records...
  Processed 60,000 / 230,012 records...
  Processed 70,000 / 230,012 records...
  Processed 80,000 / 230,012 records...
  Processed 90,000 / 230,012 records...
  Processed 100,000 / 230,012 records...
  Processed 110,000 / 230,012 records...
  Processed 120,000 / 230,012 records...
  Processed 130,000 / 230,012 records...
  Processed 140,000 / 230,012 records...
  Processed 150,000 / 230,012 records...
  Processed 160,000 / 230,012 records...
  Processed 170,000 / 230,012 records...
  Processed 180,000 / 230,012 records...
  Processed 190,000 / 230,012 records...
  Processed 200,000 / 230,012 records...
  Processed 210,000 / 230,012 records...
  Processed 220,000 / 230,012 records...
  Processed 230,000 / 230,012 records...
Completed processing 230,012 records


## 4. Generate Statistics

Analyze the detection results and show samples.


In [22]:
# Generate statistics
stats = {
    'total_records': len(df_input),
    'label_counts': defaultdict(int)
}

# Up to 10 example records are kept per label; only the first 5 are printed.
samples_by_label = defaultdict(list)

# Walk the needed columns in lockstep instead of building a Series per row.
record_columns = zip(
    df_input['Alert ID'],
    df_input['Header'],
    df_input['direction_labels'],
    df_input['direction_spans'],
)
for record_id, header, labels_json, spans_json in record_columns:
    labels = json.loads(labels_json)
    spans = json.loads(spans_json)

    for label in labels:
        stats['label_counts'][label] += 1

        # Collect samples (first 10 per label)
        if len(samples_by_label[label]) < 10:
            samples_by_label[label].append({
                'record_id': record_id,
                'header': header,
                'spans': spans
            })

# Print statistics
print("\n" + "="*70)
print("DIRECTION LABELING STATISTICS")
print("="*70)

print(f"\nTotal records processed: {stats['total_records']:,}")

print("\n--- Label Distribution ---")

# Group by type
compass_labels = ['NORTHBOUND', 'SOUTHBOUND', 'EASTBOUND', 'WESTBOUND']
borough_labels = ['MANHATTAN_BOUND', 'QUEENS_BOUND', 'BRONX_BOUND', 'BROOKLYN_BOUND', 'STATENISLAND_BOUND']
local_labels = ['UPTOWN', 'DOWNTOWN']

compass_count = sum(stats['label_counts'][l] for l in compass_labels)
borough_count = sum(stats['label_counts'][l] for l in borough_labels)
local_count = sum(stats['label_counts'][l] for l in local_labels)
place_count = stats['label_counts']['PLACE_BOUND']
both_count = stats['label_counts']['BOTH_DIRECTIONS']
unspecified_count = stats['label_counts']['UNSPECIFIED']

# (display name, member labels, group total) - drives both the printed
# report and the CSV so the two cannot drift apart.
label_groups = [
    ('COMPASS', compass_labels, compass_count),
    ('BOROUGH', borough_labels, borough_count),
    ('LOCAL', local_labels, local_count),
]

for group_name, group_labels, group_count in label_groups:
    print(f"\n{group_name} directions: {group_count:,}")
    for label in group_labels:
        count = stats['label_counts'][label]
        if count > 0:  # labels that never occurred are omitted
            print(f"  {label}: {count:,}")

print(f"\nPLACE_BOUND directions: {place_count:,}")
print(f"BOTH_DIRECTIONS: {both_count:,}")
print(f"UNSPECIFIED: {unspecified_count:,}")

print("\n" + "="*70)
print("SAMPLE DETECTIONS (First 5 per label)")
print("="*70)

for label in sorted(samples_by_label.keys()):
    samples = samples_by_label[label][:5]
    print(f"\n{label}:")
    for i, sample in enumerate(samples, 1):
        # Headers can be NaN (such rows are labelled UNSPECIFIED upstream);
        # len()/slicing on a float would raise TypeError, so normalise first.
        header_text = '<missing header>' if pd.isna(sample['header']) else str(sample['header'])
        header_preview = header_text[:100] + '...' if len(header_text) > 100 else header_text
        print(f"  {i}. [{sample['record_id']}] {header_preview}")
        if sample['spans']:
            span_texts = [s['text'] for s in sample['spans']]
            print(f"     Spans: {span_texts}")

# Save statistics to CSV file (blank rows separate the groups)
stats_output = [{"Category": "Total Records", "Count": stats["total_records"]}]
for group_name, group_labels, group_count in label_groups:
    stats_output.append({"Category": "", "Count": ""})
    stats_output.append({"Category": f"{group_name} Directions", "Count": group_count})
    for label in group_labels:
        count = stats["label_counts"][label]
        if count > 0:
            stats_output.append({"Category": f"  {label}", "Count": count})
stats_output.append({"Category": "", "Count": ""})
stats_output.append({"Category": "PLACE_BOUND Directions", "Count": place_count})
stats_output.append({"Category": "BOTH_DIRECTIONS", "Count": both_count})
stats_output.append({"Category": "UNSPECIFIED", "Count": unspecified_count})

# Save to CSV (to_csv does not create the EDA/ directory; it must exist)
STATS_OUTPUT_FILE = "EDA/direction_labeling_statistics.csv"
df_stats = pd.DataFrame(stats_output)
df_stats.to_csv(STATS_OUTPUT_FILE, index=False)
print(f"\n✓ Statistics saved to {STATS_OUTPUT_FILE}")



DIRECTION LABELING STATISTICS

Total records processed: 230,012

--- Label Distribution ---

COMPASS directions: 124,806
  NORTHBOUND: 58,911
  SOUTHBOUND: 59,521
  EASTBOUND: 3,010
  WESTBOUND: 3,364

BOROUGH directions: 6,075
  MANHATTAN_BOUND: 2,636
  QUEENS_BOUND: 1,286
  BRONX_BOUND: 785
  BROOKLYN_BOUND: 1,217
  STATENISLAND_BOUND: 151

LOCAL directions: 6,167
  UPTOWN: 3,071
  DOWNTOWN: 3,096

PLACE_BOUND directions: 25,531
BOTH_DIRECTIONS: 43,070
UNSPECIFIED: 48,331

SAMPLE DETECTIONS (First 5 per label)

BOTH_DIRECTIONS:
  1. [189489] L trains are running with delays in both directions after NYPD completed an investigation at 6 Av.
     Spans: ['both directions']
  2. [186126] 6 trains are delayed in both directions while we address a mechanical problem on a train at Brooklyn...
     Spans: ['both directions']
  3. [184474] M103 buses are no longer detoured in either direction after NYPD investigation at Bowery/Grand St co...
     Spans: ['either direction']
  4. [179577] A t

In [25]:
# Analyze single vs multiple directions
print("\n" + "="*70)
print("DIRECTION COUNT ANALYSIS")
print("="*70)

direction_count_stats = defaultdict(int)     # number of directions -> record count
direction_count_samples = defaultdict(list)  # number of directions -> up to 5 examples

total_records = len(df_input)


def _pct_of_records(record_count):
    """Share of all records as a percentage; 0.0 when the frame is empty."""
    # Guard the empty-frame case, which would otherwise divide by zero.
    return (record_count / total_records) * 100 if total_records else 0.0


# Walk the needed columns in lockstep instead of building a Series per row.
for record_id, header, labels_json in zip(
    df_input['Alert ID'], df_input['Header'], df_input['direction_labels']
):
    labels = json.loads(labels_json)

    # Count UNSPECIFIED as 0 directions
    num_directions = 0 if labels == ['UNSPECIFIED'] else len(labels)

    direction_count_stats[num_directions] += 1

    # Collect samples (first 5 per count)
    if len(direction_count_samples[num_directions]) < 5:
        direction_count_samples[num_directions].append({
            'record_id': record_id,
            'header': header,
            'num_directions': num_directions,
            'direction_labels': labels
        })

# Print statistics
print(f"\nTotal records: {total_records:,}")
print(f"\nDirection count breakdown:")

for count in sorted(direction_count_stats.keys()):
    num_records = direction_count_stats[count]
    percentage = _pct_of_records(num_records)
    direction_word = "direction" if count == 1 else "directions"
    print(f"  {count} {direction_word}: {num_records:,} records ({percentage:.2f}%)")

# Show samples
print("\n" + "="*70)
print("SAMPLE RECORDS BY DIRECTION COUNT")
print("="*70)

for count in sorted(direction_count_samples.keys()):
    samples = direction_count_samples[count]
    direction_word = "Direction" if count == 1 else "Directions"
    print(f"\n{count} {direction_word}:")
    for i, sample in enumerate(samples, 1):
        # Headers can be NaN (such rows are labelled UNSPECIFIED upstream and
        # land in the 0-direction bucket); len()/slicing on a float would
        # raise TypeError, so normalise first.
        header_text = '<missing header>' if pd.isna(sample['header']) else str(sample['header'])
        header_preview = header_text[:100] + '...' if len(header_text) > 100 else header_text
        print(f"  {i}. [{sample['record_id']}] {header_preview}")
        print(f"     Labels: {sample['direction_labels']}")

# Save to CSV: one row per distinct direction count...
direction_count_output = []
for count in sorted(direction_count_stats.keys()):
    num_records = direction_count_stats[count]
    direction_word = "direction" if count == 1 else "directions"
    direction_count_output.append({
        "Direction_Count": count,
        "Description": f"{count} {direction_word}",
        "Number_of_Records": num_records,
        "Percentage": round(_pct_of_records(num_records), 2)
    })

# ...followed by a blank row and three summary rows
zero_direction = direction_count_stats.get(0, 0)
single_direction = direction_count_stats.get(1, 0)
multiple_directions = sum(n for k, n in direction_count_stats.items() if k > 1)

direction_count_output.append({})
for description, record_count in (
    ("No direction (UNSPECIFIED)", zero_direction),
    ("Single direction", single_direction),
    ("Multiple directions", multiple_directions),
):
    direction_count_output.append({
        "Direction_Count": "Summary",
        "Description": description,
        "Number_of_Records": record_count,
        "Percentage": round(_pct_of_records(record_count), 2)
    })

# to_csv does not create the EDA/ directory; it must exist
DIRECTION_COUNT_FILE = "EDA/direction_count_analysis.csv"
df_direction_count = pd.DataFrame(direction_count_output)
df_direction_count.to_csv(DIRECTION_COUNT_FILE, index=False)
print(f"\n✓ Direction count analysis saved to {DIRECTION_COUNT_FILE}")



DIRECTION COUNT ANALYSIS

Total records: 230,012

Direction count breakdown:
  0 directions: 48,331 records (21.01%)
  1 direction: 162,882 records (70.81%)
  2 directions: 15,750 records (6.85%)
  3 directions: 1,996 records (0.87%)
  4 directions: 593 records (0.26%)
  5 directions: 232 records (0.10%)
  6 directions: 102 records (0.04%)
  7 directions: 47 records (0.02%)
  8 directions: 26 records (0.01%)
  9 directions: 8 records (0.00%)
  10 directions: 12 records (0.01%)
  11 directions: 10 records (0.00%)
  12 directions: 8 records (0.00%)
  13 directions: 10 records (0.00%)
  14 directions: 5 records (0.00%)

SAMPLE RECORDS BY DIRECTION COUNT

0 Directions:
  1. [180128] A C trains are delayed while we conduct emergency track repairs in Manhattan.
     Labels: ['UNSPECIFIED']
  2. [185055] All M trains will depart from Track 2.
     Labels: ['UNSPECIFIED']
  3. [178094] Q20A buses are no longer detoured after road work on 20th Ave between College Point Blvd and 124th S...
    

## 5. Save Output

Write the labeled data to CSV.


In [24]:
# Persist the labelled frame (index column omitted) and confirm the row count
print(f"\nWriting output to {OUTPUT_FILE}...")
df_input.to_csv(OUTPUT_FILE, index=False)
print(f"Successfully wrote {len(df_input):,} records to {OUTPUT_FILE}")

# Preview: the cell's last expression renders the first five rows
print("\nFirst 5 rows of output:")
df_input.head(5)



Writing output to Preprocessed/MTA_Data_silver_directions.csv...
Successfully wrote 230,012 records to Preprocessed/MTA_Data_silver_directions.csv

First 5 rows of output:


Unnamed: 0,Alert ID,Date,Agency,Status Label,Affected,Header,direction_spans,direction_labels
0,180128,11/05/2022 05:58:00 PM,NYCT Subway,delays,"[""A"", ""C""]",A C trains are delayed while we conduct emerge...,[],"[""UNSPECIFIED""]"
1,189489,12/20/2022 07:09:00 PM,NYCT Subway,delays,"[""L""]",L trains are running with delays in both direc...,"[{""text"": ""both directions"", ""start"": 36, ""end...","[""BOTH_DIRECTIONS""]"
2,189321,12/20/2022 12:31:00 AM,NYCT Subway,delays,"[""J""]",Jamaica-bound J trains are delayed while we re...,"[{""text"": ""Jamaica-bound"", ""start"": 0, ""end"": ...","[""PLACE_BOUND""]"
3,188948,12/18/2022 06:12:00 AM,NYCT Subway,delays,"[""Q""]",Southbound Q trains are running with delays af...,"[{""text"": ""Southbound"", ""start"": 0, ""end"": 10}]","[""SOUTHBOUND""]"
4,187749,12/12/2022 02:26:00 PM,NYCT Subway,delays,"[""B"", ""C""]",Southbound B C trains are running with delays ...,"[{""text"": ""Southbound"", ""start"": 0, ""end"": 10}]","[""SOUTHBOUND""]"
