# MTA Direction Auto-Labeler for NER Training

This notebook auto-labels directions in MTA bus and subway alerts using rule-based pattern matching.

## Label Types:
- **COMPASS**: NORTHBOUND, SOUTHBOUND, EASTBOUND, WESTBOUND
- **BOROUGH**: MANHATTAN_BOUND, QUEENS_BOUND, BRONX_BOUND, BROOKLYN_BOUND, STATENISLAND_BOUND
- **LOCAL**: UPTOWN, DOWNTOWN
- **PLACE_BOUND**: Any other location-based direction
- **BOTH_DIRECTIONS**: Both/either direction phrases
- **UNSPECIFIED**: No direction detected


In [9]:
import re
import csv
import json
import pandas as pd
from collections import defaultdict


## 1. Direction Classifier

Classifies detected direction targets into compass, borough, local (uptown/downtown), or place-bound labels.


In [10]:
class DirectionClassifier:
    """Maps an extracted direction target onto a normalized direction label.

    Lookup order is compass, then borough, then local (uptown/downtown);
    a target found in none of those tables is labelled PLACE_BOUND.
    """

    COMPASS_TERMS = {'north', 'south', 'east', 'west'}
    BOROUGH_TERMS = {'manhattan', 'queens', 'bronx', 'brooklyn', 'staten island'}
    LOCAL_TERMS = {'uptown', 'downtown'}

    COMPASS_LABELS = {
        'north': 'NORTHBOUND',
        'south': 'SOUTHBOUND',
        'east': 'EASTBOUND',
        'west': 'WESTBOUND'
    }

    BOROUGH_LABELS = {
        'manhattan': 'MANHATTAN_BOUND',
        'queens': 'QUEENS_BOUND',
        'bronx': 'BRONX_BOUND',
        'brooklyn': 'BROOKLYN_BOUND',
        'staten island': 'STATENISLAND_BOUND'
    }

    LOCAL_LABELS = {
        'uptown': 'UPTOWN',
        'downtown': 'DOWNTOWN'
    }

    @classmethod
    def classify(cls, target_text):
        """
        Resolve a direction target to its normalized label.

        Args:
            target_text: The extracted target string; surrounding whitespace
                and letter case are ignored.

        Returns:
            str: A compass, borough, or local label when the target is a
                known term, otherwise 'PLACE_BOUND'.
        """
        key = target_text.strip().lower()

        # Tables are consulted in precedence order; the first hit wins.
        lookup_order = (
            (cls.COMPASS_TERMS, cls.COMPASS_LABELS),
            (cls.BOROUGH_TERMS, cls.BOROUGH_LABELS),
            (cls.LOCAL_TERMS, cls.LOCAL_LABELS),
        )
        for known_terms, label_for in lookup_order:
            if key in known_terms:
                return label_for[key]

        # Anything unrecognized is treated as a place name.
        return 'PLACE_BOUND'


## 2. Direction Detector

Implements pattern matching rules for detecting direction mentions.


In [11]:
class DirectionDetector:
    """Detects direction mentions in MTA alert headers.

    The class-level tables below configure the detector: regexes that veto a
    candidate match, words that end a leftward place-name scan, and street-type
    abbreviations that are allowed to remain inside a place name.
    """
    
    # Rejection patterns - be very conservative to avoid false rejections
    # Only reject clearly invalid patterns
    # Each entry is a regex source string; _check_rejection_context searches it
    # case-insensitively in a small window around a candidate match.
    REJECTION_PATTERNS = [
        r'\bin\s+both\s+directions?\s+of\b',  # "in both direction of X" (invalid)
        # Note: "Westbound to Manhattan" is VALID - the direction IS "Westbound"
        # Note: "due to", "because of" appearing AFTER direction are OK
    ]
    
    # Stop words that signal a boundary (words that are not part of place names).
    # These are common grammatical words that indicate the start of a new phrase.
    # Entries are stored lowercase; _is_stop_word lowercases the candidate before lookup.
    STOP_WORDS = {
        # Articles
        'a', 'an', 'the',
        # Conjunctions
        'and', 'or', 'but', 'nor',
        # Prepositions
        'to', 'from', 'via', 'at', 'in', 'on', 'for', 'of', 'with', 'by', 'as', 'into',
        # Verbs commonly appearing before directions
        'are', 'is', 'was', 'were', 'will', 'may', 'can', 'could', 'would', 'should',
        'be', 'been', 'being', 'have', 'has', 'had',
        'running', 'operating', 'making', 'skipping', 'stopping', 'ending', 'starting',
        'experience', 'expect', 'wait', 'stop', 'last', 'first', 'next',
        # Quantifiers/determiners
        'some', 'all', 'no', 'any', 'each', 'every', 'this', 'that', 'these', 'those', 'most',
        # Transport-related words
        'trains', 'train', 'buses', 'bus', 'service', 'services', 'shuttle', 'shuttles',
        # Other common words
        'longer', 'shorter', 'local', 'express', 'limited', 'delayed', 'suspended',
        'you', 'your', 'we', 'our', 'they', 'their'
    }
    
    # Known abbreviations that are part of place names (should NOT be treated as boundaries)
    # These can appear with or without periods; _is_abbreviation compares the
    # lowercased word both as written and with trailing periods stripped.
    KNOWN_ABBREVIATIONS = {
        'st', 'st.', 'ave', 'av', 'av.', 'ave.', 'sq', 'sq.', 'blvd', 'blvd.',
        'pkwy', 'pkwy.', 'rd', 'rd.', 'pl', 'pl.', 'ct', 'ct.', 'dr', 'dr.',
        'hwy', 'hwy.', 'jct', 'jct.', 'ctr', 'ctr.', 'pk', 'pk.',
        'sts', 'avs'  # plurals
    }
    
    def __init__(self):
        """Initialize the detector with compiled patterns."""
        self.classifier = DirectionClassifier()
        
        # Specific patterns for boroughs (highest priority)
        # Allow matching after non-letters (spaces, digits, punctuation, newlines, etc.)
        # Use MULTILINE flag so ^ matches at start of lines (after \n)
        # Use (?:\b|(?=[A-Z0-9])) to match cases like "queensboundQ42"
        self.borough_patterns = [
            (re.compile(r'(?:^|(?<=[^a-zA-Z]))Manhattan[-\s]?bound(?:\b|(?=[A-Z0-9]))', re.IGNORECASE | re.MULTILINE), 'MANHATTAN_BOUND'),
            (re.compile(r'(?:^|(?<=[^a-zA-Z]))Queens[-\s]?bound(?:\b|(?=[A-Z0-9]))', re.IGNORECASE | re.MULTILINE), 'QUEENS_BOUND'),
            (re.compile(r'(?:^|(?<=[^a-zA-Z]))Bronx[-\s]?bound(?:\b|(?=[A-Z0-9]))', re.IGNORECASE | re.MULTILINE), 'BRONX_BOUND'),
            (re.compile(r'(?:^|(?<=[^a-zA-Z]))Brooklyn[-\s]?bound(?:\b|(?=[A-Z0-9]))', re.IGNORECASE | re.MULTILINE), 'BROOKLYN_BOUND'),
            (re.compile(r'(?:^|(?<=[^a-zA-Z]))Staten\s+Island[-\s]?bound(?:\b|(?=[A-Z0-9]))', re.IGNORECASE | re.MULTILINE), 'STATENISLAND_BOUND'),
        ]
        
        # Specific patterns for local directions (Uptown/Downtown)
        self.local_patterns = [
            (re.compile(r'\bUptown\b', re.IGNORECASE), 'UPTOWN'),
            (re.compile(r'\bDowntown\b', re.IGNORECASE), 'DOWNTOWN'),
        ]
        
        # Specific patterns for compass directions
        # Use MULTILINE flag so ^ matches after newlines (e.g., "\nWestbound")
        # Use (?:\b|(?=[A-Z0-9])) instead of \b to match "westboundQ42" (no space before route code)
        self.compass_patterns = [
            (re.compile(r'(?:^|(?<=[^a-zA-Z]))North[-\s]?bound(?:\b|(?=[A-Z0-9]))', re.IGNORECASE | re.MULTILINE), 'NORTHBOUND'),
            (re.compile(r'(?:^|(?<=[^a-zA-Z]))South[-\s]?bound(?:\b|(?=[A-Z0-9]))', re.IGNORECASE | re.MULTILINE), 'SOUTHBOUND'),
            (re.compile(r'(?:^|(?<=[^a-zA-Z]))East[-\s]?bound(?:\b|(?=[A-Z0-9]))', re.IGNORECASE | re.MULTILINE), 'EASTBOUND'),
            (re.compile(r'(?:^|(?<=[^a-zA-Z]))West[-\s]?bound(?:\b|(?=[A-Z0-9]))', re.IGNORECASE | re.MULTILINE), 'WESTBOUND'),
        ]
        
        # Place-bound pattern: find "-bound" or " bound" and we'll expand leftward
        # This pattern finds the "bound" keyword with its immediate predecessor
        self.place_bound_anchor_pattern = re.compile(
            r'[-\s]bound\b',
            re.IGNORECASE
        )
        
        # R2: Both/either direction(s)
        self.both_either_pattern = re.compile(
            r'\b(?:both|either)\s+directions?\b',
            re.IGNORECASE
        )
    
    def _is_abbreviation(self, word):
        """Check if a word is a known abbreviation."""
        word_clean = word.lower().rstrip('.')
        return word_clean in self.KNOWN_ABBREVIATIONS or word.lower() in self.KNOWN_ABBREVIATIONS
    
    def _is_stop_word(self, word):
        """Check if a word is a stop word (boundary indicator)."""
        return word.lower() in self.STOP_WORDS
    
    def _tokenize_leftward(self, text, end_pos):
        """
        Tokenize text going leftward from end_pos.
        Returns list of tokens in left-to-right order with their positions.
        
        Each token is a tuple: (start_pos, end_pos, token_text, is_connected)
        where is_connected indicates if it's dash/slash connected to the next token
        (i.e. a '-' or '/' was consumed between this token and the one to its right).

        A token is a run of alphanumeric characters; a period is absorbed into the
        token only when the letters directly before it form a known abbreviation.

        The scan stops at the start of the text, at a newline, at a comma, at a
        period that does not follow a known abbreviation, or at any other
        character that is not alphanumeric, a space, '-', '/' or '.'.

        Args:
            text: Full text being scanned
            end_pos: Exclusive index at which the leftward scan begins

        Returns:
            list: Token tuples ordered left to right (empty if nothing was found)
        """
        tokens = []
        pos = end_pos
        prev_connected = False  # Set when a '-' or '/' was just consumed; applies to the next token found
        
        while pos > 0:
            # Skip spaces (only the ' ' character; other whitespace is not skipped)
            while pos > 0 and text[pos - 1] == ' ':
                pos -= 1
            
            if pos == 0:
                break
            
            # Hard boundaries: a newline or a comma ends the scan
            if text[pos - 1] == '\n':
                break
            
            if text[pos - 1] == ',':
                break
            
            # Check for dash or slash (connector): consume it and remember that the
            # token to its left is joined to the token already collected on its right
            if text[pos - 1] in '-/':
                prev_connected = True
                pos -= 1
                continue
            
            # Extract the token (word, number, or abbreviation with period)
            token_end = pos
            token_start = pos
            
            # Include alphanumeric characters and periods (for abbreviations)
            while token_start > 0 and (text[token_start - 1].isalnum() or text[token_start - 1] == '.'):
                # Check if period is part of abbreviation
                if text[token_start - 1] == '.':
                    # Look back over the letters immediately left of the period
                    temp_start = token_start - 1
                    while temp_start > 0 and text[temp_start - 1].isalpha():
                        temp_start -= 1
                    potential_abbrev = text[temp_start:token_start - 1]
                    if potential_abbrev and self._is_abbreviation(potential_abbrev):
                        # Absorb the abbreviation and its period into the current token
                        token_start = temp_start
                        break
                    else:
                        # Period is a sentence boundary; the token ends just right of it
                        break
                token_start -= 1
            
            if token_start < token_end:
                token_text = text[token_start:token_end]
                tokens.append((token_start, token_end, token_text, prev_connected))
                pos = token_start
                prev_connected = False
            else:
                # Nothing could be consumed at this position (boundary character), so stop
                break
        
        # Tokens were collected right-to-left; reverse to get left-to-right order
        tokens.reverse()
        return tokens
    
    def _find_place_bound_start(self, text, bound_start):
        """
        Find the start position of a place name by scanning leftward from "bound".
        
        Strategy:
        1. Tokenize leftward from bound_start
        2. Find the rightmost stop word - everything after it is the place name
        3. Handle dash-connected sequences specially (they stay together)
        4. Numbers connected by dash to next token are included
        
        Args:
            text: Full header text
            bound_start: Position where "-bound" or " bound" starts
            
        Returns:
            int: Start position of the place name
        """
        # Tokenize leftward
        tokens = self._tokenize_leftward(text, bound_start)
        
        if not tokens:
            return bound_start
        
        # Find the boundary - scan from left to right
        # The place name starts after the last stop word (that's not connected by dash)
        place_start_idx = 0  # Start from the first token by default
        
        for i, (start, end, token_text, is_connected) in enumerate(tokens):
            # Extract the base word (without dash/slash parts)
            base_word = token_text.split('-')[0].split('/')[0]
            
            # If this is a stop word and NOT dash-connected to the next token
            if self._is_stop_word(base_word) and not is_connected:
                # The place name starts from the next token
                place_start_idx = i + 1
            
            # If this is a standalone number (not connected by dash to next)
            # and not part of a place name pattern like "34 St"
            if base_word.isdigit() and not is_connected:
                # Check if next token is an abbreviation (like "St", "Av")
                if i + 1 < len(tokens):
                    next_token = tokens[i + 1][2].split('-')[0].split('/')[0]
                    if not self._is_abbreviation(next_token):
                        # Standalone number, place name starts after
                        place_start_idx = i + 1
                else:
                    # Number at the end, probably a boundary
                    place_start_idx = i + 1
        
        # Get the start position
        if place_start_idx < len(tokens):
            return tokens[place_start_idx][0]
        else:
            return bound_start
    
    def _check_rejection_context(self, text, match_start, match_end):
        """
        Check if match appears in a rejected context.
        
        Only checks BEFORE the match, not after, to avoid false rejections.
        For example, "Westbound buses are detoured due to..." should NOT be rejected
        just because "due to" appears after the direction mention.
        
        Args:
            text: Full header text
            match_start: Start position of match
            match_end: End position of match
            
        Returns:
            bool: True if should reject, False otherwise
        """
        # Only check context BEFORE the match (within the match itself)
        # This catches cases like "bound from X" or "bound to Y" 
        # but not "Westbound buses... due to X" which is valid
        context_start = max(0, match_start - 10)
        context = text[context_start:match_end + 5]  # Small window after for immediate context
        
        for pattern in self.REJECTION_PATTERNS:
            if re.search(pattern, context, re.IGNORECASE):
                return True
        
        return False
    

    def _has_overlap(self, start, end, detected_ranges):
        """Check if range [start, end) overlaps with any detected range."""
        for ds, de in detected_ranges:
            # Check for any overlap
            if not (end <= ds or start >= de):
                return True
        return False
    
    def detect_place_bound_directions(self, text, detected_ranges):
        """
        Detect place-bound directions with improved boundary detection.
        
        Args:
            text: Header text to analyze
            detected_ranges: Already detected ranges to avoid overlap (will be modified)
            
        Returns:
            list: List of detection dicts
        """
        detections = []
        
        # Find all "-bound" or " bound" occurrences
        for match in self.place_bound_anchor_pattern.finditer(text):
            bound_start = match.start()  # Position of "-" or " " before "bound"
            bound_end = match.end()      # Position after "bound"
            
            # Skip if this "bound" overlaps with already detected ranges
            if self._has_overlap(bound_start, bound_end, detected_ranges):
                continue
            
            # Find the start of the place name by scanning leftward
            place_start = self._find_place_bound_start(text, bound_start)
            
            # Skip if place_start equals bound_start (no place name found)
            if place_start >= bound_start:
                continue
            
            # Check if the full detected range overlaps with already detected
            if self._has_overlap(place_start, bound_end, detected_ranges):
                continue
            
            # Check rejection patterns
            if self._check_rejection_context(text, place_start, bound_end):
                continue
            
            # Extract the full matched text
            full_match = text[place_start:bound_end]
            
            # Add to detected ranges so subsequent detections don't overlap
            detected_ranges.append((place_start, bound_end))
            
            detections.append({
                'text': full_match,
                'start': place_start,
                'end': bound_end,
                'label': 'PLACE_BOUND'
            })
        
        return detections
    
    def detect_bound_directions(self, text):
        """
        Detect X-bound / X bound patterns.
        
        Uses priority order: Borough > Local > Compass > Place-bound
        
        Args:
            text: Header text to analyze
            
        Returns:
            list: List of detection dicts
        """
        detections = []
        detected_ranges = []  # Track detected ranges to avoid overlaps
        
        # Priority 1: Check for borough directions
        for pattern, label in self.borough_patterns:
            for match in pattern.finditer(text):
                start_pos = match.start()
                end_pos = match.end()
                
                # Skip if overlaps with already detected
                if self._has_overlap(start_pos, end_pos, detected_ranges):
                    continue
                
                # Check rejection patterns
                if self._check_rejection_context(text, start_pos, end_pos):
                    continue
                
                detected_ranges.append((start_pos, end_pos))
                detections.append({
                    'text': match.group(0),
                    'start': start_pos,
                    'end': end_pos,
                    'label': label
                })
        
        # Priority 2: Check for local directions
        for pattern, label in self.local_patterns:
            for match in pattern.finditer(text):
                start_pos = match.start()
                end_pos = match.end()
                
                # Skip if overlaps with already detected
                if self._has_overlap(start_pos, end_pos, detected_ranges):
                    continue
                
                # Check rejection patterns
                if self._check_rejection_context(text, start_pos, end_pos):
                    continue
                
                detected_ranges.append((start_pos, end_pos))
                detections.append({
                    'text': match.group(0),
                    'start': start_pos,
                    'end': end_pos,
                    'label': label
                })
        
        # Priority 3: Check for compass directions
        for pattern, label in self.compass_patterns:
            for match in pattern.finditer(text):
                start_pos = match.start()
                end_pos = match.end()
                
                # Skip if overlaps with already detected
                if self._has_overlap(start_pos, end_pos, detected_ranges):
                    continue
                
                # Check rejection patterns
                if self._check_rejection_context(text, start_pos, end_pos):
                    continue
                
                detected_ranges.append((start_pos, end_pos))
                detections.append({
                    'text': match.group(0),
                    'start': start_pos,
                    'end': end_pos,
                    'label': label
                })
        
        # Priority 4: Check for place-bound directions with improved boundary detection
        place_bound_detections = self.detect_place_bound_directions(text, detected_ranges)
        detections.extend(place_bound_detections)
        
        return detections
    
    def detect_both_either(self, text):
        """
        Detect "both/either direction(s)" patterns (R2).
        
        Args:
            text: Header text to analyze
            
        Returns:
            list: List of detection dicts
        """
        detections = []
        
        for match in self.both_either_pattern.finditer(text):
            matched_text = match.group(0)
            
            detections.append({
                'text': matched_text,
                'start': match.start(),
                'end': match.end(),
                'label': 'BOTH_DIRECTIONS'
            })
        
        return detections
    
    def detect_all(self, text):
        """
        Run all detection rules on text.
        
        Args:
            text: Header text to analyze
            
        Returns:
            dict: Detection results
        """
        all_detections = []
        
        # R1: Bound patterns
        bound_detections = self.detect_bound_directions(text)
        all_detections.extend(bound_detections)
        
        # R2: Both/either
        both_either_detections = self.detect_both_either(text)
        all_detections.extend(both_either_detections)
        
        return {'detections': all_detections}

## 3. Process Dataset

Load the input data and apply direction detection.


# Configuration
# NOTE(review): this cell shows no In [n] prompt (it appears not to have been
# executed) and its settings are repeated in the next cell, which assigns a
# different INPUT_FILE ('Preprocessed/MTA_Data_preprocessed_routespans.csv') and
# is the one whose output shows the file actually loaded. Confirm whether this
# cell is stale and can be removed.
INPUT_FILE = 'Preprocessed/MTA_Data_preprocessed.csv'
OUTPUT_FILE = 'Preprocessed/MTA_Data_silver_directions.csv'

# Initialize detector
detector = DirectionDetector()


In [12]:
# Configuration
INPUT_FILE = 'Preprocessed/MTA_Data_preprocessed_routespans.csv'
OUTPUT_FILE = 'Preprocessed/MTA_Data_silver_directions.csv'

# Columns the cells below read: 'header' is the text that gets labeled and
# 'alert_id' identifies records in the sample listings.
REQUIRED_COLUMNS = ('header', 'alert_id')

# Load input data
print(f"Reading {INPUT_FILE}...")
df_input = pd.read_csv(INPUT_FILE)
print(f"Loaded {len(df_input):,} records")

# Fail fast: a missing column would otherwise only surface as a KeyError during
# or after the long labeling loop.
missing_columns = [column for column in REQUIRED_COLUMNS if column not in df_input.columns]
if missing_columns:
    raise KeyError(f"{INPUT_FILE} is missing required column(s): {missing_columns}")

# Initialize detector
detector = DirectionDetector()

Reading Preprocessed/MTA_Data_preprocessed_routespans.csv...
Loaded 226,162 records


In [13]:
# Process all headers
print("Processing headers...")

# Serialized results for a header with no detected direction (missing headers included)
NO_DIRECTION_SPANS = json.dumps([])
NO_DIRECTION_LABELS = json.dumps(['UNSPECIFIED'])

total_records = len(df_input)

# One JSON string per record, in row order, for the two new columns
direction_spans_list = []
direction_list = []

# Only the 'header' column is needed, so iterate over its values directly instead
# of building a Series per row with iterrows(). Counting by position (not by index
# label) keeps the progress messages correct for any index.
for record_number, header_text in enumerate(df_input['header'], start=1):
    if record_number % 10000 == 0:
        print(f"  Processed {record_number:,} / {total_records:,} records...")

    # Missing headers (NaN/None) have no text to scan; anything else is coerced to str
    if pd.isna(header_text):
        detections = []
    else:
        detections = detector.detect_all(str(header_text))['detections']

    if not detections:
        # No direction detected - label as UNSPECIFIED
        direction_spans_list.append(NO_DIRECTION_SPANS)
        direction_list.append(NO_DIRECTION_LABELS)
        continue

    direction_spans = [
        {
            'start': detection['start'],
            'end': detection['end'],
            'type': 'DIRECTION',
            'value': detection['label']
        }
        for detection in detections
    ]
    direction_labels = [detection['label'] for detection in detections]

    direction_spans_list.append(json.dumps(direction_spans))
    direction_list.append(json.dumps(direction_labels))

# Add new columns to the dataframe - direction first, then direction_spans
df_input['direction'] = direction_list
df_input['direction_spans'] = direction_spans_list

print(f"Completed processing {len(df_input):,} records")

Processing headers...
  Processed 10,000 / 226,162 records...
  Processed 20,000 / 226,162 records...
  Processed 30,000 / 226,162 records...
  Processed 40,000 / 226,162 records...
  Processed 50,000 / 226,162 records...
  Processed 60,000 / 226,162 records...
  Processed 70,000 / 226,162 records...
  Processed 80,000 / 226,162 records...
  Processed 90,000 / 226,162 records...
  Processed 100,000 / 226,162 records...
  Processed 110,000 / 226,162 records...
  Processed 120,000 / 226,162 records...
  Processed 130,000 / 226,162 records...
  Processed 140,000 / 226,162 records...
  Processed 150,000 / 226,162 records...
  Processed 160,000 / 226,162 records...
  Processed 170,000 / 226,162 records...
  Processed 180,000 / 226,162 records...
  Processed 190,000 / 226,162 records...
  Processed 200,000 / 226,162 records...
  Processed 210,000 / 226,162 records...
  Processed 220,000 / 226,162 records...
Completed processing 226,162 records


## 4. Generate Statistics

Analyze the detection results and show samples.


In [14]:
# Generate statistics
stats = {
    'total_records': len(df_input),
    'label_counts': defaultdict(int)
}

samples_by_label = defaultdict(list)

# Iterate over just the columns that are read here (cheaper than iterrows()).
for alert_id, header, labels_json, spans_json in zip(
    df_input['alert_id'], df_input['header'], df_input['direction'], df_input['direction_spans']
):
    labels = json.loads(labels_json)
    spans = json.loads(spans_json)

    for label in labels:
        # Counts are per detected mention, so a record can add to a label more than once
        stats['label_counts'][label] += 1

        # Collect samples (first 10 per label)
        if len(samples_by_label[label]) < 10:
            samples_by_label[label].append({
                'record_id': alert_id,
                'header': header,
                'spans': spans
            })

# Print statistics
print("\n" + "="*70)
print("DIRECTION LABELING STATISTICS")
print("="*70)

print(f"\nTotal records processed: {stats['total_records']:,}")

print("\n--- Label Distribution ---")

# Group by type
compass_labels = ['NORTHBOUND', 'SOUTHBOUND', 'EASTBOUND', 'WESTBOUND']
borough_labels = ['MANHATTAN_BOUND', 'QUEENS_BOUND', 'BRONX_BOUND', 'BROOKLYN_BOUND', 'STATENISLAND_BOUND']
local_labels = ['UPTOWN', 'DOWNTOWN']

compass_count = sum(stats['label_counts'][l] for l in compass_labels)
borough_count = sum(stats['label_counts'][l] for l in borough_labels)
local_count = sum(stats['label_counts'][l] for l in local_labels)
place_count = stats['label_counts']['PLACE_BOUND']
both_count = stats['label_counts']['BOTH_DIRECTIONS']
unspecified_count = stats['label_counts']['UNSPECIFIED']

# (display name, member labels, group total) - drives both the printout and the CSV
label_groups = [
    ('COMPASS', compass_labels, compass_count),
    ('BOROUGH', borough_labels, borough_count),
    ('LOCAL', local_labels, local_count),
]

for group_name, group_labels, group_count in label_groups:
    print(f"\n{group_name} directions: {group_count:,}")
    for label in group_labels:
        count = stats['label_counts'][label]
        if count > 0:
            print(f"  {label}: {count:,}")

print(f"\nPLACE_BOUND directions: {place_count:,}")
print(f"BOTH_DIRECTIONS: {both_count:,}")
print(f"UNSPECIFIED: {unspecified_count:,}")

print("\n" + "="*70)
print("SAMPLE DETECTIONS (First 5 per label)")
print("="*70)

for label in sorted(samples_by_label.keys()):
    samples = samples_by_label[label][:5]
    print(f"\n{label}:")
    for i, sample in enumerate(samples, 1):
        # Records with a missing header are labeled UNSPECIFIED and can appear here;
        # len() on a NaN header would raise TypeError, so normalize to text first.
        header_value = sample['header']
        header_str = '<missing header>' if pd.isna(header_value) else str(header_value)
        header_preview = header_str[:100] + '...' if len(header_str) > 100 else header_str
        print(f"  {i}. [{sample['record_id']}] {header_preview}")
        if sample['spans']:
            span_values = [s['value'] for s in sample['spans']]
            print(f"     Spans: {span_values}")

# Save statistics to CSV file
blank_row = {"Category": "", "Count": ""}
stats_output = []
stats_output.append({"Category": "Total Records", "Count": stats["total_records"]})
stats_output.append(dict(blank_row))
for group_name, group_labels, group_count in label_groups:
    stats_output.append({"Category": f"{group_name} Directions", "Count": group_count})
    for label in group_labels:
        count = stats["label_counts"][label]
        if count > 0:
            stats_output.append({"Category": f"  {label}", "Count": count})
    stats_output.append(dict(blank_row))
stats_output.append({"Category": "PLACE_BOUND Directions", "Count": place_count})
stats_output.append({"Category": "BOTH_DIRECTIONS", "Count": both_count})
stats_output.append({"Category": "UNSPECIFIED", "Count": unspecified_count})

# Save to CSV
STATS_OUTPUT_FILE = "EDA/direction_labeling_statistics.csv"
df_stats = pd.DataFrame(stats_output)
df_stats.to_csv(STATS_OUTPUT_FILE, index=False)
print(f"\n✓ Statistics saved to {STATS_OUTPUT_FILE}")


DIRECTION LABELING STATISTICS

Total records processed: 226,162

--- Label Distribution ---

COMPASS directions: 124,704
  NORTHBOUND: 58,870
  SOUTHBOUND: 59,491
  EASTBOUND: 3,003
  WESTBOUND: 3,340

BOROUGH directions: 5,989
  MANHATTAN_BOUND: 2,577
  QUEENS_BOUND: 1,283
  BRONX_BOUND: 784
  BROOKLYN_BOUND: 1,212
  STATENISLAND_BOUND: 133

LOCAL directions: 6,149
  UPTOWN: 3,070
  DOWNTOWN: 3,079

PLACE_BOUND directions: 25,530
BOTH_DIRECTIONS: 42,822
UNSPECIFIED: 44,906

SAMPLE DETECTIONS (First 5 per label)

BOTH_DIRECTIONS:
  1. [189489] L trains are running with delays in both directions after NYPD completed an investigation at 6 Av.
     Spans: ['BOTH_DIRECTIONS']
  2. [186126] 6 trains are delayed in both directions while we address a mechanical problem on a train at Brooklyn...
     Spans: ['BOTH_DIRECTIONS']
  3. [184474] M103 buses are no longer detoured in either direction after NYPD investigation at Bowery/Grand St co...
     Spans: ['BOTH_DIRECTIONS']
  4. [179577] A tr

In [15]:
# Analyze single vs multiple directions
print("\n" + "="*70)
print("DIRECTION COUNT ANALYSIS")
print("="*70)

total_records = len(df_input)


def percent_of_total(num_records):
    """Return num_records as a percentage of all records (0.0 for an empty dataset)."""
    return (num_records / total_records) * 100 if total_records else 0.0


direction_count_stats = defaultdict(int)
direction_count_samples = defaultdict(list)

# Iterate over just the columns that are read here (cheaper than iterrows()).
for alert_id, header, labels_json in zip(
    df_input['alert_id'], df_input['header'], df_input['direction']
):
    labels = json.loads(labels_json)

    # Count UNSPECIFIED as 0 directions
    num_directions = 0 if labels == ['UNSPECIFIED'] else len(labels)

    direction_count_stats[num_directions] += 1

    # Collect samples (first 5 per count)
    if len(direction_count_samples[num_directions]) < 5:
        direction_count_samples[num_directions].append({
            'record_id': alert_id,
            'header': header,
            'num_directions': num_directions,
            'direction': labels
        })

# Print statistics
print(f"\nTotal records: {len(df_input):,}")
print(f"\nDirection count breakdown:")

for count in sorted(direction_count_stats.keys()):
    num_records = direction_count_stats[count]
    percentage = percent_of_total(num_records)
    direction_word = "direction" if count == 1 else "directions"
    print(f"  {count} {direction_word}: {num_records:,} records ({percentage:.2f}%)")

# Show samples
print("\n" + "="*70)
print("SAMPLE RECORDS BY DIRECTION COUNT")
print("="*70)

for count in sorted(direction_count_samples.keys()):
    samples = direction_count_samples[count]
    direction_word = "Direction" if count == 1 else "Directions"
    print(f"\n{count} {direction_word}:")
    for i, sample in enumerate(samples, 1):
        # Zero-direction samples are the UNSPECIFIED records, which include rows with
        # a missing header; len() on a NaN header would raise TypeError, so normalize
        # to text first.
        header_value = sample['header']
        header_str = '<missing header>' if pd.isna(header_value) else str(header_value)
        header_preview = header_str[:100] + '...' if len(header_str) > 100 else header_str
        print(f"  {i}. [{sample['record_id']}] {header_preview}")
        print(f"     Labels: {sample['direction']}")

# Save to CSV
direction_count_output = []
for count in sorted(direction_count_stats.keys()):
    num_records = direction_count_stats[count]
    direction_word = "direction" if count == 1 else "directions"
    direction_count_output.append({
        "Direction_Count": count,
        "Description": f"{count} {direction_word}",
        "Number_of_Records": num_records,
        "Percentage": round(percent_of_total(num_records), 2)
    })

# Add summary statistics
zero_direction = direction_count_stats.get(0, 0)
single_direction = direction_count_stats.get(1, 0)
multiple_directions = sum(
    num_records for count, num_records in direction_count_stats.items() if count > 1
)

# The empty dict becomes a blank separator row in the CSV
direction_count_output.append({})
summary_rows = [
    ("No direction (UNSPECIFIED)", zero_direction),
    ("Single direction", single_direction),
    ("Multiple directions", multiple_directions),
]
for description, num_records in summary_rows:
    direction_count_output.append({
        "Direction_Count": "Summary",
        "Description": description,
        "Number_of_Records": num_records,
        "Percentage": round(percent_of_total(num_records), 2)
    })

DIRECTION_COUNT_FILE = "EDA/direction_count_analysis.csv"
df_direction_count = pd.DataFrame(direction_count_output)
df_direction_count.to_csv(DIRECTION_COUNT_FILE, index=False)
print(f"\n✓ Direction count analysis saved to {DIRECTION_COUNT_FILE}")


DIRECTION COUNT ANALYSIS

Total records: 226,162

Direction count breakdown:
  0 directions: 44,906 records (19.86%)
  1 direction: 162,484 records (71.84%)
  2 directions: 15,727 records (6.95%)
  3 directions: 1,991 records (0.88%)
  4 directions: 594 records (0.26%)
  5 directions: 232 records (0.10%)
  6 directions: 102 records (0.05%)
  7 directions: 47 records (0.02%)
  8 directions: 26 records (0.01%)
  9 directions: 8 records (0.00%)
  10 directions: 12 records (0.01%)
  11 directions: 10 records (0.00%)
  12 directions: 8 records (0.00%)
  13 directions: 10 records (0.00%)
  14 directions: 5 records (0.00%)

SAMPLE RECORDS BY DIRECTION COUNT

0 Directions:
  1. [180128] A C trains are delayed while we conduct emergency track repairs in Manhattan.
     Labels: ['UNSPECIFIED']
  2. [185055] All M trains will depart from Track 2.
     Labels: ['UNSPECIFIED']
  3. [178094] Q20A buses are no longer detoured after road work on 20th Ave between College Point Blvd and 124th S...
    

In [16]:
# Export PLACE_BOUND detections to separate CSV for analysis
print("\n" + "="*70)
print("PLACE_BOUND DETECTION ANALYSIS")
print("="*70)

# Column order of the exported table. Passed explicitly to the DataFrame
# constructor so the frame keeps its schema even when no record qualifies.
PLACE_BOUND_COLUMNS = ['alert_id', 'header', 'all_place_bound_spans', 'potential_issue']
LONG_SPAN_WORD_LIMIT = 5  # place names with more words than this are flagged LONG_SPAN


def _place_word_count(span_text):
    """Count the words in the place-name part of a PLACE_BOUND span.

    The trailing "bound" token is cut off first: at the last hyphen when the
    span contains "-bound" (e.g. "Forest Hills-bound" -> "Forest Hills"),
    otherwise at the last space. The remainder is split on runs of hyphens,
    whitespace and slashes, and the non-empty pieces are counted.
    """
    if '-bound' in span_text.lower():
        place_part = span_text.rsplit('-', 1)[0]
    else:
        place_part = span_text.rsplit(' ', 1)[0]
    return len([w for w in re.split(r'[-\s/]+', place_part) if w])


place_bound_records = []

# Iterate the three needed columns directly; this avoids building a Series per
# row as iterrows() does.
for alert_id, raw_header, raw_spans in zip(
    df_input['alert_id'], df_input['header'], df_input['direction_spans']
):
    spans = json.loads(raw_spans)
    header = raw_header if pd.notna(raw_header) else ""

    # Get all PLACE_BOUND spans for this record
    place_bound_spans = [s for s in spans if s['value'] == 'PLACE_BOUND']
    if not place_bound_spans:
        continue

    # Recover each span's surface text from its character offsets in the header
    all_span_texts = [
        header[s['start']:s['end']] if header else ""
        for s in place_bound_spans
    ]

    # Longest place name (in words) among this record's spans
    max_word_count = max(_place_word_count(text) for text in all_span_texts)

    # Only a leading lowercase *letter* counts; digits and empty text do not
    has_lowercase_start = any(text[:1].islower() for text in all_span_texts)

    # Determine potential issue (LONG_SPAN takes precedence over LOWERCASE_START)
    potential_issue = ""
    if max_word_count > LONG_SPAN_WORD_LIMIT:
        potential_issue = "LONG_SPAN"
    elif has_lowercase_start:
        potential_issue = "LOWERCASE_START"

    place_bound_records.append({
        'alert_id': alert_id,
        'header': header,
        'all_place_bound_spans': json.dumps(all_span_texts),
        'potential_issue': potential_issue
    })

# Create DataFrame (explicit columns keep the header row when there are no records)
df_place_bound = pd.DataFrame(place_bound_records, columns=PLACE_BOUND_COLUMNS)

# Display summary
def _clip_header(text, limit):
    """Return ``text`` unchanged when it has at most ``limit`` characters, else its first ``limit`` characters plus '...'."""
    if len(text) > limit:
        return text[:limit] + '...'
    return text


place_bound_total = len(df_place_bound)
print("\nTotal records with PLACE_BOUND: {:,}".format(place_bound_total))

if place_bound_total > 0:
    # Span counts live on a derived frame, so df_place_bound itself never
    # carries the helper column into the export.
    spans_per_record = [len(json.loads(x)) for x in df_place_bound['all_place_bound_spans']]
    df_with_counts = df_place_bound.assign(num_spans=spans_per_record)

    # How many records contain 1, 2, 3, ... PLACE_BOUND spans
    print("\n--- PLACE_BOUND Count per Record ---")
    span_counts = df_with_counts['num_spans'].value_counts().sort_index()
    for num_spans, count in span_counts.items():
        print("  {} PLACE_BOUND(s): {:,} records".format(num_spans, count))

    # Breakdown by flagged issue; the empty label is the "no issue" bucket
    print("\n--- Potential Issues ---")
    issue_counts = df_with_counts['potential_issue'].value_counts()
    no_issue_count = len(df_with_counts[df_with_counts['potential_issue'] == ""])
    print("  No issues: {:,}".format(no_issue_count))
    for issue, count in issue_counts.items():
        if not issue:
            continue  # already reported as "No issues" above
        print("  {}: {:,}".format(issue, count))

    # Examples of records holding more than one PLACE_BOUND span
    multi_pb_df = df_with_counts[df_with_counts['num_spans'] > 1]
    if len(multi_pb_df) > 0:
        print("\n--- Samples with Multiple PLACE_BOUND (first 5 records) ---")
        multi_head = multi_pb_df.head(5)
        for i, (header_text, spans_json) in enumerate(
            zip(multi_head['header'], multi_head['all_place_bound_spans']), 1
        ):
            print("  {}. {}".format(i, _clip_header(header_text, 100)))
            print("     Spans: {}".format(spans_json))

    # Examples of records flagged with a potential issue
    issues_df = df_with_counts[df_with_counts['potential_issue'] != ""]
    if len(issues_df) > 0:
        print("\n--- Samples with Potential Issues (first 10) ---")
        issues_head = issues_df.head(10)
        for i, (issue_label, header_text, spans_json) in enumerate(
            zip(issues_head['potential_issue'], issues_head['header'], issues_head['all_place_bound_spans']), 1
        ):
            print("  {}. [{}] {}".format(i, issue_label, _clip_header(header_text, 80)))
            print("     Spans: {}".format(spans_json))

# Write the PLACE_BOUND review table (without the index column) and confirm the destination
PLACE_BOUND_OUTPUT_FILE = "EDA/place_bound_detections.csv"
df_place_bound.to_csv(path_or_buf=PLACE_BOUND_OUTPUT_FILE, index=False)
print("\n✓ PLACE_BOUND detections saved to {}".format(PLACE_BOUND_OUTPUT_FILE))


PLACE_BOUND DETECTION ANALYSIS

Total records with PLACE_BOUND: 21,287

--- PLACE_BOUND Count per Record ---
  1 PLACE_BOUND(s): 17,440 records
  2 PLACE_BOUND(s): 3,575 records
  3 PLACE_BOUND(s): 182 records
  4 PLACE_BOUND(s): 70 records
  5 PLACE_BOUND(s): 14 records
  6 PLACE_BOUND(s): 2 records
  7 PLACE_BOUND(s): 2 records
  9 PLACE_BOUND(s): 2 records

--- Potential Issues ---
  No issues: 21,174
  LOWERCASE_START: 95
  LONG_SPAN: 18

--- Samples with Multiple PLACE_BOUND (first 5 records) ---
  1. Broad St-bound J and Forest Hills-bound M trains are running with delays after we removed a train wi...
     Spans: ["Broad St-bound", "Forest Hills-bound"]
  2. Broad St-bound J and Forest Hills-bound M trains are delayed while EMS responds to someone in need a...
     Spans: ["Broad St-bound", "Forest Hills-bound"]
  3. Broad St-bound J and Forest Hills-bound M trains are running with delays after EMS responded to some...
     Spans: ["Broad St-bound", "Forest Hills-bound"]
  4. J

## 5. Save Output

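Write the labeled dataset (`df_input`, including the `direction` and `direction_spans` columns) to `OUTPUT_FILE` as CSV, then preview the first rows.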


In [17]:
# Persist the labeled dataset (index column omitted), then report how many rows were written
print("\nWriting output to {}...".format(OUTPUT_FILE))
df_input.to_csv(path_or_buf=OUTPUT_FILE, index=False)
records_written = len(df_input)
print("Successfully wrote {:,} records to {}".format(records_written, OUTPUT_FILE))

# Preview the result; the bare expression on the last line is what the cell displays.
# NOTE(review): the rendered preview includes an 'Unnamed: 0' column — presumably an
# index column carried over from the input CSV; confirm whether it should be dropped on read.
print("\nFirst 5 rows of output:")
df_input.head(5)


Writing output to Preprocessed/MTA_Data_silver_directions.csv...
Successfully wrote 226,162 records to Preprocessed/MTA_Data_silver_directions.csv

First 5 rows of output:


Unnamed: 0,alert_id,date,agency,status_label,affected,affected_spans,header,direction,direction_spans
0,180128,11/05/2022 05:58:00 PM,NYCT Subway,delays,"[""A"", ""C""]","[{""start"": 0, ""end"": 1, ""type"": ""ROUTE"", ""valu...",A C trains are delayed while we conduct emerge...,"[""UNSPECIFIED""]",[]
1,189489,12/20/2022 07:09:00 PM,NYCT Subway,delays,"[""L""]","[{""start"": 0, ""end"": 1, ""type"": ""ROUTE"", ""valu...",L trains are running with delays in both direc...,"[""BOTH_DIRECTIONS""]","[{""start"": 36, ""end"": 51, ""type"": ""DIRECTION"",..."
2,189321,12/20/2022 12:31:00 AM,NYCT Subway,delays,"[""J""]","[{""start"": 14, ""end"": 15, ""type"": ""ROUTE"", ""va...",Jamaica-bound J trains are delayed while we re...,"[""PLACE_BOUND""]","[{""start"": 0, ""end"": 13, ""type"": ""DIRECTION"", ..."
3,188948,12/18/2022 06:12:00 AM,NYCT Subway,delays,"[""Q""]","[{""start"": 11, ""end"": 12, ""type"": ""ROUTE"", ""va...",Southbound Q trains are running with delays af...,"[""SOUTHBOUND""]","[{""start"": 0, ""end"": 10, ""type"": ""DIRECTION"", ..."
4,187749,12/12/2022 02:26:00 PM,NYCT Subway,delays,"[""B"", ""C""]","[{""start"": 11, ""end"": 12, ""type"": ""ROUTE"", ""va...",Southbound B C trains are running with delays ...,"[""SOUTHBOUND""]","[{""start"": 0, ""end"": 10, ""type"": ""DIRECTION"", ..."
