# IMDb Knowledge Graph Builder for TransE Training

This notebook builds a Knowledge Graph from IMDb TSV files stored in Google Drive.

**Modes:**
- **Full KG**: Processes all movies (produces ~81M triples, ~16.5M entities)
- **Filtered KG**: Filters movies by year, type, and top billing (target: 5-10M triples)

**Entities:**
- Movies (tconst)
- Persons (nconst)
- Genres (from title.basics genres column)

**Relations:**
- (movie) --HAS_GENRE--> (genre)
- (person) --DIRECTED--> (movie)
- (person) --WROTE--> (movie)
- (person) --ACTED_IN--> (movie)

In [None]:
# Install required packages (if not already installed)
!pip install pandas tqdm -q

In [1]:
# Configuration
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import os
import random

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# ============================================================================
# PIPELINE MODE SELECTION
# ============================================================================
USE_FILTERED_MODE = True  # Set to False for full KG, True for filtered KG

# ============================================================================
# FILTERING CONFIGURATION (only used if USE_FILTERED_MODE = True)
# ============================================================================
YEAR_MIN = 1970  # Keep only movies with startYear >= YEAR_MIN
TOP_BILLING = 5  # Keep only actors/actresses with ordering <= TOP_BILLING

# ============================================================================
# DRY-RUN MODE (for quick testing)
# ============================================================================
LIMIT_ROWS = None  # Set to integer (e.g., 10000) to limit rows per file for testing
LIMIT_MOVIES = None  # Alternative: limit number of movies to process (after filtering)

# ============================================================================
# FULL KG STATISTICS (for comparison - update if you know the full KG stats)
# ============================================================================
FULL_KG_STATS = {
    'movies': 16500000,  # Approximate - update with actual if known
    'triples': 81000000,  # Approximate - update with actual if known
    'entities': 16500000  # Approximate - update with actual if known
}

# ============================================================================
# PATHS AND FILES
# ============================================================================
DRIVE_ROOT = '/content/drive/MyDrive'
INPUT_DIR = f'{DRIVE_ROOT}/Knowledge Graph'  # Folder containing TSV files

# Output directory based on mode
if USE_FILTERED_MODE:
    OUTPUT_DIR = f'{DRIVE_ROOT}/kg_output_filtered'
else:
    OUTPUT_DIR = f'{DRIVE_ROOT}/kg_output'

# File names
FILES = {
    'basics': 'title.basics.tsv',
    'crew': 'title.crew.tsv',
    'principals': 'title.principals.tsv',
    'ratings': 'title.ratings.tsv'  # Not used for triples but can be read
}

print("=" * 60)
print("CONFIGURATION")
print("=" * 60)
print(f"Mode: {'FILTERED KG' if USE_FILTERED_MODE else 'FULL KG'}")
print(f"Input directory: {INPUT_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
if USE_FILTERED_MODE:
    print(f"  Year minimum: {YEAR_MIN}")
    print(f"  Top billing limit: {TOP_BILLING}")
print(f"Dry-run limit rows: {LIMIT_ROWS if LIMIT_ROWS else 'None'}")
print(f"Dry-run limit movies: {LIMIT_MOVIES if LIMIT_MOVIES else 'None'}")
print("=" * 60)


CONFIGURATION
Mode: FILTERED KG
Input directory: /content/drive/MyDrive/Knowledge Graph
Output directory: /content/drive/MyDrive/kg_output_filtered
  Year minimum: 1970
  Top billing limit: 5
Dry-run limit rows: None
Dry-run limit movies: None


In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory ready: {OUTPUT_DIR}")


Mounted at /content/drive
Output directory ready: /content/drive/MyDrive/kg_output_filtered


In [3]:
# Helper functions

def safe_split(value, sep=','):
    """Safely split a value, handling missing values marked as \\N"""
    if pd.isna(value) or value == '\\N' or value == '':
        return []
    return [v.strip() for v in str(value).split(sep) if v.strip() and v.strip() != '\\N']

def read_tsv_chunked(filepath, limit=None, chunksize=100000):
    """Read TSV file with optional row limit and chunking"""
    if limit:
        # For dry-run, read only first N rows
        df = pd.read_csv(filepath, sep='\t', nrows=limit, low_memory=False)
        return [df]  # Return as single chunk
    else:
        # Full processing with chunking
        return pd.read_csv(filepath, sep='\t', chunksize=chunksize, low_memory=False)

def get_file_path(filename):
    """Get full path for a file in INPUT_DIR"""
    return os.path.join(INPUT_DIR, filename)

print("Helper functions defined")


Helper functions defined


In [4]:
# Step 1: Extract entities and build triples from title.basics.tsv
# Entities: movies (tconst), genres
# Relations: (movie) --HAS_GENRE--> (genre)
# Apply filtering if USE_FILTERED_MODE is True

print("=" * 60)
print("Step 1: Processing title.basics.tsv")
print("=" * 60)

triples = []
movies = set()
genres = set()
kept_movies = set()  # Set of tconst that passed all filters (used in later steps)

basics_path = get_file_path(FILES['basics'])
print(f"Reading: {basics_path}")
if USE_FILTERED_MODE:
    print(f"Filtering: titleType='movie', startYear>={YEAR_MIN}, isAdult=0, genres not missing")

chunks = read_tsv_chunked(basics_path, limit=LIMIT_ROWS)
movies_processed = 0
movies_filtered_out = 0

for chunk_idx, chunk in enumerate(chunks):
    print(f"Processing chunk {chunk_idx + 1}... (rows: {len(chunk)})")

    # Filter out rows with missing tconst
    chunk = chunk[chunk['tconst'].notna() & (chunk['tconst'] != '\\N')]

    if USE_FILTERED_MODE:
        # Apply filtering rules
        # 1. Keep only titleType == "movie"
        chunk = chunk[chunk['titleType'] == 'movie']

        # 2. Exclude isAdult == 1
        chunk = chunk[(chunk['isAdult'].isna()) | (chunk['isAdult'] == 0) | (chunk['isAdult'] == '0')]

        # 3. Filter by startYear >= YEAR_MIN and not missing
        chunk = chunk[chunk['startYear'].notna() & (chunk['startYear'] != '\\N')]
        # Convert startYear to numeric, handling errors
        chunk['startYear_numeric'] = pd.to_numeric(chunk['startYear'], errors='coerce')
        chunk = chunk[chunk['startYear_numeric'] >= YEAR_MIN]

        # 4. Drop rows with missing genres
        chunk = chunk[chunk['genres'].notna() & (chunk['genres'] != '\\N') & (chunk['genres'] != '')]

    movies_processed += len(chunk)

    for _, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Chunk {chunk_idx + 1}"):
        movie_id = str(row['tconst']).strip()
        if not movie_id or movie_id == '\\N':
            continue

        # Check LIMIT_MOVIES if set
        if LIMIT_MOVIES and len(kept_movies) >= LIMIT_MOVIES:
            break

        movies.add(movie_id)
        kept_movies.add(movie_id)

        # Extract genres
        genre_list = safe_split(row.get('genres', ''))
        for genre in genre_list:
            if genre:
                genres.add(genre)
                triples.append(('HAS_GENRE', movie_id, genre))

    if LIMIT_MOVIES and len(kept_movies) >= LIMIT_MOVIES:
        print(f"Reached LIMIT_MOVIES={LIMIT_MOVIES}, stopping processing")
        break

movies_filtered_out = movies_processed - len(kept_movies)

print(f"\nStep 1 Complete:")
print(f"  Movies processed: {movies_processed:,}")
if USE_FILTERED_MODE:
    print(f"  Movies kept after filtering: {len(kept_movies):,}")
    print(f"  Movies filtered out: {movies_filtered_out:,}")
else:
    print(f"  Movies found: {len(movies):,}")
print(f"  Genres found: {len(genres)}")
print(f"  HAS_GENRE triples: {sum(1 for t in triples if t[0] == 'HAS_GENRE'):,}")


Step 1: Processing title.basics.tsv
Reading: /content/drive/MyDrive/Knowledge Graph/title.basics.tsv
Filtering: titleType='movie', startYear>=1970, isAdult=0, genres not missing
Processing chunk 1... (rows: 100000)


Chunk 1: 100%|██████████| 22597/22597 [00:01<00:00, 20714.08it/s]


Processing chunk 2... (rows: 100000)


Chunk 2: 100%|██████████| 25049/25049 [00:01<00:00, 22535.56it/s]


Processing chunk 3... (rows: 100000)


Chunk 3: 100%|██████████| 19933/19933 [00:00<00:00, 21324.95it/s]


Processing chunk 4... (rows: 100000)


Chunk 4: 100%|██████████| 18095/18095 [00:00<00:00, 23400.03it/s]


Processing chunk 5... (rows: 100000)


Chunk 5: 100%|██████████| 12308/12308 [00:00<00:00, 21723.56it/s]


Processing chunk 6... (rows: 100000)


Chunk 6: 0it [00:00, ?it/s]


Processing chunk 7... (rows: 100000)


Chunk 7: 0it [00:00, ?it/s]


Processing chunk 8... (rows: 100000)


Chunk 8: 100%|██████████| 3245/3245 [00:00<00:00, 24210.59it/s]


Processing chunk 9... (rows: 100000)


Chunk 9: 100%|██████████| 3344/3344 [00:00<00:00, 21820.63it/s]


Processing chunk 10... (rows: 100000)


Chunk 10: 100%|██████████| 2680/2680 [00:00<00:00, 22363.79it/s]


Processing chunk 11... (rows: 100000)


Chunk 11: 100%|██████████| 2724/2724 [00:00<00:00, 21500.06it/s]


Processing chunk 12... (rows: 100000)


Chunk 12: 100%|██████████| 2634/2634 [00:00<00:00, 21802.83it/s]


Processing chunk 13... (rows: 100000)


Chunk 13: 100%|██████████| 2800/2800 [00:00<00:00, 21962.83it/s]


Processing chunk 14... (rows: 100000)


Chunk 14: 100%|██████████| 2374/2374 [00:00<00:00, 20817.97it/s]


Processing chunk 15... (rows: 100000)


Chunk 15: 100%|██████████| 2875/2875 [00:00<00:00, 22288.68it/s]


Processing chunk 16... (rows: 100000)


Chunk 16: 100%|██████████| 2437/2437 [00:00<00:00, 22289.40it/s]


Processing chunk 17... (rows: 100000)


Chunk 17: 100%|██████████| 2609/2609 [00:00<00:00, 21730.81it/s]


Processing chunk 18... (rows: 100000)


Chunk 18: 100%|██████████| 2559/2559 [00:00<00:00, 21779.20it/s]


Processing chunk 19... (rows: 100000)


Chunk 19: 100%|██████████| 2274/2274 [00:00<00:00, 22799.54it/s]


Processing chunk 20... (rows: 100000)


Chunk 20: 100%|██████████| 2503/2503 [00:00<00:00, 19452.69it/s]


Processing chunk 21... (rows: 100000)


Chunk 21: 100%|██████████| 2373/2373 [00:00<00:00, 19941.18it/s]


Processing chunk 22... (rows: 100000)


Chunk 22: 100%|██████████| 2327/2327 [00:00<00:00, 16988.52it/s]


Processing chunk 23... (rows: 100000)


Chunk 23: 100%|██████████| 2450/2450 [00:00<00:00, 21509.16it/s]


Processing chunk 24... (rows: 100000)


Chunk 24: 100%|██████████| 2539/2539 [00:00<00:00, 21857.89it/s]


Processing chunk 25... (rows: 100000)


Chunk 25: 100%|██████████| 2749/2749 [00:00<00:00, 21727.68it/s]


Processing chunk 26... (rows: 100000)


Chunk 26: 100%|██████████| 2517/2517 [00:00<00:00, 21991.09it/s]


Processing chunk 27... (rows: 100000)


Chunk 27: 100%|██████████| 2756/2756 [00:00<00:00, 22290.85it/s]


Processing chunk 28... (rows: 100000)


Chunk 28: 100%|██████████| 3255/3255 [00:00<00:00, 21802.74it/s]


Processing chunk 29... (rows: 100000)


Chunk 29: 100%|██████████| 3612/3612 [00:00<00:00, 22646.63it/s]


Processing chunk 30... (rows: 100000)


Chunk 30: 100%|██████████| 2957/2957 [00:00<00:00, 21995.69it/s]


Processing chunk 31... (rows: 100000)


Chunk 31: 100%|██████████| 2561/2561 [00:00<00:00, 21132.14it/s]


Processing chunk 32... (rows: 100000)


Chunk 32: 100%|██████████| 2819/2819 [00:00<00:00, 21893.47it/s]


Processing chunk 33... (rows: 100000)


Chunk 33: 100%|██████████| 3057/3057 [00:00<00:00, 20956.85it/s]


Processing chunk 34... (rows: 100000)


Chunk 34: 100%|██████████| 2736/2736 [00:00<00:00, 21826.64it/s]


Processing chunk 35... (rows: 100000)


Chunk 35: 100%|██████████| 2847/2847 [00:00<00:00, 21267.11it/s]


Processing chunk 36... (rows: 100000)


Chunk 36: 100%|██████████| 2895/2895 [00:00<00:00, 21153.35it/s]


Processing chunk 37... (rows: 100000)


Chunk 37: 100%|██████████| 2432/2432 [00:00<00:00, 22924.33it/s]


Processing chunk 38... (rows: 100000)


Chunk 38: 100%|██████████| 2895/2895 [00:00<00:00, 22399.35it/s]


Processing chunk 39... (rows: 100000)


Chunk 39: 100%|██████████| 2932/2932 [00:00<00:00, 22270.90it/s]


Processing chunk 40... (rows: 100000)


Chunk 40: 100%|██████████| 3111/3111 [00:00<00:00, 21862.13it/s]


Processing chunk 41... (rows: 100000)


Chunk 41: 100%|██████████| 2169/2169 [00:00<00:00, 21363.93it/s]


Processing chunk 42... (rows: 100000)


Chunk 42: 100%|██████████| 2399/2399 [00:00<00:00, 23370.10it/s]


Processing chunk 43... (rows: 100000)


Chunk 43: 100%|██████████| 2196/2196 [00:00<00:00, 21532.33it/s]


Processing chunk 44... (rows: 100000)


Chunk 44: 100%|██████████| 2275/2275 [00:00<00:00, 23759.71it/s]


Processing chunk 45... (rows: 100000)


Chunk 45: 100%|██████████| 2987/2987 [00:00<00:00, 21509.14it/s]


Processing chunk 46... (rows: 100000)


Chunk 46: 100%|██████████| 3303/3303 [00:00<00:00, 21392.11it/s]


Processing chunk 47... (rows: 100000)


Chunk 47: 100%|██████████| 4068/4068 [00:00<00:00, 23125.43it/s]


Processing chunk 48... (rows: 100000)


Chunk 48: 100%|██████████| 3886/3886 [00:00<00:00, 22465.14it/s]


Processing chunk 49... (rows: 100000)


Chunk 49: 100%|██████████| 3894/3894 [00:00<00:00, 21510.36it/s]


Processing chunk 50... (rows: 100000)


Chunk 50: 100%|██████████| 2729/2729 [00:00<00:00, 22788.32it/s]


Processing chunk 51... (rows: 100000)


Chunk 51: 100%|██████████| 3350/3350 [00:00<00:00, 22476.65it/s]


Processing chunk 52... (rows: 100000)


Chunk 52: 100%|██████████| 3281/3281 [00:00<00:00, 21312.48it/s]


Processing chunk 53... (rows: 100000)


Chunk 53: 100%|██████████| 3552/3552 [00:00<00:00, 21121.31it/s]


Processing chunk 54... (rows: 100000)


Chunk 54: 100%|██████████| 3370/3370 [00:00<00:00, 21925.68it/s]


Processing chunk 55... (rows: 100000)


Chunk 55: 100%|██████████| 2989/2989 [00:00<00:00, 22343.89it/s]


Processing chunk 56... (rows: 100000)


Chunk 56: 100%|██████████| 3514/3514 [00:00<00:00, 22845.34it/s]


Processing chunk 57... (rows: 100000)


Chunk 57: 100%|██████████| 3350/3350 [00:00<00:00, 21669.34it/s]


Processing chunk 58... (rows: 100000)


Chunk 58: 100%|██████████| 3736/3736 [00:00<00:00, 21293.86it/s]


Processing chunk 59... (rows: 100000)


Chunk 59: 100%|██████████| 3762/3762 [00:00<00:00, 22163.25it/s]


Processing chunk 60... (rows: 100000)


Chunk 60: 100%|██████████| 3780/3780 [00:00<00:00, 19441.55it/s]


Processing chunk 61... (rows: 100000)


Chunk 61: 100%|██████████| 4325/4325 [00:00<00:00, 20088.57it/s]


Processing chunk 62... (rows: 100000)


Chunk 62: 100%|██████████| 3812/3812 [00:00<00:00, 20409.67it/s]


Processing chunk 63... (rows: 100000)


Chunk 63: 100%|██████████| 3612/3612 [00:00<00:00, 22296.21it/s]


Processing chunk 64... (rows: 100000)


Chunk 64: 100%|██████████| 2847/2847 [00:00<00:00, 22847.77it/s]


Processing chunk 65... (rows: 100000)


Chunk 65: 100%|██████████| 3288/3288 [00:00<00:00, 24027.62it/s]


Processing chunk 66... (rows: 100000)


Chunk 66: 100%|██████████| 2737/2737 [00:00<00:00, 21286.11it/s]


Processing chunk 67... (rows: 100000)


Chunk 67: 100%|██████████| 3182/3182 [00:00<00:00, 23599.17it/s]


Processing chunk 68... (rows: 100000)


Chunk 68: 100%|██████████| 2836/2836 [00:00<00:00, 22255.61it/s]


Processing chunk 69... (rows: 100000)


Chunk 69: 100%|██████████| 2875/2875 [00:00<00:00, 22374.83it/s]


Processing chunk 70... (rows: 100000)


Chunk 70: 100%|██████████| 2655/2655 [00:00<00:00, 22900.04it/s]


Processing chunk 71... (rows: 100000)


Chunk 71: 100%|██████████| 2852/2852 [00:00<00:00, 20229.85it/s]


Processing chunk 72... (rows: 100000)


Chunk 72: 100%|██████████| 3211/3211 [00:00<00:00, 21556.31it/s]


Processing chunk 73... (rows: 100000)


Chunk 73: 100%|██████████| 2725/2725 [00:00<00:00, 21380.78it/s]


Processing chunk 74... (rows: 100000)


Chunk 74: 100%|██████████| 2728/2728 [00:00<00:00, 22504.43it/s]


Processing chunk 75... (rows: 100000)


Chunk 75: 100%|██████████| 3760/3760 [00:00<00:00, 21519.49it/s]


Processing chunk 76... (rows: 100000)


Chunk 76: 100%|██████████| 3450/3450 [00:00<00:00, 23700.40it/s]


Processing chunk 77... (rows: 100000)


Chunk 77: 100%|██████████| 2962/2962 [00:00<00:00, 22473.54it/s]


Processing chunk 78... (rows: 100000)


Chunk 78: 100%|██████████| 3089/3089 [00:00<00:00, 22221.89it/s]


Processing chunk 79... (rows: 100000)


Chunk 79: 100%|██████████| 2877/2877 [00:00<00:00, 22595.58it/s]


Processing chunk 80... (rows: 100000)


Chunk 80: 100%|██████████| 2794/2794 [00:00<00:00, 18815.97it/s]


Processing chunk 81... (rows: 100000)


Chunk 81: 100%|██████████| 3069/3069 [00:00<00:00, 22153.32it/s]


Processing chunk 82... (rows: 100000)


Chunk 82: 100%|██████████| 2511/2511 [00:00<00:00, 21707.60it/s]


Processing chunk 83... (rows: 100000)


Chunk 83: 100%|██████████| 3153/3153 [00:00<00:00, 21479.70it/s]


Processing chunk 84... (rows: 100000)


Chunk 84: 100%|██████████| 3105/3105 [00:00<00:00, 21899.07it/s]


Processing chunk 85... (rows: 100000)


Chunk 85: 100%|██████████| 3075/3075 [00:00<00:00, 21768.90it/s]


Processing chunk 86... (rows: 100000)


Chunk 86: 100%|██████████| 3163/3163 [00:00<00:00, 21381.54it/s]


Processing chunk 87... (rows: 100000)


Chunk 87: 100%|██████████| 3052/3052 [00:00<00:00, 18749.61it/s]


Processing chunk 88... (rows: 100000)


Chunk 88: 100%|██████████| 2988/2988 [00:00<00:00, 23489.53it/s]


Processing chunk 89... (rows: 100000)


Chunk 89: 100%|██████████| 3241/3241 [00:00<00:00, 22065.11it/s]


Processing chunk 90... (rows: 100000)


Chunk 90: 100%|██████████| 3186/3186 [00:00<00:00, 20975.27it/s]


Processing chunk 91... (rows: 100000)


Chunk 91: 100%|██████████| 3072/3072 [00:00<00:00, 23421.98it/s]


Processing chunk 92... (rows: 100000)


Chunk 92: 100%|██████████| 2955/2955 [00:00<00:00, 23073.86it/s]


Processing chunk 93... (rows: 100000)


Chunk 93: 100%|██████████| 3604/3604 [00:00<00:00, 22097.52it/s]


Processing chunk 94... (rows: 100000)


Chunk 94: 100%|██████████| 4620/4620 [00:00<00:00, 21168.38it/s]


Processing chunk 95... (rows: 100000)


Chunk 95: 100%|██████████| 3839/3839 [00:00<00:00, 23000.10it/s]


Processing chunk 96... (rows: 100000)


Chunk 96: 100%|██████████| 4268/4268 [00:00<00:00, 19830.32it/s]


Processing chunk 97... (rows: 100000)


Chunk 97: 100%|██████████| 3870/3870 [00:00<00:00, 22712.30it/s]


Processing chunk 98... (rows: 100000)


Chunk 98: 100%|██████████| 4305/4305 [00:00<00:00, 21725.32it/s]


Processing chunk 99... (rows: 100000)


Chunk 99: 100%|██████████| 4651/4651 [00:00<00:00, 22474.68it/s]


Processing chunk 100... (rows: 100000)


Chunk 100: 100%|██████████| 3466/3466 [00:00<00:00, 21644.07it/s]


Processing chunk 101... (rows: 100000)


Chunk 101: 100%|██████████| 4214/4214 [00:00<00:00, 23731.34it/s]


Processing chunk 102... (rows: 100000)


Chunk 102: 100%|██████████| 3882/3882 [00:00<00:00, 23434.19it/s]


Processing chunk 103... (rows: 100000)


Chunk 103: 100%|██████████| 4252/4252 [00:00<00:00, 21720.26it/s]


Processing chunk 104... (rows: 100000)


Chunk 104: 100%|██████████| 3597/3597 [00:00<00:00, 21684.39it/s]


Processing chunk 105... (rows: 100000)


Chunk 105: 100%|██████████| 3233/3233 [00:00<00:00, 22057.15it/s]


Processing chunk 106... (rows: 100000)


Chunk 106: 100%|██████████| 3266/3266 [00:00<00:00, 22410.32it/s]


Processing chunk 107... (rows: 100000)


Chunk 107: 100%|██████████| 3054/3054 [00:00<00:00, 22216.45it/s]


Processing chunk 108... (rows: 100000)


Chunk 108: 100%|██████████| 3298/3298 [00:00<00:00, 22549.91it/s]


Processing chunk 109... (rows: 100000)


Chunk 109: 100%|██████████| 3069/3069 [00:00<00:00, 23272.37it/s]


Processing chunk 110... (rows: 100000)


Chunk 110: 100%|██████████| 3123/3123 [00:00<00:00, 24018.92it/s]


Processing chunk 111... (rows: 100000)


Chunk 111: 100%|██████████| 3004/3004 [00:00<00:00, 21888.94it/s]


Processing chunk 112... (rows: 100000)


Chunk 112: 100%|██████████| 2587/2587 [00:00<00:00, 21542.82it/s]


Processing chunk 113... (rows: 100000)


Chunk 113: 100%|██████████| 2744/2744 [00:00<00:00, 22088.25it/s]


Processing chunk 114... (rows: 100000)


Chunk 114: 100%|██████████| 2454/2454 [00:00<00:00, 22668.37it/s]


Processing chunk 115... (rows: 100000)


Chunk 115: 100%|██████████| 2823/2823 [00:00<00:00, 22511.39it/s]


Processing chunk 116... (rows: 100000)


Chunk 116: 100%|██████████| 2823/2823 [00:00<00:00, 21678.79it/s]


Processing chunk 117... (rows: 100000)


Chunk 117: 100%|██████████| 2200/2200 [00:00<00:00, 22773.75it/s]


Processing chunk 118... (rows: 100000)


Chunk 118: 100%|██████████| 1818/1818 [00:00<00:00, 22468.03it/s]


Processing chunk 119... (rows: 100000)


Chunk 119: 100%|██████████| 2151/2151 [00:00<00:00, 22251.03it/s]


Processing chunk 120... (rows: 55435)


Chunk 120: 100%|██████████| 1199/1199 [00:00<00:00, 22184.94it/s]



Step 1 Complete:
  Movies processed: 444,051
  Movies kept after filtering: 444,051
  Movies filtered out: 0
  Genres found: 26
  HAS_GENRE triples: 689,724


In [5]:
# Step 2: Extract relations from title.crew.tsv
# Relations: (person) --DIRECTED--> (movie), (person) --WROTE--> (movie)
# If filtered mode: only process movies in kept_movies set

print("=" * 60)
print("Step 2: Processing title.crew.tsv")
print("=" * 60)

persons = set()
crew_path = get_file_path(FILES['crew'])
print(f"Reading: {crew_path}")
if USE_FILTERED_MODE:
    print(f"Filtering: only movies in kept set ({len(kept_movies):,} movies)")

chunks = read_tsv_chunked(crew_path, limit=LIMIT_ROWS)
directed_count = 0
wrote_count = 0
crew_rows_processed = 0
crew_rows_filtered = 0

for chunk_idx, chunk in enumerate(chunks):
    print(f"Processing chunk {chunk_idx + 1}... (rows: {len(chunk)})")

    # Filter out rows with missing tconst
    chunk = chunk[chunk['tconst'].notna() & (chunk['tconst'] != '\\N')]
    crew_rows_processed += len(chunk)

    if USE_FILTERED_MODE:
        # Filter to only keep movies that passed the movie filter
        chunk['tconst_str'] = chunk['tconst'].astype(str).str.strip()
        chunk = chunk[chunk['tconst_str'].isin(kept_movies)]
        crew_rows_filtered += len(chunk)

    for _, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Chunk {chunk_idx + 1}"):
        movie_id = str(row['tconst']).strip()
        if not movie_id or movie_id == '\\N':
            continue

        # Double-check if in kept_movies (for filtered mode)
        if USE_FILTERED_MODE and movie_id not in kept_movies:
            continue

        # Extract directors
        directors = safe_split(row.get('directors', ''))
        for director_id in directors:
            if director_id:
                persons.add(director_id)
                triples.append(('DIRECTED', director_id, movie_id))
                directed_count += 1

        # Extract writers
        writers = safe_split(row.get('writers', ''))
        for writer_id in writers:
            if writer_id:
                persons.add(writer_id)
                triples.append(('WROTE', writer_id, movie_id))
                wrote_count += 1

print(f"\nStep 2 Complete:")
if USE_FILTERED_MODE:
    print(f"  Crew rows processed: {crew_rows_processed:,}")
    print(f"  Crew rows kept: {crew_rows_filtered:,}")
    print(f"  Crew rows filtered out: {crew_rows_processed - crew_rows_filtered:,}")
print(f"  Persons found so far: {len(persons):,}")
print(f"  DIRECTED triples: {directed_count:,}")
print(f"  WROTE triples: {wrote_count:,}")


Step 2: Processing title.crew.tsv
Reading: /content/drive/MyDrive/Knowledge Graph/title.crew.tsv
Filtering: only movies in kept set (444,051 movies)
Processing chunk 1... (rows: 100000)


Chunk 1: 100%|██████████| 22597/22597 [00:01<00:00, 18726.11it/s]


Processing chunk 2... (rows: 100000)


Chunk 2: 100%|██████████| 25049/25049 [00:01<00:00, 19758.07it/s]


Processing chunk 3... (rows: 100000)


Chunk 3: 100%|██████████| 19933/19933 [00:00<00:00, 21691.67it/s]


Processing chunk 4... (rows: 100000)


Chunk 4: 100%|██████████| 18095/18095 [00:00<00:00, 22407.84it/s]


Processing chunk 5... (rows: 100000)


Chunk 5: 100%|██████████| 12308/12308 [00:00<00:00, 21529.81it/s]


Processing chunk 6... (rows: 100000)


Chunk 6: 0it [00:00, ?it/s]


Processing chunk 7... (rows: 100000)


Chunk 7: 0it [00:00, ?it/s]


Processing chunk 8... (rows: 100000)


Chunk 8: 100%|██████████| 3245/3245 [00:00<00:00, 16658.59it/s]


Processing chunk 9... (rows: 100000)


Chunk 9: 100%|██████████| 3344/3344 [00:00<00:00, 21740.47it/s]


Processing chunk 10... (rows: 100000)


Chunk 10: 100%|██████████| 2680/2680 [00:00<00:00, 22100.85it/s]


Processing chunk 11... (rows: 100000)


Chunk 11: 100%|██████████| 2724/2724 [00:00<00:00, 22340.18it/s]


Processing chunk 12... (rows: 100000)


Chunk 12: 100%|██████████| 2634/2634 [00:00<00:00, 21052.48it/s]


Processing chunk 13... (rows: 100000)


Chunk 13: 100%|██████████| 2800/2800 [00:00<00:00, 21546.03it/s]


Processing chunk 14... (rows: 100000)


Chunk 14: 100%|██████████| 2374/2374 [00:00<00:00, 21750.23it/s]


Processing chunk 15... (rows: 100000)


Chunk 15: 100%|██████████| 2875/2875 [00:00<00:00, 21900.84it/s]


Processing chunk 16... (rows: 100000)


Chunk 16: 100%|██████████| 2437/2437 [00:00<00:00, 18928.84it/s]


Processing chunk 17... (rows: 100000)


Chunk 17: 100%|██████████| 2609/2609 [00:00<00:00, 22366.99it/s]


Processing chunk 18... (rows: 100000)


Chunk 18: 100%|██████████| 2559/2559 [00:00<00:00, 21904.18it/s]


Processing chunk 19... (rows: 100000)


Chunk 19: 100%|██████████| 2274/2274 [00:00<00:00, 22103.63it/s]


Processing chunk 20... (rows: 100000)


Chunk 20: 100%|██████████| 2503/2503 [00:00<00:00, 21967.66it/s]


Processing chunk 21... (rows: 100000)


Chunk 21: 100%|██████████| 2373/2373 [00:00<00:00, 21876.93it/s]


Processing chunk 22... (rows: 100000)


Chunk 22: 100%|██████████| 2327/2327 [00:00<00:00, 22386.37it/s]


Processing chunk 23... (rows: 100000)


Chunk 23: 100%|██████████| 2450/2450 [00:00<00:00, 21151.74it/s]


Processing chunk 24... (rows: 100000)


Chunk 24: 100%|██████████| 2539/2539 [00:00<00:00, 22117.00it/s]


Processing chunk 25... (rows: 100000)


Chunk 25: 100%|██████████| 2749/2749 [00:00<00:00, 22017.74it/s]


Processing chunk 26... (rows: 100000)


Chunk 26: 100%|██████████| 2517/2517 [00:00<00:00, 21777.92it/s]


Processing chunk 27... (rows: 100000)


Chunk 27: 100%|██████████| 2756/2756 [00:00<00:00, 21962.92it/s]


Processing chunk 28... (rows: 100000)


Chunk 28: 100%|██████████| 3255/3255 [00:00<00:00, 21846.14it/s]


Processing chunk 29... (rows: 100000)


Chunk 29: 100%|██████████| 3612/3612 [00:00<00:00, 22254.71it/s]


Processing chunk 30... (rows: 100000)


Chunk 30: 100%|██████████| 2957/2957 [00:00<00:00, 22334.64it/s]


Processing chunk 31... (rows: 100000)


Chunk 31: 100%|██████████| 2561/2561 [00:00<00:00, 21904.31it/s]


Processing chunk 32... (rows: 100000)


Chunk 32: 100%|██████████| 2819/2819 [00:00<00:00, 20420.58it/s]


Processing chunk 33... (rows: 100000)


Chunk 33: 100%|██████████| 3057/3057 [00:00<00:00, 20980.00it/s]


Processing chunk 34... (rows: 100000)


Chunk 34: 100%|██████████| 2736/2736 [00:00<00:00, 21757.65it/s]


Processing chunk 35... (rows: 100000)


Chunk 35: 100%|██████████| 2847/2847 [00:00<00:00, 21154.72it/s]


Processing chunk 36... (rows: 100000)


Chunk 36: 100%|██████████| 2895/2895 [00:00<00:00, 21455.55it/s]


Processing chunk 37... (rows: 100000)


Chunk 37: 100%|██████████| 2432/2432 [00:00<00:00, 21736.13it/s]


Processing chunk 38... (rows: 100000)


Chunk 38: 100%|██████████| 2895/2895 [00:00<00:00, 21608.77it/s]


Processing chunk 39... (rows: 100000)


Chunk 39: 100%|██████████| 2932/2932 [00:00<00:00, 21900.42it/s]


Processing chunk 40... (rows: 100000)


Chunk 40: 100%|██████████| 3111/3111 [00:00<00:00, 22082.38it/s]


Processing chunk 41... (rows: 100000)


Chunk 41: 100%|██████████| 2169/2169 [00:00<00:00, 16837.11it/s]


Processing chunk 42... (rows: 100000)


Chunk 42: 100%|██████████| 2399/2399 [00:00<00:00, 21988.12it/s]


Processing chunk 43... (rows: 100000)


Chunk 43: 100%|██████████| 2196/2196 [00:00<00:00, 21803.14it/s]


Processing chunk 44... (rows: 100000)


Chunk 44: 100%|██████████| 2275/2275 [00:00<00:00, 21759.30it/s]


Processing chunk 45... (rows: 100000)


Chunk 45: 100%|██████████| 2987/2987 [00:00<00:00, 20898.09it/s]


Processing chunk 46... (rows: 100000)


Chunk 46: 100%|██████████| 3303/3303 [00:00<00:00, 21376.96it/s]


Processing chunk 47... (rows: 100000)


Chunk 47: 100%|██████████| 4068/4068 [00:00<00:00, 21082.79it/s]


Processing chunk 48... (rows: 100000)


Chunk 48: 100%|██████████| 3886/3886 [00:00<00:00, 22089.97it/s]


Processing chunk 49... (rows: 100000)


Chunk 49: 100%|██████████| 3894/3894 [00:00<00:00, 21649.04it/s]


Processing chunk 50... (rows: 100000)


Chunk 50: 100%|██████████| 2729/2729 [00:00<00:00, 21360.37it/s]


Processing chunk 51... (rows: 100000)


Chunk 51: 100%|██████████| 3350/3350 [00:00<00:00, 21879.59it/s]


Processing chunk 52... (rows: 100000)


Chunk 52: 100%|██████████| 3281/3281 [00:00<00:00, 21731.70it/s]


Processing chunk 53... (rows: 100000)


Chunk 53: 100%|██████████| 3552/3552 [00:00<00:00, 21349.07it/s]


Processing chunk 54... (rows: 100000)


Chunk 54: 100%|██████████| 3370/3370 [00:00<00:00, 21073.35it/s]


Processing chunk 55... (rows: 100000)


Chunk 55: 100%|██████████| 2989/2989 [00:00<00:00, 21626.39it/s]


Processing chunk 56... (rows: 100000)


Chunk 56: 100%|██████████| 3514/3514 [00:00<00:00, 22043.29it/s]


Processing chunk 57... (rows: 100000)


Chunk 57: 100%|██████████| 3350/3350 [00:00<00:00, 21420.85it/s]


Processing chunk 58... (rows: 100000)


Chunk 58: 100%|██████████| 3736/3736 [00:00<00:00, 21884.35it/s]


Processing chunk 59... (rows: 100000)


Chunk 59: 100%|██████████| 3762/3762 [00:00<00:00, 20902.26it/s]


Processing chunk 60... (rows: 100000)


Chunk 60: 100%|██████████| 3780/3780 [00:00<00:00, 21652.01it/s]


Processing chunk 61... (rows: 100000)


Chunk 61: 100%|██████████| 4325/4325 [00:00<00:00, 21949.74it/s]


Processing chunk 62... (rows: 100000)


Chunk 62: 100%|██████████| 3812/3812 [00:00<00:00, 21813.50it/s]


Processing chunk 63... (rows: 100000)


Chunk 63: 100%|██████████| 3612/3612 [00:00<00:00, 20060.76it/s]


Processing chunk 64... (rows: 100000)


Chunk 64: 100%|██████████| 2847/2847 [00:00<00:00, 20984.46it/s]


Processing chunk 65... (rows: 100000)


Chunk 65: 100%|██████████| 3288/3288 [00:00<00:00, 20941.71it/s]


Processing chunk 66... (rows: 100000)


Chunk 66: 100%|██████████| 2737/2737 [00:00<00:00, 21293.61it/s]


Processing chunk 67... (rows: 100000)


Chunk 67: 100%|██████████| 3182/3182 [00:00<00:00, 21712.42it/s]


Processing chunk 68... (rows: 100000)


Chunk 68: 100%|██████████| 2836/2836 [00:00<00:00, 21575.03it/s]


Processing chunk 69... (rows: 100000)


Chunk 69: 100%|██████████| 2875/2875 [00:00<00:00, 21370.88it/s]


Processing chunk 70... (rows: 100000)


Chunk 70: 100%|██████████| 2655/2655 [00:00<00:00, 21242.96it/s]


Processing chunk 71... (rows: 100000)


Chunk 71: 100%|██████████| 2852/2852 [00:00<00:00, 20542.66it/s]


Processing chunk 72... (rows: 100000)


Chunk 72: 100%|██████████| 3211/3211 [00:00<00:00, 21630.51it/s]


Processing chunk 73... (rows: 100000)


Chunk 73: 100%|██████████| 2725/2725 [00:00<00:00, 21107.62it/s]


Processing chunk 74... (rows: 100000)


Chunk 74: 100%|██████████| 2728/2728 [00:00<00:00, 21408.32it/s]


Processing chunk 75... (rows: 100000)


Chunk 75: 100%|██████████| 3760/3760 [00:00<00:00, 22106.01it/s]


Processing chunk 76... (rows: 100000)


Chunk 76: 100%|██████████| 3450/3450 [00:00<00:00, 22438.93it/s]


Processing chunk 77... (rows: 100000)


Chunk 77: 100%|██████████| 2962/2962 [00:00<00:00, 22212.36it/s]


Processing chunk 78... (rows: 100000)


Chunk 78: 100%|██████████| 3089/3089 [00:00<00:00, 21848.58it/s]


Processing chunk 79... (rows: 100000)


Chunk 79: 100%|██████████| 2877/2877 [00:00<00:00, 20045.44it/s]


Processing chunk 80... (rows: 100000)


Chunk 80: 100%|██████████| 2794/2794 [00:00<00:00, 21955.84it/s]


Processing chunk 81... (rows: 100000)


Chunk 81: 100%|██████████| 3069/3069 [00:00<00:00, 22089.07it/s]


Processing chunk 82... (rows: 100000)


Chunk 82: 100%|██████████| 2511/2511 [00:00<00:00, 20390.54it/s]


Processing chunk 83... (rows: 100000)


Chunk 83: 100%|██████████| 3153/3153 [00:00<00:00, 21723.86it/s]


Processing chunk 84... (rows: 100000)


Chunk 84: 100%|██████████| 3105/3105 [00:00<00:00, 22131.93it/s]


Processing chunk 85... (rows: 100000)


Chunk 85: 100%|██████████| 3075/3075 [00:00<00:00, 18705.75it/s]


Processing chunk 86... (rows: 100000)


Chunk 86: 100%|██████████| 3163/3163 [00:00<00:00, 22044.58it/s]


Processing chunk 87... (rows: 100000)


Chunk 87: 100%|██████████| 3052/3052 [00:00<00:00, 22123.74it/s]


Processing chunk 88... (rows: 100000)


Chunk 88: 100%|██████████| 2988/2988 [00:00<00:00, 21870.07it/s]


Processing chunk 89... (rows: 100000)


Chunk 89: 100%|██████████| 3241/3241 [00:00<00:00, 22190.71it/s]


Processing chunk 90... (rows: 100000)


Chunk 90: 100%|██████████| 3186/3186 [00:00<00:00, 21993.36it/s]


Processing chunk 91... (rows: 100000)


Chunk 91: 100%|██████████| 3072/3072 [00:00<00:00, 21979.00it/s]


Processing chunk 92... (rows: 100000)


Chunk 92: 100%|██████████| 2955/2955 [00:00<00:00, 22252.05it/s]


Processing chunk 93... (rows: 100000)


Chunk 93: 100%|██████████| 3604/3604 [00:00<00:00, 20127.44it/s]


Processing chunk 94... (rows: 100000)


Chunk 94: 100%|██████████| 4620/4620 [00:00<00:00, 21973.88it/s]


Processing chunk 95... (rows: 100000)


Chunk 95: 100%|██████████| 3839/3839 [00:00<00:00, 22157.79it/s]


Processing chunk 96... (rows: 100000)


Chunk 96: 100%|██████████| 4268/4268 [00:00<00:00, 22030.84it/s]


Processing chunk 97... (rows: 100000)


Chunk 97: 100%|██████████| 3870/3870 [00:00<00:00, 22113.84it/s]


Processing chunk 98... (rows: 100000)


Chunk 98: 100%|██████████| 4305/4305 [00:00<00:00, 22238.71it/s]


Processing chunk 99... (rows: 100000)


Chunk 99: 100%|██████████| 4651/4651 [00:00<00:00, 22286.65it/s]


Processing chunk 100... (rows: 100000)


Chunk 100: 100%|██████████| 3466/3466 [00:00<00:00, 21807.03it/s]


Processing chunk 101... (rows: 100000)


Chunk 101: 100%|██████████| 4214/4214 [00:00<00:00, 21832.70it/s]


Processing chunk 102... (rows: 100000)


Chunk 102: 100%|██████████| 3882/3882 [00:00<00:00, 21804.52it/s]


Processing chunk 103... (rows: 100000)


Chunk 103: 100%|██████████| 4252/4252 [00:00<00:00, 21679.73it/s]


Processing chunk 104... (rows: 100000)


Chunk 104: 100%|██████████| 3597/3597 [00:00<00:00, 20635.68it/s]


Processing chunk 105... (rows: 100000)


Chunk 105: 100%|██████████| 3233/3233 [00:00<00:00, 21534.84it/s]


Processing chunk 106... (rows: 100000)


Chunk 106: 100%|██████████| 3266/3266 [00:00<00:00, 21633.68it/s]


Processing chunk 107... (rows: 100000)


Chunk 107: 100%|██████████| 3054/3054 [00:00<00:00, 21352.32it/s]


Processing chunk 108... (rows: 100000)


Chunk 108: 100%|██████████| 3298/3298 [00:00<00:00, 21675.11it/s]


Processing chunk 109... (rows: 100000)


Chunk 109: 100%|██████████| 3069/3069 [00:00<00:00, 21032.17it/s]


Processing chunk 110... (rows: 100000)


Chunk 110: 100%|██████████| 3123/3123 [00:00<00:00, 21148.54it/s]


Processing chunk 111... (rows: 100000)


Chunk 111: 100%|██████████| 3004/3004 [00:00<00:00, 20011.55it/s]


Processing chunk 112... (rows: 100000)


Chunk 112: 100%|██████████| 2587/2587 [00:00<00:00, 21092.51it/s]


Processing chunk 113... (rows: 100000)


Chunk 113: 100%|██████████| 2744/2744 [00:00<00:00, 22126.04it/s]


Processing chunk 114... (rows: 100000)


Chunk 114: 100%|██████████| 2454/2454 [00:00<00:00, 20349.55it/s]


Processing chunk 115... (rows: 100000)


Chunk 115: 100%|██████████| 2823/2823 [00:00<00:00, 21403.92it/s]


Processing chunk 116... (rows: 100000)


Chunk 116: 100%|██████████| 2823/2823 [00:00<00:00, 21720.86it/s]


Processing chunk 117... (rows: 100000)


Chunk 117: 100%|██████████| 2200/2200 [00:00<00:00, 21723.60it/s]


Processing chunk 118... (rows: 100000)


Chunk 118: 100%|██████████| 1818/1818 [00:00<00:00, 21848.96it/s]


Processing chunk 119... (rows: 100000)


Chunk 119: 100%|██████████| 2151/2151 [00:00<00:00, 21905.53it/s]


Processing chunk 120... (rows: 55435)


Chunk 120: 100%|██████████| 1199/1199 [00:00<00:00, 19406.91it/s]


Step 2 Complete:
  Crew rows processed: 11,955,435
  Crew rows kept: 444,051
  Crew rows filtered out: 11,511,384
  Persons found so far: 373,183
  DIRECTED triples: 493,393
  WROTE triples: 586,518





In [6]:
# Step 3: Extract relations from title.principals.tsv
# Relations: (person) --ACTED_IN--> (movie) where category in {"actor", "actress"}
# If filtered mode: only process movies in kept_movies set AND ordering <= TOP_BILLING

print("=" * 60)
print("Step 3: Processing title.principals.tsv")
print("=" * 60)

principals_path = get_file_path(FILES['principals'])
print(f"Reading: {principals_path}")
if USE_FILTERED_MODE:
    print(f"Filtering: only movies in kept set, ordering <= {TOP_BILLING}")

chunks = read_tsv_chunked(principals_path, limit=LIMIT_ROWS)
acted_count = 0
principals_rows_processed = 0
principals_rows_filtered = 0

# Valid acting categories
ACTING_CATEGORIES = {'actor', 'actress'}

for chunk_idx, chunk in enumerate(chunks):
    print(f"Processing chunk {chunk_idx + 1}... (rows: {len(chunk)})")

    # Filter for acting roles only
    chunk = chunk[
        chunk['tconst'].notna() &
        (chunk['tconst'] != '\\N') &
        chunk['nconst'].notna() &
        (chunk['nconst'] != '\\N') &
        chunk['category'].notna()
    ]

    # Filter by category
    chunk = chunk[chunk['category'].str.lower().isin(ACTING_CATEGORIES)]
    principals_rows_processed += len(chunk)

    if USE_FILTERED_MODE:
        # Filter to only keep movies that passed the movie filter
        chunk['tconst_str'] = chunk['tconst'].astype(str).str.strip()
        chunk = chunk[chunk['tconst_str'].isin(kept_movies)]

        # Filter by ordering <= TOP_BILLING
        # Convert ordering to numeric, handling missing values
        chunk['ordering_numeric'] = pd.to_numeric(chunk['ordering'], errors='coerce')
        chunk = chunk[chunk['ordering_numeric'].notna() & (chunk['ordering_numeric'] <= TOP_BILLING)]
        principals_rows_filtered += len(chunk)

    for _, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Chunk {chunk_idx + 1}"):
        movie_id = str(row['tconst']).strip()
        person_id = str(row['nconst']).strip()

        if movie_id and person_id and movie_id != '\\N' and person_id != '\\N':
            # Double-check if in kept_movies (for filtered mode)
            if USE_FILTERED_MODE and movie_id not in kept_movies:
                continue

            # Double-check ordering for filtered mode
            if USE_FILTERED_MODE:
                try:
                    ordering_val = pd.to_numeric(row.get('ordering'), errors='coerce')
                    if pd.isna(ordering_val) or ordering_val > TOP_BILLING:
                        continue
                except:
                    continue

            persons.add(person_id)
            triples.append(('ACTED_IN', person_id, movie_id))
            acted_count += 1

print(f"\nStep 3 Complete:")
if USE_FILTERED_MODE:
    print(f"  Principals rows processed: {principals_rows_processed:,}")
    print(f"  Principals rows kept: {principals_rows_filtered:,}")
    print(f"  Principals rows filtered out: {principals_rows_processed - principals_rows_filtered:,}")
print(f"  Total persons: {len(persons):,}")
print(f"  ACTED_IN triples: {acted_count:,}")


Step 3: Processing title.principals.tsv
Reading: /content/drive/MyDrive/Knowledge Graph/title.principals.tsv
Filtering: only movies in kept set, ordering <= 5
Processing chunk 1... (rows: 100000)


Chunk 1: 0it [00:00, ?it/s]


Processing chunk 2... (rows: 100000)


Chunk 2: 100%|██████████| 10/10 [00:00<00:00, 8750.90it/s]


Processing chunk 3... (rows: 100000)


Chunk 3: 0it [00:00, ?it/s]


Processing chunk 4... (rows: 100000)


Chunk 4: 0it [00:00, ?it/s]


Processing chunk 5... (rows: 100000)


Chunk 5: 100%|██████████| 21/21 [00:00<00:00, 13875.30it/s]


Processing chunk 6... (rows: 100000)


Chunk 6: 100%|██████████| 5/5 [00:00<00:00, 8856.22it/s]


Processing chunk 7... (rows: 100000)


Chunk 7: 100%|██████████| 5/5 [00:00<00:00, 6427.07it/s]


Processing chunk 8... (rows: 100000)


Chunk 8: 100%|██████████| 50/50 [00:00<00:00, 10823.45it/s]


Processing chunk 9... (rows: 100000)


Chunk 9: 100%|██████████| 4591/4591 [00:00<00:00, 20209.90it/s]


Processing chunk 10... (rows: 100000)


Chunk 10: 100%|██████████| 20196/20196 [00:00<00:00, 22477.58it/s]


Processing chunk 11... (rows: 100000)


Chunk 11: 100%|██████████| 18779/18779 [00:00<00:00, 21589.65it/s]


Processing chunk 12... (rows: 100000)


Chunk 12: 100%|██████████| 18325/18325 [00:00<00:00, 21577.48it/s]


Processing chunk 13... (rows: 100000)


Chunk 13: 100%|██████████| 18039/18039 [00:00<00:00, 22520.02it/s]


Processing chunk 14... (rows: 100000)


Chunk 14: 100%|██████████| 17743/17743 [00:00<00:00, 22622.77it/s]


Processing chunk 15... (rows: 100000)


Chunk 15: 100%|██████████| 16759/16759 [00:00<00:00, 22529.75it/s]


Processing chunk 16... (rows: 100000)


Chunk 16: 100%|██████████| 15865/15865 [00:00<00:00, 22092.51it/s]


Processing chunk 17... (rows: 100000)


Chunk 17: 100%|██████████| 13887/13887 [00:00<00:00, 22219.48it/s]


Processing chunk 18... (rows: 100000)


Chunk 18: 100%|██████████| 12235/12235 [00:00<00:00, 22693.22it/s]


Processing chunk 19... (rows: 100000)


Chunk 19: 100%|██████████| 5967/5967 [00:00<00:00, 22150.47it/s]


Processing chunk 20... (rows: 100000)


Chunk 20: 100%|██████████| 6033/6033 [00:00<00:00, 22507.44it/s]


Processing chunk 21... (rows: 100000)


Chunk 21: 100%|██████████| 6184/6184 [00:00<00:00, 21019.97it/s]


Processing chunk 22... (rows: 100000)


Chunk 22: 100%|██████████| 7305/7305 [00:00<00:00, 22035.72it/s]


Processing chunk 23... (rows: 100000)


Chunk 23: 100%|██████████| 7180/7180 [00:00<00:00, 20618.32it/s]


Processing chunk 24... (rows: 100000)


Chunk 24: 100%|██████████| 8200/8200 [00:00<00:00, 21823.71it/s]


Processing chunk 25... (rows: 100000)


Chunk 25: 100%|██████████| 7209/7209 [00:00<00:00, 22227.14it/s]


Processing chunk 26... (rows: 100000)


Chunk 26: 100%|██████████| 7796/7796 [00:00<00:00, 22274.09it/s]


Processing chunk 27... (rows: 100000)


Chunk 27: 100%|██████████| 7161/7161 [00:00<00:00, 22554.36it/s]


Processing chunk 28... (rows: 100000)


Chunk 28: 100%|██████████| 6164/6164 [00:00<00:00, 22211.67it/s]


Processing chunk 29... (rows: 100000)


Chunk 29: 100%|██████████| 7574/7574 [00:00<00:00, 22526.84it/s]


Processing chunk 30... (rows: 100000)


Chunk 30: 100%|██████████| 7724/7724 [00:00<00:00, 22194.08it/s]


Processing chunk 31... (rows: 100000)


Chunk 31: 100%|██████████| 8719/8719 [00:00<00:00, 22618.26it/s]


Processing chunk 32... (rows: 100000)


Chunk 32: 100%|██████████| 8988/8988 [00:00<00:00, 21278.41it/s]


Processing chunk 33... (rows: 100000)


Chunk 33: 100%|██████████| 8217/8217 [00:00<00:00, 21256.89it/s]


Processing chunk 34... (rows: 100000)


Chunk 34: 100%|██████████| 9903/9903 [00:00<00:00, 21198.01it/s]


Processing chunk 35... (rows: 100000)


Chunk 35: 100%|██████████| 7635/7635 [00:00<00:00, 21013.29it/s]


Processing chunk 36... (rows: 100000)


Chunk 36: 100%|██████████| 7443/7443 [00:00<00:00, 22532.61it/s]


Processing chunk 37... (rows: 100000)


Chunk 37: 100%|██████████| 8366/8366 [00:00<00:00, 22579.52it/s]


Processing chunk 38... (rows: 100000)


Chunk 38: 100%|██████████| 6250/6250 [00:00<00:00, 22622.90it/s]


Processing chunk 39... (rows: 100000)


Chunk 39: 100%|██████████| 8008/8008 [00:00<00:00, 22617.30it/s]


Processing chunk 40... (rows: 100000)


Chunk 40: 100%|██████████| 8691/8691 [00:00<00:00, 22884.51it/s]


Processing chunk 41... (rows: 100000)


Chunk 41: 100%|██████████| 8661/8661 [00:00<00:00, 22857.90it/s]


Processing chunk 42... (rows: 100000)


Chunk 42: 100%|██████████| 7905/7905 [00:00<00:00, 22369.12it/s]


Processing chunk 43... (rows: 100000)


Chunk 43: 100%|██████████| 9111/9111 [00:00<00:00, 22635.43it/s]


Processing chunk 44... (rows: 100000)


Chunk 44: 100%|██████████| 6058/6058 [00:00<00:00, 22396.95it/s]


Processing chunk 45... (rows: 100000)


Chunk 45: 100%|██████████| 5989/5989 [00:00<00:00, 22830.04it/s]


Processing chunk 46... (rows: 100000)


Chunk 46: 100%|██████████| 5654/5654 [00:00<00:00, 22594.80it/s]


Processing chunk 47... (rows: 100000)


Chunk 47: 100%|██████████| 6972/6972 [00:00<00:00, 22703.02it/s]


Processing chunk 48... (rows: 100000)


Chunk 48: 100%|██████████| 5554/5554 [00:00<00:00, 22558.67it/s]


Processing chunk 49... (rows: 100000)


Chunk 49: 100%|██████████| 5845/5845 [00:00<00:00, 21356.93it/s]


Processing chunk 50... (rows: 100000)


Chunk 50: 100%|██████████| 6089/6089 [00:00<00:00, 22369.65it/s]


Processing chunk 51... (rows: 100000)


Chunk 51: 100%|██████████| 5278/5278 [00:00<00:00, 21891.84it/s]


Processing chunk 52... (rows: 100000)


Chunk 52: 100%|██████████| 6273/6273 [00:00<00:00, 22077.53it/s]


Processing chunk 53... (rows: 100000)


Chunk 53: 100%|██████████| 5801/5801 [00:00<00:00, 21303.13it/s]


Processing chunk 54... (rows: 100000)


Chunk 54: 100%|██████████| 5504/5504 [00:00<00:00, 22747.69it/s]


Processing chunk 55... (rows: 100000)


Chunk 55: 0it [00:00, ?it/s]


Processing chunk 56... (rows: 100000)


Chunk 56: 0it [00:00, ?it/s]


Processing chunk 57... (rows: 100000)


Chunk 57: 0it [00:00, ?it/s]


Processing chunk 58... (rows: 100000)


Chunk 58: 0it [00:00, ?it/s]


Processing chunk 59... (rows: 100000)


Chunk 59: 0it [00:00, ?it/s]


Processing chunk 60... (rows: 100000)


Chunk 60: 0it [00:00, ?it/s]


Processing chunk 61... (rows: 100000)


Chunk 61: 0it [00:00, ?it/s]


Processing chunk 62... (rows: 100000)


Chunk 62: 0it [00:00, ?it/s]


Processing chunk 63... (rows: 100000)


Chunk 63: 0it [00:00, ?it/s]


Processing chunk 64... (rows: 100000)


Chunk 64: 0it [00:00, ?it/s]


Processing chunk 65... (rows: 100000)


Chunk 65: 0it [00:00, ?it/s]


Processing chunk 66... (rows: 100000)


Chunk 66: 0it [00:00, ?it/s]


Processing chunk 67... (rows: 100000)


Chunk 67: 0it [00:00, ?it/s]


Processing chunk 68... (rows: 100000)


Chunk 68: 0it [00:00, ?it/s]


Processing chunk 69... (rows: 100000)


Chunk 69: 0it [00:00, ?it/s]


Processing chunk 70... (rows: 100000)


Chunk 70: 0it [00:00, ?it/s]


Processing chunk 71... (rows: 100000)


Chunk 71: 0it [00:00, ?it/s]


Processing chunk 72... (rows: 100000)


Chunk 72: 0it [00:00, ?it/s]


Processing chunk 73... (rows: 100000)


Chunk 73: 0it [00:00, ?it/s]


Processing chunk 74... (rows: 100000)


Chunk 74: 0it [00:00, ?it/s]


Processing chunk 75... (rows: 100000)


Chunk 75: 0it [00:00, ?it/s]


Processing chunk 76... (rows: 100000)


Chunk 76: 0it [00:00, ?it/s]


Processing chunk 77... (rows: 100000)


Chunk 77: 0it [00:00, ?it/s]


Processing chunk 78... (rows: 100000)


Chunk 78: 0it [00:00, ?it/s]


Processing chunk 79... (rows: 100000)


Chunk 79: 0it [00:00, ?it/s]


Processing chunk 80... (rows: 100000)


Chunk 80: 0it [00:00, ?it/s]


Processing chunk 81... (rows: 100000)


Chunk 81: 0it [00:00, ?it/s]


Processing chunk 82... (rows: 100000)


Chunk 82: 0it [00:00, ?it/s]


Processing chunk 83... (rows: 100000)


Chunk 83: 0it [00:00, ?it/s]


Processing chunk 84... (rows: 100000)


Chunk 84: 0it [00:00, ?it/s]


Processing chunk 85... (rows: 100000)


Chunk 85: 0it [00:00, ?it/s]


Processing chunk 86... (rows: 100000)


Chunk 86: 0it [00:00, ?it/s]


Processing chunk 87... (rows: 100000)


Chunk 87: 0it [00:00, ?it/s]


Processing chunk 88... (rows: 100000)


Chunk 88: 100%|██████████| 1103/1103 [00:00<00:00, 20370.11it/s]


Processing chunk 89... (rows: 100000)


Chunk 89: 100%|██████████| 846/846 [00:00<00:00, 20337.94it/s]


Processing chunk 90... (rows: 100000)


Chunk 90: 100%|██████████| 1456/1456 [00:00<00:00, 22247.38it/s]


Processing chunk 91... (rows: 100000)


Chunk 91: 100%|██████████| 1636/1636 [00:00<00:00, 21627.48it/s]


Processing chunk 92... (rows: 100000)


Chunk 92: 100%|██████████| 1172/1172 [00:00<00:00, 21223.05it/s]


Processing chunk 93... (rows: 100000)


Chunk 93: 100%|██████████| 1277/1277 [00:00<00:00, 21132.29it/s]


Processing chunk 94... (rows: 100000)


Chunk 94: 100%|██████████| 2021/2021 [00:00<00:00, 19196.27it/s]


Processing chunk 95... (rows: 100000)


Chunk 95: 100%|██████████| 1969/1969 [00:00<00:00, 18374.66it/s]


Processing chunk 96... (rows: 100000)


Chunk 96: 100%|██████████| 1761/1761 [00:00<00:00, 20239.35it/s]


Processing chunk 97... (rows: 100000)


Chunk 97: 100%|██████████| 1350/1350 [00:00<00:00, 20717.46it/s]


Processing chunk 98... (rows: 100000)


Chunk 98: 100%|██████████| 1457/1457 [00:00<00:00, 21676.57it/s]


Processing chunk 99... (rows: 100000)


Chunk 99: 100%|██████████| 1043/1043 [00:00<00:00, 20135.78it/s]


Processing chunk 100... (rows: 100000)


Chunk 100: 100%|██████████| 855/855 [00:00<00:00, 16242.12it/s]


Processing chunk 101... (rows: 100000)


Chunk 101: 100%|██████████| 1236/1236 [00:00<00:00, 21503.63it/s]


Processing chunk 102... (rows: 100000)


Chunk 102: 100%|██████████| 1068/1068 [00:00<00:00, 21301.21it/s]


Processing chunk 103... (rows: 100000)


Chunk 103: 100%|██████████| 1007/1007 [00:00<00:00, 20009.02it/s]


Processing chunk 104... (rows: 100000)


Chunk 104: 100%|██████████| 951/951 [00:00<00:00, 20946.19it/s]


Processing chunk 105... (rows: 100000)


Chunk 105: 100%|██████████| 799/799 [00:00<00:00, 20428.34it/s]


Processing chunk 106... (rows: 100000)


Chunk 106: 100%|██████████| 383/383 [00:00<00:00, 17854.45it/s]


Processing chunk 107... (rows: 100000)


Chunk 107: 100%|██████████| 1133/1133 [00:00<00:00, 19678.68it/s]


Processing chunk 108... (rows: 100000)


Chunk 108: 100%|██████████| 641/641 [00:00<00:00, 18159.98it/s]


Processing chunk 109... (rows: 100000)


Chunk 109: 100%|██████████| 966/966 [00:00<00:00, 19888.56it/s]


Processing chunk 110... (rows: 100000)


Chunk 110: 100%|██████████| 756/756 [00:00<00:00, 19850.96it/s]


Processing chunk 111... (rows: 100000)


Chunk 111: 100%|██████████| 930/930 [00:00<00:00, 20079.60it/s]


Processing chunk 112... (rows: 100000)


Chunk 112: 100%|██████████| 1388/1388 [00:00<00:00, 21583.81it/s]


Processing chunk 113... (rows: 100000)


Chunk 113: 100%|██████████| 1011/1011 [00:00<00:00, 21106.28it/s]


Processing chunk 114... (rows: 100000)


Chunk 114: 100%|██████████| 1526/1526 [00:00<00:00, 20927.57it/s]


Processing chunk 115... (rows: 100000)


Chunk 115: 100%|██████████| 589/589 [00:00<00:00, 19315.74it/s]


Processing chunk 116... (rows: 100000)


Chunk 116: 100%|██████████| 742/742 [00:00<00:00, 19848.05it/s]


Processing chunk 117... (rows: 100000)


Chunk 117: 100%|██████████| 594/594 [00:00<00:00, 18868.51it/s]


Processing chunk 118... (rows: 100000)


Chunk 118: 100%|██████████| 412/412 [00:00<00:00, 19831.45it/s]


Processing chunk 119... (rows: 100000)


Chunk 119: 100%|██████████| 913/913 [00:00<00:00, 18380.71it/s]


Processing chunk 120... (rows: 100000)


Chunk 120: 100%|██████████| 902/902 [00:00<00:00, 19392.49it/s]


Processing chunk 121... (rows: 100000)


Chunk 121: 100%|██████████| 624/624 [00:00<00:00, 20278.67it/s]


Processing chunk 122... (rows: 100000)


Chunk 122: 100%|██████████| 709/709 [00:00<00:00, 20322.16it/s]


Processing chunk 123... (rows: 100000)


Chunk 123: 100%|██████████| 883/883 [00:00<00:00, 20754.57it/s]


Processing chunk 124... (rows: 100000)


Chunk 124: 100%|██████████| 781/781 [00:00<00:00, 22225.66it/s]


Processing chunk 125... (rows: 100000)


Chunk 125: 100%|██████████| 905/905 [00:00<00:00, 19868.54it/s]


Processing chunk 126... (rows: 100000)


Chunk 126: 100%|██████████| 1018/1018 [00:00<00:00, 21120.48it/s]


Processing chunk 127... (rows: 100000)


Chunk 127: 100%|██████████| 1221/1221 [00:00<00:00, 20663.18it/s]


Processing chunk 128... (rows: 100000)


Chunk 128: 100%|██████████| 927/927 [00:00<00:00, 20731.22it/s]


Processing chunk 129... (rows: 100000)


Chunk 129: 100%|██████████| 1004/1004 [00:00<00:00, 20232.16it/s]


Processing chunk 130... (rows: 100000)


Chunk 130: 100%|██████████| 864/864 [00:00<00:00, 21409.62it/s]


Processing chunk 131... (rows: 100000)


Chunk 131: 100%|██████████| 718/718 [00:00<00:00, 18496.06it/s]


Processing chunk 132... (rows: 100000)


Chunk 132: 100%|██████████| 979/979 [00:00<00:00, 19833.67it/s]


Processing chunk 133... (rows: 100000)


Chunk 133: 100%|██████████| 815/815 [00:00<00:00, 20413.59it/s]


Processing chunk 134... (rows: 100000)


Chunk 134: 100%|██████████| 892/892 [00:00<00:00, 21479.74it/s]


Processing chunk 135... (rows: 100000)


Chunk 135: 100%|██████████| 915/915 [00:00<00:00, 20748.17it/s]


Processing chunk 136... (rows: 100000)


Chunk 136: 100%|██████████| 1001/1001 [00:00<00:00, 21531.20it/s]


Processing chunk 137... (rows: 100000)


Chunk 137: 100%|██████████| 924/924 [00:00<00:00, 20468.34it/s]


Processing chunk 138... (rows: 100000)


Chunk 138: 100%|██████████| 846/846 [00:00<00:00, 21122.32it/s]


Processing chunk 139... (rows: 100000)


Chunk 139: 100%|██████████| 1037/1037 [00:00<00:00, 19808.15it/s]


Processing chunk 140... (rows: 100000)


Chunk 140: 100%|██████████| 1053/1053 [00:00<00:00, 20729.19it/s]


Processing chunk 141... (rows: 100000)


Chunk 141: 100%|██████████| 758/758 [00:00<00:00, 20661.19it/s]


Processing chunk 142... (rows: 100000)


Chunk 142: 100%|██████████| 1158/1158 [00:00<00:00, 20613.37it/s]


Processing chunk 143... (rows: 100000)


Chunk 143: 100%|██████████| 910/910 [00:00<00:00, 20674.47it/s]


Processing chunk 144... (rows: 100000)


Chunk 144: 100%|██████████| 888/888 [00:00<00:00, 20403.64it/s]


Processing chunk 145... (rows: 100000)


Chunk 145: 100%|██████████| 775/775 [00:00<00:00, 19008.72it/s]


Processing chunk 146... (rows: 100000)


Chunk 146: 100%|██████████| 900/900 [00:00<00:00, 20456.47it/s]


Processing chunk 147... (rows: 100000)


Chunk 147: 100%|██████████| 644/644 [00:00<00:00, 18175.00it/s]


Processing chunk 148... (rows: 100000)


Chunk 148: 100%|██████████| 899/899 [00:00<00:00, 19961.56it/s]


Processing chunk 149... (rows: 100000)


Chunk 149: 100%|██████████| 1009/1009 [00:00<00:00, 19804.82it/s]


Processing chunk 150... (rows: 100000)


Chunk 150: 100%|██████████| 920/920 [00:00<00:00, 21559.00it/s]


Processing chunk 151... (rows: 100000)


Chunk 151: 100%|██████████| 1071/1071 [00:00<00:00, 20106.44it/s]


Processing chunk 152... (rows: 100000)


Chunk 152: 100%|██████████| 820/820 [00:00<00:00, 19983.32it/s]


Processing chunk 153... (rows: 100000)


Chunk 153: 100%|██████████| 1045/1045 [00:00<00:00, 20039.81it/s]


Processing chunk 154... (rows: 100000)


Chunk 154: 100%|██████████| 1026/1026 [00:00<00:00, 20535.88it/s]


Processing chunk 155... (rows: 100000)


Chunk 155: 100%|██████████| 775/775 [00:00<00:00, 18274.34it/s]


Processing chunk 156... (rows: 100000)


Chunk 156: 100%|██████████| 882/882 [00:00<00:00, 17887.80it/s]


Processing chunk 157... (rows: 100000)


Chunk 157: 100%|██████████| 826/826 [00:00<00:00, 20315.09it/s]


Processing chunk 158... (rows: 100000)


Chunk 158: 100%|██████████| 877/877 [00:00<00:00, 19642.25it/s]


Processing chunk 159... (rows: 100000)


Chunk 159: 100%|██████████| 1111/1111 [00:00<00:00, 21483.47it/s]


Processing chunk 160... (rows: 100000)


Chunk 160: 100%|██████████| 816/816 [00:00<00:00, 19353.62it/s]


Processing chunk 161... (rows: 100000)


Chunk 161: 100%|██████████| 1716/1716 [00:00<00:00, 20975.25it/s]


Processing chunk 162... (rows: 100000)


Chunk 162: 100%|██████████| 1466/1466 [00:00<00:00, 20968.59it/s]


Processing chunk 163... (rows: 100000)


Chunk 163: 100%|██████████| 934/934 [00:00<00:00, 21193.67it/s]


Processing chunk 164... (rows: 100000)


Chunk 164: 100%|██████████| 801/801 [00:00<00:00, 19215.28it/s]


Processing chunk 165... (rows: 100000)


Chunk 165: 100%|██████████| 982/982 [00:00<00:00, 19205.83it/s]


Processing chunk 166... (rows: 100000)


Chunk 166: 100%|██████████| 953/953 [00:00<00:00, 20842.59it/s]


Processing chunk 167... (rows: 100000)


Chunk 167: 100%|██████████| 1099/1099 [00:00<00:00, 20915.85it/s]


Processing chunk 168... (rows: 100000)


Chunk 168: 100%|██████████| 813/813 [00:00<00:00, 20417.14it/s]


Processing chunk 169... (rows: 100000)


Chunk 169: 100%|██████████| 818/818 [00:00<00:00, 20146.69it/s]


Processing chunk 170... (rows: 100000)


Chunk 170: 100%|██████████| 1040/1040 [00:00<00:00, 20870.48it/s]


Processing chunk 171... (rows: 100000)


Chunk 171: 100%|██████████| 965/965 [00:00<00:00, 19965.09it/s]


Processing chunk 172... (rows: 100000)


Chunk 172: 100%|██████████| 1113/1113 [00:00<00:00, 20928.27it/s]


Processing chunk 173... (rows: 100000)


Chunk 173: 100%|██████████| 920/920 [00:00<00:00, 19972.98it/s]


Processing chunk 174... (rows: 100000)


Chunk 174: 100%|██████████| 939/939 [00:00<00:00, 19996.20it/s]


Processing chunk 175... (rows: 100000)


Chunk 175: 100%|██████████| 884/884 [00:00<00:00, 20100.43it/s]


Processing chunk 176... (rows: 100000)


Chunk 176: 100%|██████████| 971/971 [00:00<00:00, 19882.29it/s]


Processing chunk 177... (rows: 100000)


Chunk 177: 100%|██████████| 806/806 [00:00<00:00, 17675.28it/s]


Processing chunk 178... (rows: 100000)


Chunk 178: 100%|██████████| 854/854 [00:00<00:00, 17493.85it/s]


Processing chunk 179... (rows: 100000)


Chunk 179: 100%|██████████| 1100/1100 [00:00<00:00, 21363.05it/s]


Processing chunk 180... (rows: 100000)


Chunk 180: 100%|██████████| 837/837 [00:00<00:00, 19843.27it/s]


Processing chunk 181... (rows: 100000)


Chunk 181: 100%|██████████| 1183/1183 [00:00<00:00, 20826.98it/s]


Processing chunk 182... (rows: 100000)


Chunk 182: 100%|██████████| 936/936 [00:00<00:00, 20187.22it/s]


Processing chunk 183... (rows: 100000)


Chunk 183: 100%|██████████| 1154/1154 [00:00<00:00, 20983.43it/s]


Processing chunk 184... (rows: 100000)


Chunk 184: 100%|██████████| 976/976 [00:00<00:00, 19837.66it/s]


Processing chunk 185... (rows: 100000)


Chunk 185: 100%|██████████| 877/877 [00:00<00:00, 20320.43it/s]


Processing chunk 186... (rows: 100000)


Chunk 186: 100%|██████████| 750/750 [00:00<00:00, 19121.57it/s]


Processing chunk 187... (rows: 100000)


Chunk 187: 100%|██████████| 1187/1187 [00:00<00:00, 18910.99it/s]


Processing chunk 188... (rows: 100000)


Chunk 188: 100%|██████████| 718/718 [00:00<00:00, 20417.99it/s]


Processing chunk 189... (rows: 100000)


Chunk 189: 100%|██████████| 883/883 [00:00<00:00, 20214.01it/s]


Processing chunk 190... (rows: 100000)


Chunk 190: 100%|██████████| 729/729 [00:00<00:00, 20101.03it/s]


Processing chunk 191... (rows: 100000)


Chunk 191: 100%|██████████| 954/954 [00:00<00:00, 21059.93it/s]


Processing chunk 192... (rows: 100000)


Chunk 192: 100%|██████████| 848/848 [00:00<00:00, 21418.32it/s]


Processing chunk 193... (rows: 100000)


Chunk 193: 100%|██████████| 1049/1049 [00:00<00:00, 20828.56it/s]


Processing chunk 194... (rows: 100000)


Chunk 194: 100%|██████████| 518/518 [00:00<00:00, 20482.59it/s]


Processing chunk 195... (rows: 100000)


Chunk 195: 100%|██████████| 973/973 [00:00<00:00, 19298.70it/s]


Processing chunk 196... (rows: 100000)


Chunk 196: 100%|██████████| 642/642 [00:00<00:00, 19146.90it/s]


Processing chunk 197... (rows: 100000)


Chunk 197: 100%|██████████| 878/878 [00:00<00:00, 19706.53it/s]


Processing chunk 198... (rows: 100000)


Chunk 198: 100%|██████████| 821/821 [00:00<00:00, 18608.11it/s]


Processing chunk 199... (rows: 100000)


Chunk 199: 100%|██████████| 1037/1037 [00:00<00:00, 20379.40it/s]


Processing chunk 200... (rows: 100000)


Chunk 200: 100%|██████████| 1034/1034 [00:00<00:00, 19999.96it/s]


Processing chunk 201... (rows: 100000)


Chunk 201: 100%|██████████| 1153/1153 [00:00<00:00, 21089.77it/s]


Processing chunk 202... (rows: 100000)


Chunk 202: 100%|██████████| 725/725 [00:00<00:00, 20199.88it/s]


Processing chunk 203... (rows: 100000)


Chunk 203: 100%|██████████| 865/865 [00:00<00:00, 20346.88it/s]


Processing chunk 204... (rows: 100000)


Chunk 204: 100%|██████████| 959/959 [00:00<00:00, 18857.30it/s]


Processing chunk 205... (rows: 100000)


Chunk 205: 100%|██████████| 810/810 [00:00<00:00, 19703.22it/s]


Processing chunk 206... (rows: 100000)


Chunk 206: 100%|██████████| 978/978 [00:00<00:00, 18853.84it/s]


Processing chunk 207... (rows: 100000)


Chunk 207: 100%|██████████| 1102/1102 [00:00<00:00, 19972.01it/s]


Processing chunk 208... (rows: 100000)


Chunk 208: 100%|██████████| 939/939 [00:00<00:00, 20883.89it/s]


Processing chunk 209... (rows: 100000)


Chunk 209: 100%|██████████| 947/947 [00:00<00:00, 21447.70it/s]


Processing chunk 210... (rows: 100000)


Chunk 210: 100%|██████████| 1064/1064 [00:00<00:00, 20897.48it/s]


Processing chunk 211... (rows: 100000)


Chunk 211: 100%|██████████| 906/906 [00:00<00:00, 20473.80it/s]


Processing chunk 212... (rows: 100000)


Chunk 212: 100%|██████████| 775/775 [00:00<00:00, 20391.48it/s]


Processing chunk 213... (rows: 100000)


Chunk 213: 100%|██████████| 886/886 [00:00<00:00, 21175.63it/s]


Processing chunk 214... (rows: 100000)


Chunk 214: 100%|██████████| 933/933 [00:00<00:00, 12408.91it/s]


Processing chunk 215... (rows: 100000)


Chunk 215: 100%|██████████| 875/875 [00:00<00:00, 18453.51it/s]


Processing chunk 216... (rows: 100000)


Chunk 216: 100%|██████████| 964/964 [00:00<00:00, 20869.12it/s]


Processing chunk 217... (rows: 100000)


Chunk 217: 100%|██████████| 793/793 [00:00<00:00, 19650.38it/s]


Processing chunk 218... (rows: 100000)


Chunk 218: 100%|██████████| 956/956 [00:00<00:00, 16215.05it/s]


Processing chunk 219... (rows: 100000)


Chunk 219: 100%|██████████| 926/926 [00:00<00:00, 20440.42it/s]


Processing chunk 220... (rows: 100000)


Chunk 220: 100%|██████████| 841/841 [00:00<00:00, 19092.98it/s]


Processing chunk 221... (rows: 100000)


Chunk 221: 100%|██████████| 908/908 [00:00<00:00, 20081.67it/s]


Processing chunk 222... (rows: 100000)


Chunk 222: 100%|██████████| 773/773 [00:00<00:00, 20609.45it/s]


Processing chunk 223... (rows: 100000)


Chunk 223: 100%|██████████| 958/958 [00:00<00:00, 21191.40it/s]


Processing chunk 224... (rows: 100000)


Chunk 224: 100%|██████████| 1098/1098 [00:00<00:00, 21010.75it/s]


Processing chunk 225... (rows: 100000)


Chunk 225: 100%|██████████| 1139/1139 [00:00<00:00, 21217.22it/s]


Processing chunk 226... (rows: 100000)


Chunk 226: 100%|██████████| 867/867 [00:00<00:00, 21055.55it/s]


Processing chunk 227... (rows: 100000)


Chunk 227: 100%|██████████| 1045/1045 [00:00<00:00, 21574.47it/s]


Processing chunk 228... (rows: 100000)


Chunk 228: 100%|██████████| 1058/1058 [00:00<00:00, 20899.71it/s]


Processing chunk 229... (rows: 100000)


Chunk 229: 100%|██████████| 1323/1323 [00:00<00:00, 20498.64it/s]


Processing chunk 230... (rows: 100000)


Chunk 230: 100%|██████████| 851/851 [00:00<00:00, 19089.80it/s]


Processing chunk 231... (rows: 100000)


Chunk 231: 100%|██████████| 656/656 [00:00<00:00, 19511.99it/s]


Processing chunk 232... (rows: 100000)


Chunk 232: 100%|██████████| 942/942 [00:00<00:00, 18449.85it/s]


Processing chunk 233... (rows: 100000)


Chunk 233: 100%|██████████| 668/668 [00:00<00:00, 20969.64it/s]


Processing chunk 234... (rows: 100000)


Chunk 234: 100%|██████████| 1042/1042 [00:00<00:00, 21724.26it/s]


Processing chunk 235... (rows: 100000)


Chunk 235: 100%|██████████| 934/934 [00:00<00:00, 20457.88it/s]


Processing chunk 236... (rows: 100000)


Chunk 236: 100%|██████████| 873/873 [00:00<00:00, 20502.29it/s]


Processing chunk 237... (rows: 100000)


Chunk 237: 100%|██████████| 974/974 [00:00<00:00, 20628.21it/s]


Processing chunk 238... (rows: 100000)


Chunk 238: 100%|██████████| 1108/1108 [00:00<00:00, 20828.28it/s]


Processing chunk 239... (rows: 100000)


Chunk 239: 100%|██████████| 1063/1063 [00:00<00:00, 20523.59it/s]


Processing chunk 240... (rows: 100000)


Chunk 240: 100%|██████████| 1365/1365 [00:00<00:00, 21715.58it/s]


Processing chunk 241... (rows: 100000)


Chunk 241: 100%|██████████| 1136/1136 [00:00<00:00, 21365.16it/s]


Processing chunk 242... (rows: 100000)


Chunk 242: 100%|██████████| 1160/1160 [00:00<00:00, 21081.47it/s]


Processing chunk 243... (rows: 100000)


Chunk 243: 100%|██████████| 931/931 [00:00<00:00, 20145.68it/s]


Processing chunk 244... (rows: 100000)


Chunk 244: 100%|██████████| 1360/1360 [00:00<00:00, 20968.67it/s]


Processing chunk 245... (rows: 100000)


Chunk 245: 100%|██████████| 1124/1124 [00:00<00:00, 20755.20it/s]


Processing chunk 246... (rows: 100000)


Chunk 246: 100%|██████████| 998/998 [00:00<00:00, 19379.87it/s]


Processing chunk 247... (rows: 100000)


Chunk 247: 100%|██████████| 943/943 [00:00<00:00, 20635.62it/s]


Processing chunk 248... (rows: 100000)


Chunk 248: 100%|██████████| 1072/1072 [00:00<00:00, 20580.64it/s]


Processing chunk 249... (rows: 100000)


Chunk 249: 100%|██████████| 1234/1234 [00:00<00:00, 21609.02it/s]


Processing chunk 250... (rows: 100000)


Chunk 250: 100%|██████████| 934/934 [00:00<00:00, 20648.85it/s]


Processing chunk 251... (rows: 100000)


Chunk 251: 100%|██████████| 738/738 [00:00<00:00, 20358.82it/s]


Processing chunk 252... (rows: 100000)


Chunk 252: 100%|██████████| 1297/1297 [00:00<00:00, 20503.13it/s]


Processing chunk 253... (rows: 100000)


Chunk 253: 100%|██████████| 843/843 [00:00<00:00, 17363.50it/s]


Processing chunk 254... (rows: 100000)


Chunk 254: 100%|██████████| 1197/1197 [00:00<00:00, 21119.46it/s]


Processing chunk 255... (rows: 100000)


Chunk 255: 100%|██████████| 779/779 [00:00<00:00, 19621.56it/s]


Processing chunk 256... (rows: 100000)


Chunk 256: 100%|██████████| 1215/1215 [00:00<00:00, 21451.85it/s]


Processing chunk 257... (rows: 100000)


Chunk 257: 100%|██████████| 1359/1359 [00:00<00:00, 20831.49it/s]


Processing chunk 258... (rows: 100000)


Chunk 258: 100%|██████████| 1625/1625 [00:00<00:00, 21613.27it/s]


Processing chunk 259... (rows: 100000)


Chunk 259: 100%|██████████| 1437/1437 [00:00<00:00, 21940.60it/s]


Processing chunk 260... (rows: 100000)


Chunk 260: 100%|██████████| 1475/1475 [00:00<00:00, 21833.23it/s]


Processing chunk 261... (rows: 100000)


Chunk 261: 100%|██████████| 2186/2186 [00:00<00:00, 22306.44it/s]


Processing chunk 262... (rows: 100000)


Chunk 262: 100%|██████████| 1752/1752 [00:00<00:00, 21478.57it/s]


Processing chunk 263... (rows: 100000)


Chunk 263: 100%|██████████| 1291/1291 [00:00<00:00, 21701.92it/s]


Processing chunk 264... (rows: 100000)


Chunk 264: 100%|██████████| 2216/2216 [00:00<00:00, 22395.17it/s]


Processing chunk 265... (rows: 100000)


Chunk 265: 100%|██████████| 2500/2500 [00:00<00:00, 21887.01it/s]


Processing chunk 266... (rows: 100000)


Chunk 266: 100%|██████████| 1684/1684 [00:00<00:00, 21767.84it/s]


Processing chunk 267... (rows: 100000)


Chunk 267: 100%|██████████| 1645/1645 [00:00<00:00, 21450.01it/s]


Processing chunk 268... (rows: 100000)


Chunk 268: 100%|██████████| 1091/1091 [00:00<00:00, 21856.08it/s]


Processing chunk 269... (rows: 100000)


Chunk 269: 100%|██████████| 1450/1450 [00:00<00:00, 21674.75it/s]


Processing chunk 270... (rows: 100000)


Chunk 270: 100%|██████████| 1491/1491 [00:00<00:00, 21488.19it/s]


Processing chunk 271... (rows: 100000)


Chunk 271: 100%|██████████| 1365/1365 [00:00<00:00, 21321.89it/s]


Processing chunk 272... (rows: 100000)


Chunk 272: 100%|██████████| 1541/1541 [00:00<00:00, 21589.22it/s]


Processing chunk 273... (rows: 100000)


Chunk 273: 100%|██████████| 1426/1426 [00:00<00:00, 16244.67it/s]


Processing chunk 274... (rows: 100000)


Chunk 274: 100%|██████████| 1439/1439 [00:00<00:00, 20181.78it/s]


Processing chunk 275... (rows: 100000)


Chunk 275: 100%|██████████| 1330/1330 [00:00<00:00, 20206.85it/s]


Processing chunk 276... (rows: 100000)


Chunk 276: 100%|██████████| 1359/1359 [00:00<00:00, 20194.00it/s]


Processing chunk 277... (rows: 100000)


Chunk 277: 100%|██████████| 1183/1183 [00:00<00:00, 20145.11it/s]


Processing chunk 278... (rows: 100000)


Chunk 278: 100%|██████████| 1453/1453 [00:00<00:00, 21777.34it/s]


Processing chunk 279... (rows: 100000)


Chunk 279: 100%|██████████| 1120/1120 [00:00<00:00, 14149.50it/s]


Processing chunk 280... (rows: 100000)


Chunk 280: 100%|██████████| 1156/1156 [00:00<00:00, 21359.82it/s]


Processing chunk 281... (rows: 100000)


Chunk 281: 100%|██████████| 1078/1078 [00:00<00:00, 21101.30it/s]


Processing chunk 282... (rows: 100000)


Chunk 282: 100%|██████████| 1672/1672 [00:00<00:00, 21817.81it/s]


Processing chunk 283... (rows: 100000)


Chunk 283: 100%|██████████| 891/891 [00:00<00:00, 20671.20it/s]


Processing chunk 284... (rows: 100000)


Chunk 284: 100%|██████████| 1182/1182 [00:00<00:00, 21405.15it/s]


Processing chunk 285... (rows: 100000)


Chunk 285: 100%|██████████| 1105/1105 [00:00<00:00, 21199.44it/s]


Processing chunk 286... (rows: 100000)


Chunk 286: 100%|██████████| 1169/1169 [00:00<00:00, 20736.48it/s]


Processing chunk 287... (rows: 100000)


Chunk 287: 100%|██████████| 1261/1261 [00:00<00:00, 20476.89it/s]


Processing chunk 288... (rows: 100000)


Chunk 288: 100%|██████████| 1549/1549 [00:00<00:00, 21304.85it/s]


Processing chunk 289... (rows: 100000)


Chunk 289: 100%|██████████| 1325/1325 [00:00<00:00, 20383.70it/s]


Processing chunk 290... (rows: 100000)


Chunk 290: 100%|██████████| 1544/1544 [00:00<00:00, 19946.06it/s]


Processing chunk 291... (rows: 100000)


Chunk 291: 100%|██████████| 1250/1250 [00:00<00:00, 21441.96it/s]


Processing chunk 292... (rows: 100000)


Chunk 292: 100%|██████████| 886/886 [00:00<00:00, 19850.50it/s]


Processing chunk 293... (rows: 100000)


Chunk 293: 100%|██████████| 1324/1324 [00:00<00:00, 21752.15it/s]


Processing chunk 294... (rows: 100000)


Chunk 294: 100%|██████████| 1488/1488 [00:00<00:00, 19253.40it/s]


Processing chunk 295... (rows: 100000)


Chunk 295: 100%|██████████| 1533/1533 [00:00<00:00, 21871.86it/s]


Processing chunk 296... (rows: 100000)


Chunk 296: 100%|██████████| 1371/1371 [00:00<00:00, 20759.08it/s]


Processing chunk 297... (rows: 100000)


Chunk 297: 100%|██████████| 1028/1028 [00:00<00:00, 18530.23it/s]


Processing chunk 298... (rows: 100000)


Chunk 298: 100%|██████████| 1367/1367 [00:00<00:00, 21215.33it/s]


Processing chunk 299... (rows: 100000)


Chunk 299: 100%|██████████| 1392/1392 [00:00<00:00, 21479.89it/s]


Processing chunk 300... (rows: 100000)


Chunk 300: 100%|██████████| 1219/1219 [00:00<00:00, 21086.99it/s]


Processing chunk 301... (rows: 100000)


Chunk 301: 100%|██████████| 1103/1103 [00:00<00:00, 20880.56it/s]


Processing chunk 302... (rows: 100000)


Chunk 302: 100%|██████████| 1114/1114 [00:00<00:00, 20114.83it/s]


Processing chunk 303... (rows: 100000)


Chunk 303: 100%|██████████| 975/975 [00:00<00:00, 21836.47it/s]


Processing chunk 304... (rows: 100000)


Chunk 304: 100%|██████████| 1286/1286 [00:00<00:00, 21350.04it/s]


Processing chunk 305... (rows: 100000)


Chunk 305: 100%|██████████| 901/901 [00:00<00:00, 20732.10it/s]


Processing chunk 306... (rows: 100000)


Chunk 306: 100%|██████████| 1123/1123 [00:00<00:00, 18470.66it/s]


Processing chunk 307... (rows: 100000)


Chunk 307: 100%|██████████| 891/891 [00:00<00:00, 19463.89it/s]


Processing chunk 308... (rows: 100000)


Chunk 308: 100%|██████████| 1201/1201 [00:00<00:00, 21024.21it/s]


Processing chunk 309... (rows: 100000)


Chunk 309: 100%|██████████| 1134/1134 [00:00<00:00, 20585.85it/s]


Processing chunk 310... (rows: 100000)


Chunk 310: 100%|██████████| 1399/1399 [00:00<00:00, 20884.41it/s]


Processing chunk 311... (rows: 100000)


Chunk 311: 100%|██████████| 1292/1292 [00:00<00:00, 20819.32it/s]


Processing chunk 312... (rows: 100000)


Chunk 312: 100%|██████████| 967/967 [00:00<00:00, 20019.61it/s]


Processing chunk 313... (rows: 100000)


Chunk 313: 100%|██████████| 1324/1324 [00:00<00:00, 21339.07it/s]


Processing chunk 314... (rows: 100000)


Chunk 314: 100%|██████████| 1043/1043 [00:00<00:00, 20986.51it/s]


Processing chunk 315... (rows: 100000)


Chunk 315: 100%|██████████| 984/984 [00:00<00:00, 20251.20it/s]


Processing chunk 316... (rows: 100000)


Chunk 316: 100%|██████████| 1278/1278 [00:00<00:00, 20306.47it/s]


Processing chunk 317... (rows: 100000)


Chunk 317: 100%|██████████| 913/913 [00:00<00:00, 19398.60it/s]


Processing chunk 318... (rows: 100000)


Chunk 318: 100%|██████████| 1356/1356 [00:00<00:00, 20855.40it/s]


Processing chunk 319... (rows: 100000)


Chunk 319: 100%|██████████| 1060/1060 [00:00<00:00, 20809.95it/s]


Processing chunk 320... (rows: 100000)


Chunk 320: 100%|██████████| 1327/1327 [00:00<00:00, 21042.40it/s]


Processing chunk 321... (rows: 100000)


Chunk 321: 100%|██████████| 1088/1088 [00:00<00:00, 20352.16it/s]


Processing chunk 322... (rows: 100000)


Chunk 322: 100%|██████████| 966/966 [00:00<00:00, 17110.57it/s]


Processing chunk 323... (rows: 100000)


Chunk 323: 100%|██████████| 841/841 [00:00<00:00, 12886.93it/s]


Processing chunk 324... (rows: 100000)


Chunk 324: 100%|██████████| 939/939 [00:00<00:00, 3594.50it/s]


Processing chunk 325... (rows: 100000)


Chunk 325: 100%|██████████| 1174/1174 [00:00<00:00, 20621.10it/s]


Processing chunk 326... (rows: 100000)


Chunk 326: 100%|██████████| 965/965 [00:00<00:00, 19226.58it/s]


Processing chunk 327... (rows: 100000)


Chunk 327: 100%|██████████| 971/971 [00:00<00:00, 21060.66it/s]


Processing chunk 328... (rows: 100000)


Chunk 328: 100%|██████████| 1359/1359 [00:00<00:00, 20478.32it/s]


Processing chunk 329... (rows: 100000)


Chunk 329: 100%|██████████| 947/947 [00:00<00:00, 19373.94it/s]


Processing chunk 330... (rows: 100000)


Chunk 330: 100%|██████████| 1388/1388 [00:00<00:00, 21691.09it/s]


Processing chunk 331... (rows: 100000)


Chunk 331: 100%|██████████| 1255/1255 [00:00<00:00, 21209.13it/s]


Processing chunk 332... (rows: 100000)


Chunk 332: 100%|██████████| 1471/1471 [00:00<00:00, 21386.23it/s]


Processing chunk 333... (rows: 100000)


Chunk 333: 100%|██████████| 1607/1607 [00:00<00:00, 20322.27it/s]


Processing chunk 334... (rows: 100000)


Chunk 334: 100%|██████████| 1139/1139 [00:00<00:00, 19408.53it/s]


Processing chunk 335... (rows: 100000)


Chunk 335: 100%|██████████| 1285/1285 [00:00<00:00, 19896.42it/s]


Processing chunk 336... (rows: 100000)


Chunk 336: 100%|██████████| 1128/1128 [00:00<00:00, 19784.95it/s]


Processing chunk 337... (rows: 100000)


Chunk 337: 100%|██████████| 1265/1265 [00:00<00:00, 20328.17it/s]


Processing chunk 338... (rows: 100000)


Chunk 338: 100%|██████████| 1252/1252 [00:00<00:00, 19236.05it/s]


Processing chunk 339... (rows: 100000)


Chunk 339: 100%|██████████| 938/938 [00:00<00:00, 17633.87it/s]


Processing chunk 340... (rows: 100000)


Chunk 340: 100%|██████████| 1327/1327 [00:00<00:00, 21196.26it/s]


Processing chunk 341... (rows: 100000)


Chunk 341: 100%|██████████| 1137/1137 [00:00<00:00, 21648.29it/s]


Processing chunk 342... (rows: 100000)


Chunk 342: 100%|██████████| 1541/1541 [00:00<00:00, 21508.54it/s]


Processing chunk 343... (rows: 100000)


Chunk 343: 100%|██████████| 1169/1169 [00:00<00:00, 21398.86it/s]


Processing chunk 344... (rows: 100000)


Chunk 344: 100%|██████████| 1658/1658 [00:00<00:00, 21267.05it/s]


Processing chunk 345... (rows: 100000)


Chunk 345: 100%|██████████| 1328/1328 [00:00<00:00, 21174.74it/s]


Processing chunk 346... (rows: 100000)


Chunk 346: 100%|██████████| 1504/1504 [00:00<00:00, 21804.03it/s]


Processing chunk 347... (rows: 100000)


Chunk 347: 100%|██████████| 962/962 [00:00<00:00, 22778.66it/s]


Processing chunk 348... (rows: 100000)


Chunk 348: 100%|██████████| 973/973 [00:00<00:00, 20392.75it/s]


Processing chunk 349... (rows: 100000)


Chunk 349: 100%|██████████| 1362/1362 [00:00<00:00, 21814.72it/s]


Processing chunk 350... (rows: 100000)


Chunk 350: 100%|██████████| 1400/1400 [00:00<00:00, 21668.23it/s]


Processing chunk 351... (rows: 100000)


Chunk 351: 100%|██████████| 1346/1346 [00:00<00:00, 20286.37it/s]


Processing chunk 352... (rows: 100000)


Chunk 352: 100%|██████████| 1098/1098 [00:00<00:00, 20495.17it/s]


Processing chunk 353... (rows: 100000)


Chunk 353: 100%|██████████| 1029/1029 [00:00<00:00, 21294.35it/s]


Processing chunk 354... (rows: 100000)


Chunk 354: 100%|██████████| 963/963 [00:00<00:00, 20168.95it/s]


Processing chunk 355... (rows: 100000)


Chunk 355: 100%|██████████| 778/778 [00:00<00:00, 20373.54it/s]


Processing chunk 356... (rows: 100000)


Chunk 356: 100%|██████████| 1297/1297 [00:00<00:00, 21658.26it/s]


Processing chunk 357... (rows: 100000)


Chunk 357: 100%|██████████| 2270/2270 [00:00<00:00, 22518.45it/s]


Processing chunk 358... (rows: 100000)


Chunk 358: 100%|██████████| 1193/1193 [00:00<00:00, 17675.67it/s]


Processing chunk 359... (rows: 100000)


Chunk 359: 100%|██████████| 1136/1136 [00:00<00:00, 20505.10it/s]


Processing chunk 360... (rows: 100000)


Chunk 360: 100%|██████████| 1998/1998 [00:00<00:00, 22080.23it/s]


Processing chunk 361... (rows: 100000)


Chunk 361: 100%|██████████| 1402/1402 [00:00<00:00, 21586.31it/s]


Processing chunk 362... (rows: 100000)


Chunk 362: 100%|██████████| 1436/1436 [00:00<00:00, 21706.53it/s]


Processing chunk 363... (rows: 100000)


Chunk 363: 100%|██████████| 1291/1291 [00:00<00:00, 22043.74it/s]


Processing chunk 364... (rows: 100000)


Chunk 364: 100%|██████████| 1458/1458 [00:00<00:00, 21482.05it/s]


Processing chunk 365... (rows: 100000)


Chunk 365: 100%|██████████| 1206/1206 [00:00<00:00, 19641.33it/s]


Processing chunk 366... (rows: 100000)


Chunk 366: 100%|██████████| 1179/1179 [00:00<00:00, 21756.34it/s]


Processing chunk 367... (rows: 100000)


Chunk 367: 100%|██████████| 1001/1001 [00:00<00:00, 19908.66it/s]


Processing chunk 368... (rows: 100000)


Chunk 368: 100%|██████████| 914/914 [00:00<00:00, 17783.52it/s]


Processing chunk 369... (rows: 100000)


Chunk 369: 100%|██████████| 1330/1330 [00:00<00:00, 21875.49it/s]


Processing chunk 370... (rows: 100000)


Chunk 370: 100%|██████████| 923/923 [00:00<00:00, 20567.20it/s]


Processing chunk 371... (rows: 100000)


Chunk 371: 100%|██████████| 958/958 [00:00<00:00, 20715.82it/s]


Processing chunk 372... (rows: 100000)


Chunk 372: 100%|██████████| 927/927 [00:00<00:00, 19931.10it/s]


Processing chunk 373... (rows: 100000)


Chunk 373: 100%|██████████| 995/995 [00:00<00:00, 20384.97it/s]


Processing chunk 374... (rows: 100000)


Chunk 374: 100%|██████████| 1005/1005 [00:00<00:00, 18577.76it/s]


Processing chunk 375... (rows: 100000)


Chunk 375: 100%|██████████| 1367/1367 [00:00<00:00, 21215.25it/s]


Processing chunk 376... (rows: 100000)


Chunk 376: 100%|██████████| 1157/1157 [00:00<00:00, 21192.05it/s]


Processing chunk 377... (rows: 100000)


Chunk 377: 100%|██████████| 971/971 [00:00<00:00, 21632.30it/s]


Processing chunk 378... (rows: 100000)


Chunk 378: 100%|██████████| 1818/1818 [00:00<00:00, 22403.53it/s]


Processing chunk 379... (rows: 100000)


Chunk 379: 100%|██████████| 1438/1438 [00:00<00:00, 21984.84it/s]


Processing chunk 380... (rows: 100000)


Chunk 380: 100%|██████████| 1226/1226 [00:00<00:00, 21443.68it/s]


Processing chunk 381... (rows: 100000)


Chunk 381: 100%|██████████| 1259/1259 [00:00<00:00, 21166.88it/s]


Processing chunk 382... (rows: 100000)


Chunk 382: 100%|██████████| 1683/1683 [00:00<00:00, 20978.81it/s]


Processing chunk 383... (rows: 100000)


Chunk 383: 100%|██████████| 1206/1206 [00:00<00:00, 21531.41it/s]


Processing chunk 384... (rows: 100000)


Chunk 384: 100%|██████████| 1567/1567 [00:00<00:00, 21866.99it/s]


Processing chunk 385... (rows: 100000)


Chunk 385: 100%|██████████| 1329/1329 [00:00<00:00, 20369.11it/s]


Processing chunk 386... (rows: 100000)


Chunk 386: 100%|██████████| 1588/1588 [00:00<00:00, 21079.91it/s]


Processing chunk 387... (rows: 100000)


Chunk 387: 100%|██████████| 1722/1722 [00:00<00:00, 21923.25it/s]


Processing chunk 388... (rows: 100000)


Chunk 388: 100%|██████████| 1752/1752 [00:00<00:00, 21380.46it/s]


Processing chunk 389... (rows: 100000)


Chunk 389: 100%|██████████| 1795/1795 [00:00<00:00, 21921.15it/s]


Processing chunk 390... (rows: 100000)


Chunk 390: 100%|██████████| 1764/1764 [00:00<00:00, 21655.81it/s]


Processing chunk 391... (rows: 100000)


Chunk 391: 100%|██████████| 824/824 [00:00<00:00, 20969.61it/s]


Processing chunk 392... (rows: 100000)


Chunk 392: 100%|██████████| 2560/2560 [00:00<00:00, 21261.67it/s]


Processing chunk 393... (rows: 100000)


Chunk 393: 100%|██████████| 1650/1650 [00:00<00:00, 22209.32it/s]


Processing chunk 394... (rows: 100000)


Chunk 394: 100%|██████████| 1932/1932 [00:00<00:00, 19950.94it/s]


Processing chunk 395... (rows: 100000)


Chunk 395: 100%|██████████| 1726/1726 [00:00<00:00, 20910.58it/s]


Processing chunk 396... (rows: 100000)


Chunk 396: 100%|██████████| 1424/1424 [00:00<00:00, 20910.51it/s]


Processing chunk 397... (rows: 100000)


Chunk 397: 100%|██████████| 1283/1283 [00:00<00:00, 21509.60it/s]


Processing chunk 398... (rows: 100000)


Chunk 398: 100%|██████████| 1598/1598 [00:00<00:00, 21355.33it/s]


Processing chunk 399... (rows: 100000)


Chunk 399: 100%|██████████| 1525/1525 [00:00<00:00, 21028.33it/s]


Processing chunk 400... (rows: 100000)


Chunk 400: 100%|██████████| 1416/1416 [00:00<00:00, 20990.94it/s]


Processing chunk 401... (rows: 100000)


Chunk 401: 100%|██████████| 1347/1347 [00:00<00:00, 18232.16it/s]


Processing chunk 402... (rows: 100000)


Chunk 402: 100%|██████████| 1199/1199 [00:00<00:00, 19582.68it/s]


Processing chunk 403... (rows: 100000)


Chunk 403: 100%|██████████| 1447/1447 [00:00<00:00, 19823.94it/s]


Processing chunk 404... (rows: 100000)


Chunk 404: 100%|██████████| 1220/1220 [00:00<00:00, 20210.00it/s]


Processing chunk 405... (rows: 100000)


Chunk 405: 100%|██████████| 1331/1331 [00:00<00:00, 21175.16it/s]


Processing chunk 406... (rows: 100000)


Chunk 406: 100%|██████████| 1855/1855 [00:00<00:00, 21281.97it/s]


Processing chunk 407... (rows: 100000)


Chunk 407: 100%|██████████| 1559/1559 [00:00<00:00, 20225.36it/s]


Processing chunk 408... (rows: 100000)


Chunk 408: 100%|██████████| 1349/1349 [00:00<00:00, 19633.76it/s]


Processing chunk 409... (rows: 100000)


Chunk 409: 100%|██████████| 1586/1586 [00:00<00:00, 20863.00it/s]


Processing chunk 410... (rows: 100000)


Chunk 410: 100%|██████████| 1743/1743 [00:00<00:00, 21791.61it/s]


Processing chunk 411... (rows: 100000)


Chunk 411: 100%|██████████| 1444/1444 [00:00<00:00, 21286.84it/s]


Processing chunk 412... (rows: 100000)


Chunk 412: 100%|██████████| 1092/1092 [00:00<00:00, 20713.64it/s]


Processing chunk 413... (rows: 100000)


Chunk 413: 100%|██████████| 886/886 [00:00<00:00, 19926.82it/s]


Processing chunk 414... (rows: 100000)


Chunk 414: 100%|██████████| 994/994 [00:00<00:00, 20591.59it/s]


Processing chunk 415... (rows: 100000)


Chunk 415: 100%|██████████| 1168/1168 [00:00<00:00, 20922.26it/s]


Processing chunk 416... (rows: 100000)


Chunk 416: 100%|██████████| 1503/1503 [00:00<00:00, 20850.41it/s]


Processing chunk 417... (rows: 100000)


Chunk 417: 100%|██████████| 1680/1680 [00:00<00:00, 22087.95it/s]


Processing chunk 418... (rows: 100000)


Chunk 418: 100%|██████████| 1705/1705 [00:00<00:00, 20501.67it/s]


Processing chunk 419... (rows: 100000)


Chunk 419: 100%|██████████| 903/903 [00:00<00:00, 20801.29it/s]


Processing chunk 420... (rows: 100000)


Chunk 420: 100%|██████████| 1387/1387 [00:00<00:00, 21040.85it/s]


Processing chunk 421... (rows: 100000)


Chunk 421: 100%|██████████| 1600/1600 [00:00<00:00, 20908.01it/s]


Processing chunk 422... (rows: 100000)


Chunk 422: 100%|██████████| 1718/1718 [00:00<00:00, 21258.72it/s]


Processing chunk 423... (rows: 100000)


Chunk 423: 100%|██████████| 868/868 [00:00<00:00, 20056.39it/s]


Processing chunk 424... (rows: 100000)


Chunk 424: 100%|██████████| 1064/1064 [00:00<00:00, 20976.84it/s]


Processing chunk 425... (rows: 100000)


Chunk 425: 100%|██████████| 1958/1958 [00:00<00:00, 20497.50it/s]


Processing chunk 426... (rows: 100000)


Chunk 426: 100%|██████████| 1379/1379 [00:00<00:00, 20485.31it/s]


Processing chunk 427... (rows: 100000)


Chunk 427: 100%|██████████| 1356/1356 [00:00<00:00, 20373.17it/s]


Processing chunk 428... (rows: 100000)


Chunk 428: 100%|██████████| 1717/1717 [00:00<00:00, 20619.42it/s]


Processing chunk 429... (rows: 100000)


Chunk 429: 100%|██████████| 1790/1790 [00:00<00:00, 21994.19it/s]


Processing chunk 430... (rows: 100000)


Chunk 430: 100%|██████████| 1300/1300 [00:00<00:00, 20718.75it/s]


Processing chunk 431... (rows: 100000)


Chunk 431: 100%|██████████| 1209/1209 [00:00<00:00, 20287.39it/s]


Processing chunk 432... (rows: 100000)


Chunk 432: 100%|██████████| 1155/1155 [00:00<00:00, 18103.35it/s]


Processing chunk 433... (rows: 100000)


Chunk 433: 100%|██████████| 1417/1417 [00:00<00:00, 21171.13it/s]


Processing chunk 434... (rows: 100000)


Chunk 434: 100%|██████████| 1443/1443 [00:00<00:00, 21591.66it/s]


Processing chunk 435... (rows: 100000)


Chunk 435: 100%|██████████| 1400/1400 [00:00<00:00, 20468.08it/s]


Processing chunk 436... (rows: 100000)


Chunk 436: 100%|██████████| 1389/1389 [00:00<00:00, 20326.96it/s]


Processing chunk 437... (rows: 100000)


Chunk 437: 100%|██████████| 1598/1598 [00:00<00:00, 21151.53it/s]


Processing chunk 438... (rows: 100000)


Chunk 438: 100%|██████████| 1669/1669 [00:00<00:00, 20943.22it/s]


Processing chunk 439... (rows: 100000)


Chunk 439: 100%|██████████| 1490/1490 [00:00<00:00, 20666.79it/s]


Processing chunk 440... (rows: 100000)


Chunk 440: 100%|██████████| 1600/1600 [00:00<00:00, 21261.68it/s]


Processing chunk 441... (rows: 100000)


Chunk 441: 100%|██████████| 1134/1134 [00:00<00:00, 21113.58it/s]


Processing chunk 442... (rows: 100000)


Chunk 442: 100%|██████████| 1914/1914 [00:00<00:00, 21233.22it/s]


Processing chunk 443... (rows: 100000)


Chunk 443: 100%|██████████| 1478/1478 [00:00<00:00, 20551.86it/s]


Processing chunk 444... (rows: 100000)


Chunk 444: 100%|██████████| 934/934 [00:00<00:00, 19972.37it/s]


Processing chunk 445... (rows: 100000)


Chunk 445: 100%|██████████| 1249/1249 [00:00<00:00, 21101.69it/s]


Processing chunk 446... (rows: 100000)


Chunk 446: 100%|██████████| 1077/1077 [00:00<00:00, 19903.35it/s]


Processing chunk 447... (rows: 100000)


Chunk 447: 100%|██████████| 1707/1707 [00:00<00:00, 21329.89it/s]


Processing chunk 448... (rows: 100000)


Chunk 448: 100%|██████████| 1427/1427 [00:00<00:00, 21853.71it/s]


Processing chunk 449... (rows: 100000)


Chunk 449: 100%|██████████| 1117/1117 [00:00<00:00, 20708.08it/s]


Processing chunk 450... (rows: 100000)


Chunk 450: 100%|██████████| 1402/1402 [00:00<00:00, 21349.71it/s]


Processing chunk 451... (rows: 100000)


Chunk 451: 100%|██████████| 1272/1272 [00:00<00:00, 20842.81it/s]


Processing chunk 452... (rows: 100000)


Chunk 452: 100%|██████████| 1422/1422 [00:00<00:00, 21567.90it/s]


Processing chunk 453... (rows: 100000)


Chunk 453: 100%|██████████| 1690/1690 [00:00<00:00, 21296.32it/s]


Processing chunk 454... (rows: 100000)


Chunk 454: 100%|██████████| 1542/1542 [00:00<00:00, 20082.65it/s]


Processing chunk 455... (rows: 100000)


Chunk 455: 100%|██████████| 1341/1341 [00:00<00:00, 17308.79it/s]


Processing chunk 456... (rows: 100000)


Chunk 456: 100%|██████████| 1841/1841 [00:00<00:00, 21420.11it/s]


Processing chunk 457... (rows: 100000)


Chunk 457: 100%|██████████| 1717/1717 [00:00<00:00, 18633.73it/s]


Processing chunk 458... (rows: 100000)


Chunk 458: 100%|██████████| 1290/1290 [00:00<00:00, 20828.31it/s]


Processing chunk 459... (rows: 100000)


Chunk 459: 100%|██████████| 1598/1598 [00:00<00:00, 16858.07it/s]


Processing chunk 460... (rows: 100000)


Chunk 460: 100%|██████████| 1738/1738 [00:00<00:00, 19918.52it/s]


Processing chunk 461... (rows: 100000)


Chunk 461: 100%|██████████| 1644/1644 [00:00<00:00, 21724.27it/s]


Processing chunk 462... (rows: 100000)


Chunk 462: 100%|██████████| 1080/1080 [00:00<00:00, 19381.52it/s]


Processing chunk 463... (rows: 100000)


Chunk 463: 100%|██████████| 1873/1873 [00:00<00:00, 22188.76it/s]


Processing chunk 464... (rows: 100000)


Chunk 464: 100%|██████████| 1404/1404 [00:00<00:00, 20425.88it/s]


Processing chunk 465... (rows: 100000)


Chunk 465: 100%|██████████| 1406/1406 [00:00<00:00, 21405.49it/s]


Processing chunk 466... (rows: 100000)


Chunk 466: 100%|██████████| 1444/1444 [00:00<00:00, 21518.96it/s]


Processing chunk 467... (rows: 100000)


Chunk 467: 100%|██████████| 1164/1164 [00:00<00:00, 20847.58it/s]


Processing chunk 468... (rows: 100000)


Chunk 468: 100%|██████████| 1546/1546 [00:00<00:00, 21833.34it/s]


Processing chunk 469... (rows: 100000)


Chunk 469: 100%|██████████| 1361/1361 [00:00<00:00, 21637.42it/s]


Processing chunk 470... (rows: 100000)


Chunk 470: 100%|██████████| 1700/1700 [00:00<00:00, 22233.05it/s]


Processing chunk 471... (rows: 100000)


Chunk 471: 100%|██████████| 1441/1441 [00:00<00:00, 21598.41it/s]


Processing chunk 472... (rows: 100000)


Chunk 472: 100%|██████████| 1406/1406 [00:00<00:00, 21436.07it/s]


Processing chunk 473... (rows: 100000)


Chunk 473: 100%|██████████| 1635/1635 [00:00<00:00, 22028.27it/s]


Processing chunk 474... (rows: 100000)


Chunk 474: 100%|██████████| 1408/1408 [00:00<00:00, 21331.03it/s]


Processing chunk 475... (rows: 100000)


Chunk 475: 100%|██████████| 1670/1670 [00:00<00:00, 20494.98it/s]


Processing chunk 476... (rows: 100000)


Chunk 476: 100%|██████████| 1628/1628 [00:00<00:00, 21452.49it/s]


Processing chunk 477... (rows: 100000)


Chunk 477: 100%|██████████| 1730/1730 [00:00<00:00, 21401.21it/s]


Processing chunk 478... (rows: 100000)


Chunk 478: 100%|██████████| 1553/1553 [00:00<00:00, 20747.28it/s]


Processing chunk 479... (rows: 100000)


Chunk 479: 100%|██████████| 1112/1112 [00:00<00:00, 19764.00it/s]


Processing chunk 480... (rows: 100000)


Chunk 480: 100%|██████████| 1506/1506 [00:00<00:00, 21773.35it/s]


Processing chunk 481... (rows: 100000)


Chunk 481: 100%|██████████| 1692/1692 [00:00<00:00, 21781.30it/s]


Processing chunk 482... (rows: 100000)


Chunk 482: 100%|██████████| 1418/1418 [00:00<00:00, 20958.73it/s]


Processing chunk 483... (rows: 100000)


Chunk 483: 100%|██████████| 1506/1506 [00:00<00:00, 19860.84it/s]


Processing chunk 484... (rows: 100000)


Chunk 484: 100%|██████████| 1304/1304 [00:00<00:00, 20581.28it/s]


Processing chunk 485... (rows: 100000)


Chunk 485: 100%|██████████| 2079/2079 [00:00<00:00, 21725.43it/s]


Processing chunk 486... (rows: 100000)


Chunk 486: 100%|██████████| 1942/1942 [00:00<00:00, 21550.22it/s]


Processing chunk 487... (rows: 100000)


Chunk 487: 100%|██████████| 1584/1584 [00:00<00:00, 18881.66it/s]


Processing chunk 488... (rows: 100000)


Chunk 488: 100%|██████████| 1714/1714 [00:00<00:00, 20748.12it/s]


Processing chunk 489... (rows: 100000)


Chunk 489: 100%|██████████| 1671/1671 [00:00<00:00, 20926.75it/s]


Processing chunk 490... (rows: 100000)


Chunk 490: 100%|██████████| 1574/1574 [00:00<00:00, 21343.95it/s]


Processing chunk 491... (rows: 100000)


Chunk 491: 100%|██████████| 1619/1619 [00:00<00:00, 21772.56it/s]


Processing chunk 492... (rows: 100000)


Chunk 492: 100%|██████████| 1438/1438 [00:00<00:00, 20094.85it/s]


Processing chunk 493... (rows: 100000)


Chunk 493: 100%|██████████| 1618/1618 [00:00<00:00, 20948.34it/s]


Processing chunk 494... (rows: 100000)


Chunk 494: 100%|██████████| 1152/1152 [00:00<00:00, 20015.73it/s]


Processing chunk 495... (rows: 100000)


Chunk 495: 100%|██████████| 1570/1570 [00:00<00:00, 20287.37it/s]


Processing chunk 496... (rows: 100000)


Chunk 496: 100%|██████████| 1689/1689 [00:00<00:00, 21612.41it/s]


Processing chunk 497... (rows: 100000)


Chunk 497: 100%|██████████| 1973/1973 [00:00<00:00, 21920.85it/s]


Processing chunk 498... (rows: 100000)


Chunk 498: 100%|██████████| 1629/1629 [00:00<00:00, 21728.34it/s]


Processing chunk 499... (rows: 100000)


Chunk 499: 100%|██████████| 1379/1379 [00:00<00:00, 21449.19it/s]


Processing chunk 500... (rows: 100000)


Chunk 500: 100%|██████████| 1549/1549 [00:00<00:00, 21559.29it/s]


Processing chunk 501... (rows: 100000)


Chunk 501: 100%|██████████| 1609/1609 [00:00<00:00, 21222.26it/s]


Processing chunk 502... (rows: 100000)


Chunk 502: 100%|██████████| 1728/1728 [00:00<00:00, 22077.50it/s]


Processing chunk 503... (rows: 100000)


Chunk 503: 100%|██████████| 1620/1620 [00:00<00:00, 20669.70it/s]


Processing chunk 504... (rows: 100000)


Chunk 504: 100%|██████████| 1569/1569 [00:00<00:00, 21018.47it/s]


Processing chunk 505... (rows: 100000)


Chunk 505: 100%|██████████| 1232/1232 [00:00<00:00, 19514.28it/s]


Processing chunk 506... (rows: 100000)


Chunk 506: 100%|██████████| 1749/1749 [00:00<00:00, 21661.11it/s]


Processing chunk 507... (rows: 100000)


Chunk 507: 100%|██████████| 1625/1625 [00:00<00:00, 21481.19it/s]


Processing chunk 508... (rows: 100000)


Chunk 508: 100%|██████████| 740/740 [00:00<00:00, 20445.19it/s]


Processing chunk 509... (rows: 100000)


Chunk 509: 100%|██████████| 1343/1343 [00:00<00:00, 20766.64it/s]


Processing chunk 510... (rows: 100000)


Chunk 510: 100%|██████████| 1586/1586 [00:00<00:00, 17855.91it/s]


Processing chunk 511... (rows: 100000)


Chunk 511: 100%|██████████| 1668/1668 [00:00<00:00, 18367.24it/s]


Processing chunk 512... (rows: 100000)


Chunk 512: 100%|██████████| 1524/1524 [00:00<00:00, 19922.39it/s]


Processing chunk 513... (rows: 100000)


Chunk 513: 100%|██████████| 1862/1862 [00:00<00:00, 22084.27it/s]


Processing chunk 514... (rows: 100000)


Chunk 514: 100%|██████████| 1314/1314 [00:00<00:00, 20730.77it/s]


Processing chunk 515... (rows: 100000)


Chunk 515: 100%|██████████| 1738/1738 [00:00<00:00, 21716.54it/s]


Processing chunk 516... (rows: 100000)


Chunk 516: 100%|██████████| 1343/1343 [00:00<00:00, 21072.57it/s]


Processing chunk 517... (rows: 100000)


Chunk 517: 100%|██████████| 1363/1363 [00:00<00:00, 19665.62it/s]


Processing chunk 518... (rows: 100000)


Chunk 518: 100%|██████████| 1458/1458 [00:00<00:00, 21578.32it/s]


Processing chunk 519... (rows: 100000)


Chunk 519: 100%|██████████| 1366/1366 [00:00<00:00, 21524.93it/s]


Processing chunk 520... (rows: 100000)


Chunk 520: 100%|██████████| 1429/1429 [00:00<00:00, 21429.12it/s]


Processing chunk 521... (rows: 100000)


Chunk 521: 100%|██████████| 1242/1242 [00:00<00:00, 21792.97it/s]


Processing chunk 522... (rows: 100000)


Chunk 522: 100%|██████████| 1749/1749 [00:00<00:00, 20620.53it/s]


Processing chunk 523... (rows: 100000)


Chunk 523: 100%|██████████| 979/979 [00:00<00:00, 19062.81it/s]


Processing chunk 524... (rows: 100000)


Chunk 524: 100%|██████████| 1295/1295 [00:00<00:00, 20417.26it/s]


Processing chunk 525... (rows: 100000)


Chunk 525: 100%|██████████| 1141/1141 [00:00<00:00, 20961.97it/s]


Processing chunk 526... (rows: 100000)


Chunk 526: 100%|██████████| 1116/1116 [00:00<00:00, 21064.76it/s]


Processing chunk 527... (rows: 100000)


Chunk 527: 100%|██████████| 1068/1068 [00:00<00:00, 20841.84it/s]


Processing chunk 528... (rows: 100000)


Chunk 528: 100%|██████████| 1515/1515 [00:00<00:00, 21649.59it/s]


Processing chunk 529... (rows: 100000)


Chunk 529: 100%|██████████| 1008/1008 [00:00<00:00, 21245.73it/s]


Processing chunk 530... (rows: 100000)


Chunk 530: 100%|██████████| 1749/1749 [00:00<00:00, 21059.78it/s]


Processing chunk 531... (rows: 100000)


Chunk 531: 100%|██████████| 1039/1039 [00:00<00:00, 20133.25it/s]


Processing chunk 532... (rows: 100000)


Chunk 532: 100%|██████████| 1237/1237 [00:00<00:00, 18401.42it/s]


Processing chunk 533... (rows: 100000)


Chunk 533: 100%|██████████| 1439/1439 [00:00<00:00, 21924.53it/s]


Processing chunk 534... (rows: 100000)


Chunk 534: 100%|██████████| 969/969 [00:00<00:00, 20816.52it/s]


Processing chunk 535... (rows: 100000)


Chunk 535: 100%|██████████| 979/979 [00:00<00:00, 19960.35it/s]


Processing chunk 536... (rows: 100000)


Chunk 536: 100%|██████████| 1056/1056 [00:00<00:00, 21462.77it/s]


Processing chunk 537... (rows: 100000)


Chunk 537: 100%|██████████| 1109/1109 [00:00<00:00, 20838.58it/s]


Processing chunk 538... (rows: 100000)


Chunk 538: 100%|██████████| 1137/1137 [00:00<00:00, 21718.49it/s]


Processing chunk 539... (rows: 100000)


Chunk 539: 100%|██████████| 882/882 [00:00<00:00, 19906.78it/s]


Processing chunk 540... (rows: 100000)


Chunk 540: 100%|██████████| 1152/1152 [00:00<00:00, 21369.32it/s]


Processing chunk 541... (rows: 100000)


Chunk 541: 100%|██████████| 1084/1084 [00:00<00:00, 20408.96it/s]


Processing chunk 542... (rows: 100000)


Chunk 542: 100%|██████████| 1197/1197 [00:00<00:00, 20544.92it/s]


Processing chunk 543... (rows: 100000)


Chunk 543: 100%|██████████| 1415/1415 [00:00<00:00, 21719.98it/s]


Processing chunk 544... (rows: 100000)


Chunk 544: 100%|██████████| 1342/1342 [00:00<00:00, 20272.04it/s]


Processing chunk 545... (rows: 100000)


Chunk 545: 100%|██████████| 1751/1751 [00:00<00:00, 20568.78it/s]


Processing chunk 546... (rows: 100000)


Chunk 546: 100%|██████████| 1351/1351 [00:00<00:00, 21695.78it/s]


Processing chunk 547... (rows: 100000)


Chunk 547: 100%|██████████| 1309/1309 [00:00<00:00, 21472.24it/s]


Processing chunk 548... (rows: 100000)


Chunk 548: 100%|██████████| 940/940 [00:00<00:00, 19869.80it/s]


Processing chunk 549... (rows: 100000)


Chunk 549: 100%|██████████| 1076/1076 [00:00<00:00, 20818.67it/s]


Processing chunk 550... (rows: 100000)


Chunk 550: 100%|██████████| 1151/1151 [00:00<00:00, 21267.72it/s]


Processing chunk 551... (rows: 100000)


Chunk 551: 100%|██████████| 1477/1477 [00:00<00:00, 20620.19it/s]


Processing chunk 552... (rows: 100000)


Chunk 552: 100%|██████████| 950/950 [00:00<00:00, 20010.29it/s]


Processing chunk 553... (rows: 100000)


Chunk 553: 100%|██████████| 940/940 [00:00<00:00, 20151.32it/s]


Processing chunk 554... (rows: 100000)


Chunk 554: 100%|██████████| 1254/1254 [00:00<00:00, 19723.02it/s]


Processing chunk 555... (rows: 100000)


Chunk 555: 100%|██████████| 1056/1056 [00:00<00:00, 20816.77it/s]


Processing chunk 556... (rows: 100000)


Chunk 556: 100%|██████████| 1111/1111 [00:00<00:00, 21367.03it/s]


Processing chunk 557... (rows: 100000)


Chunk 557: 100%|██████████| 945/945 [00:00<00:00, 20996.85it/s]


Processing chunk 558... (rows: 100000)


Chunk 558: 100%|██████████| 1288/1288 [00:00<00:00, 20287.07it/s]


Processing chunk 559... (rows: 100000)


Chunk 559: 100%|██████████| 1092/1092 [00:00<00:00, 20273.01it/s]


Processing chunk 560... (rows: 100000)


Chunk 560: 100%|██████████| 1047/1047 [00:00<00:00, 21484.63it/s]


Processing chunk 561... (rows: 100000)


Chunk 561: 100%|██████████| 1155/1155 [00:00<00:00, 20976.61it/s]


Processing chunk 562... (rows: 100000)


Chunk 562: 100%|██████████| 976/976 [00:00<00:00, 20740.84it/s]


Processing chunk 563... (rows: 100000)


Chunk 563: 100%|██████████| 1211/1211 [00:00<00:00, 21312.59it/s]


Processing chunk 564... (rows: 100000)


Chunk 564: 100%|██████████| 971/971 [00:00<00:00, 20590.25it/s]


Processing chunk 565... (rows: 100000)


Chunk 565: 100%|██████████| 1033/1033 [00:00<00:00, 20216.39it/s]


Processing chunk 566... (rows: 100000)


Chunk 566: 100%|██████████| 1245/1245 [00:00<00:00, 20950.99it/s]


Processing chunk 567... (rows: 100000)


Chunk 567: 100%|██████████| 1190/1190 [00:00<00:00, 20642.54it/s]


Processing chunk 568... (rows: 100000)


Chunk 568: 100%|██████████| 959/959 [00:00<00:00, 19848.79it/s]


Processing chunk 569... (rows: 100000)


Chunk 569: 100%|██████████| 1015/1015 [00:00<00:00, 20940.06it/s]


Processing chunk 570... (rows: 100000)


Chunk 570: 100%|██████████| 1084/1084 [00:00<00:00, 19900.14it/s]


Processing chunk 571... (rows: 100000)


Chunk 571: 100%|██████████| 1168/1168 [00:00<00:00, 20795.26it/s]


Processing chunk 572... (rows: 100000)


Chunk 572: 100%|██████████| 1251/1251 [00:00<00:00, 20560.80it/s]


Processing chunk 573... (rows: 100000)


Chunk 573: 100%|██████████| 1303/1303 [00:00<00:00, 20726.95it/s]


Processing chunk 574... (rows: 100000)


Chunk 574: 100%|██████████| 1309/1309 [00:00<00:00, 21099.18it/s]


Processing chunk 575... (rows: 100000)


Chunk 575: 100%|██████████| 1534/1534 [00:00<00:00, 20445.00it/s]


Processing chunk 576... (rows: 100000)


Chunk 576: 100%|██████████| 1135/1135 [00:00<00:00, 18304.40it/s]


Processing chunk 577... (rows: 100000)


Chunk 577: 100%|██████████| 1127/1127 [00:00<00:00, 18202.54it/s]


Processing chunk 578... (rows: 100000)


Chunk 578: 100%|██████████| 1294/1294 [00:00<00:00, 21708.76it/s]


Processing chunk 579... (rows: 100000)


Chunk 579: 100%|██████████| 1305/1305 [00:00<00:00, 21122.86it/s]


Processing chunk 580... (rows: 100000)


Chunk 580: 100%|██████████| 1222/1222 [00:00<00:00, 21286.37it/s]


Processing chunk 581... (rows: 100000)


Chunk 581: 100%|██████████| 1236/1236 [00:00<00:00, 18573.70it/s]


Processing chunk 582... (rows: 100000)


Chunk 582: 100%|██████████| 1565/1565 [00:00<00:00, 19854.29it/s]


Processing chunk 583... (rows: 100000)


Chunk 583: 100%|██████████| 1287/1287 [00:00<00:00, 19370.41it/s]


Processing chunk 584... (rows: 100000)


Chunk 584: 100%|██████████| 1295/1295 [00:00<00:00, 21891.20it/s]


Processing chunk 585... (rows: 100000)


Chunk 585: 100%|██████████| 1231/1231 [00:00<00:00, 21051.47it/s]


Processing chunk 586... (rows: 100000)


Chunk 586: 100%|██████████| 1535/1535 [00:00<00:00, 21120.26it/s]


Processing chunk 587... (rows: 100000)


Chunk 587: 100%|██████████| 1102/1102 [00:00<00:00, 20869.26it/s]


Processing chunk 588... (rows: 100000)


Chunk 588: 100%|██████████| 1428/1428 [00:00<00:00, 21243.76it/s]


Processing chunk 589... (rows: 100000)


Chunk 589: 100%|██████████| 1095/1095 [00:00<00:00, 19823.99it/s]


Processing chunk 590... (rows: 100000)


Chunk 590: 100%|██████████| 1322/1322 [00:00<00:00, 21169.12it/s]


Processing chunk 591... (rows: 100000)


Chunk 591: 100%|██████████| 1060/1060 [00:00<00:00, 17991.83it/s]


Processing chunk 592... (rows: 100000)


Chunk 592: 100%|██████████| 1235/1235 [00:00<00:00, 21444.43it/s]


Processing chunk 593... (rows: 100000)


Chunk 593: 100%|██████████| 933/933 [00:00<00:00, 20637.95it/s]


Processing chunk 594... (rows: 100000)


Chunk 594: 100%|██████████| 1377/1377 [00:00<00:00, 21436.76it/s]


Processing chunk 595... (rows: 100000)


Chunk 595: 100%|██████████| 1338/1338 [00:00<00:00, 21653.16it/s]


Processing chunk 596... (rows: 100000)


Chunk 596: 100%|██████████| 1218/1218 [00:00<00:00, 22027.60it/s]


Processing chunk 597... (rows: 100000)


Chunk 597: 100%|██████████| 1111/1111 [00:00<00:00, 19373.11it/s]


Processing chunk 598... (rows: 100000)


Chunk 598: 100%|██████████| 1217/1217 [00:00<00:00, 19804.87it/s]


Processing chunk 599... (rows: 100000)


Chunk 599: 100%|██████████| 1023/1023 [00:00<00:00, 21155.88it/s]


Processing chunk 600... (rows: 100000)


Chunk 600: 100%|██████████| 1363/1363 [00:00<00:00, 21695.53it/s]


Processing chunk 601... (rows: 100000)


Chunk 601: 100%|██████████| 1097/1097 [00:00<00:00, 22477.75it/s]


Processing chunk 602... (rows: 100000)


Chunk 602: 100%|██████████| 1256/1256 [00:00<00:00, 20839.53it/s]


Processing chunk 603... (rows: 100000)


Chunk 603: 100%|██████████| 1328/1328 [00:00<00:00, 21519.14it/s]


Processing chunk 604... (rows: 100000)


Chunk 604: 100%|██████████| 1922/1922 [00:00<00:00, 21402.58it/s]


Processing chunk 605... (rows: 100000)


Chunk 605: 100%|██████████| 1530/1530 [00:00<00:00, 21574.69it/s]


Processing chunk 606... (rows: 100000)


Chunk 606: 100%|██████████| 1782/1782 [00:00<00:00, 21681.65it/s]


Processing chunk 607... (rows: 100000)


Chunk 607: 100%|██████████| 1391/1391 [00:00<00:00, 18498.96it/s]


Processing chunk 608... (rows: 100000)


Chunk 608: 100%|██████████| 1773/1773 [00:00<00:00, 20428.26it/s]


Processing chunk 609... (rows: 100000)


Chunk 609: 100%|██████████| 1705/1705 [00:00<00:00, 21822.14it/s]


Processing chunk 610... (rows: 100000)


Chunk 610: 100%|██████████| 1452/1452 [00:00<00:00, 20865.25it/s]


Processing chunk 611... (rows: 100000)


Chunk 611: 100%|██████████| 1512/1512 [00:00<00:00, 21300.97it/s]


Processing chunk 612... (rows: 100000)


Chunk 612: 100%|██████████| 1584/1584 [00:00<00:00, 20837.47it/s]


Processing chunk 613... (rows: 100000)


Chunk 613: 100%|██████████| 1323/1323 [00:00<00:00, 20391.30it/s]


Processing chunk 614... (rows: 100000)


Chunk 614: 100%|██████████| 1392/1392 [00:00<00:00, 20726.14it/s]


Processing chunk 615... (rows: 100000)


Chunk 615: 100%|██████████| 1449/1449 [00:00<00:00, 21232.42it/s]


Processing chunk 616... (rows: 100000)


Chunk 616: 100%|██████████| 1412/1412 [00:00<00:00, 20919.22it/s]


Processing chunk 617... (rows: 100000)


Chunk 617: 100%|██████████| 1363/1363 [00:00<00:00, 20841.40it/s]


Processing chunk 618... (rows: 100000)


Chunk 618: 100%|██████████| 1195/1195 [00:00<00:00, 20886.40it/s]


Processing chunk 619... (rows: 100000)


Chunk 619: 100%|██████████| 1064/1064 [00:00<00:00, 19969.48it/s]


Processing chunk 620... (rows: 100000)


Chunk 620: 100%|██████████| 1033/1033 [00:00<00:00, 18108.29it/s]


Processing chunk 621... (rows: 100000)


Chunk 621: 100%|██████████| 1574/1574 [00:00<00:00, 21293.36it/s]


Processing chunk 622... (rows: 100000)


Chunk 622: 100%|██████████| 1309/1309 [00:00<00:00, 19572.79it/s]


Processing chunk 623... (rows: 100000)


Chunk 623: 100%|██████████| 1484/1484 [00:00<00:00, 21399.36it/s]


Processing chunk 624... (rows: 100000)


Chunk 624: 100%|██████████| 1245/1245 [00:00<00:00, 21394.69it/s]


Processing chunk 625... (rows: 100000)


Chunk 625: 100%|██████████| 1330/1330 [00:00<00:00, 21739.43it/s]


Processing chunk 626... (rows: 100000)


Chunk 626: 100%|██████████| 1369/1369 [00:00<00:00, 21382.38it/s]


Processing chunk 627... (rows: 100000)


Chunk 627: 100%|██████████| 1310/1310 [00:00<00:00, 21445.62it/s]


Processing chunk 628... (rows: 100000)


Chunk 628: 100%|██████████| 1646/1646 [00:00<00:00, 21750.22it/s]


Processing chunk 629... (rows: 100000)


Chunk 629: 100%|██████████| 1430/1430 [00:00<00:00, 21217.52it/s]


Processing chunk 630... (rows: 100000)


Chunk 630: 100%|██████████| 1395/1395 [00:00<00:00, 21327.05it/s]


Processing chunk 631... (rows: 100000)


Chunk 631: 100%|██████████| 1265/1265 [00:00<00:00, 20586.72it/s]


Processing chunk 632... (rows: 100000)


Chunk 632: 100%|██████████| 1339/1339 [00:00<00:00, 21286.77it/s]


Processing chunk 633... (rows: 100000)


Chunk 633: 100%|██████████| 1108/1108 [00:00<00:00, 20347.86it/s]


Processing chunk 634... (rows: 100000)


Chunk 634: 100%|██████████| 1780/1780 [00:00<00:00, 21448.82it/s]


Processing chunk 635... (rows: 100000)


Chunk 635: 100%|██████████| 1406/1406 [00:00<00:00, 19577.95it/s]


Processing chunk 636... (rows: 100000)


Chunk 636: 100%|██████████| 1235/1235 [00:00<00:00, 20492.40it/s]


Processing chunk 637... (rows: 100000)


Chunk 637: 100%|██████████| 1403/1403 [00:00<00:00, 18188.08it/s]


Processing chunk 638... (rows: 100000)


Chunk 638: 100%|██████████| 1543/1543 [00:00<00:00, 20544.39it/s]


Processing chunk 639... (rows: 100000)


Chunk 639: 100%|██████████| 1611/1611 [00:00<00:00, 19105.02it/s]


Processing chunk 640... (rows: 100000)


Chunk 640: 100%|██████████| 1619/1619 [00:00<00:00, 16229.75it/s]


Processing chunk 641... (rows: 100000)


Chunk 641: 100%|██████████| 1771/1771 [00:00<00:00, 16332.38it/s]


Processing chunk 642... (rows: 100000)


Chunk 642: 100%|██████████| 1545/1545 [00:00<00:00, 20752.31it/s]


Processing chunk 643... (rows: 100000)


Chunk 643: 100%|██████████| 1326/1326 [00:00<00:00, 21959.62it/s]


Processing chunk 644... (rows: 100000)


Chunk 644: 100%|██████████| 1711/1711 [00:00<00:00, 21404.11it/s]


Processing chunk 645... (rows: 100000)


Chunk 645: 100%|██████████| 1798/1798 [00:00<00:00, 20686.14it/s]


Processing chunk 646... (rows: 100000)


Chunk 646: 100%|██████████| 1587/1587 [00:00<00:00, 21969.42it/s]


Processing chunk 647... (rows: 100000)


Chunk 647: 100%|██████████| 1550/1550 [00:00<00:00, 21417.06it/s]


Processing chunk 648... (rows: 100000)


Chunk 648: 100%|██████████| 1519/1519 [00:00<00:00, 21571.74it/s]


Processing chunk 649... (rows: 100000)


Chunk 649: 100%|██████████| 1436/1436 [00:00<00:00, 20471.49it/s]


Processing chunk 650... (rows: 100000)


Chunk 650: 100%|██████████| 1550/1550 [00:00<00:00, 22099.15it/s]


Processing chunk 651... (rows: 100000)


Chunk 651: 100%|██████████| 1413/1413 [00:00<00:00, 21146.39it/s]


Processing chunk 652... (rows: 100000)


Chunk 652: 100%|██████████| 1261/1261 [00:00<00:00, 20455.67it/s]


Processing chunk 653... (rows: 100000)


Chunk 653: 100%|██████████| 1148/1148 [00:00<00:00, 20250.92it/s]


Processing chunk 654... (rows: 100000)


Chunk 654: 100%|██████████| 982/982 [00:00<00:00, 20729.60it/s]


Processing chunk 655... (rows: 100000)


Chunk 655: 100%|██████████| 1337/1337 [00:00<00:00, 20448.08it/s]


Processing chunk 656... (rows: 100000)


Chunk 656: 100%|██████████| 1408/1408 [00:00<00:00, 20984.26it/s]


Processing chunk 657... (rows: 100000)


Chunk 657: 100%|██████████| 1363/1363 [00:00<00:00, 20662.87it/s]


Processing chunk 658... (rows: 100000)


Chunk 658: 100%|██████████| 1586/1586 [00:00<00:00, 21376.95it/s]


Processing chunk 659... (rows: 100000)


Chunk 659: 100%|██████████| 1245/1245 [00:00<00:00, 20615.75it/s]


Processing chunk 660... (rows: 100000)


Chunk 660: 100%|██████████| 1226/1226 [00:00<00:00, 20977.59it/s]


Processing chunk 661... (rows: 100000)


Chunk 661: 100%|██████████| 1240/1240 [00:00<00:00, 20768.69it/s]


Processing chunk 662... (rows: 100000)


Chunk 662: 100%|██████████| 1354/1354 [00:00<00:00, 21188.97it/s]


Processing chunk 663... (rows: 100000)


Chunk 663: 100%|██████████| 1206/1206 [00:00<00:00, 20507.72it/s]


Processing chunk 664... (rows: 100000)


Chunk 664: 100%|██████████| 1040/1040 [00:00<00:00, 19613.21it/s]


Processing chunk 665... (rows: 100000)


Chunk 665: 100%|██████████| 1227/1227 [00:00<00:00, 20965.54it/s]


Processing chunk 666... (rows: 100000)


Chunk 666: 100%|██████████| 984/984 [00:00<00:00, 20081.62it/s]


Processing chunk 667... (rows: 100000)


Chunk 667: 100%|██████████| 1297/1297 [00:00<00:00, 21284.39it/s]


Processing chunk 668... (rows: 100000)


Chunk 668: 100%|██████████| 1237/1237 [00:00<00:00, 21399.86it/s]


Processing chunk 669... (rows: 100000)


Chunk 669: 100%|██████████| 1219/1219 [00:00<00:00, 20675.47it/s]


Processing chunk 670... (rows: 100000)


Chunk 670: 100%|██████████| 1209/1209 [00:00<00:00, 20026.67it/s]


Processing chunk 671... (rows: 100000)


Chunk 671: 100%|██████████| 1240/1240 [00:00<00:00, 20502.85it/s]


Processing chunk 672... (rows: 100000)


Chunk 672: 100%|██████████| 1618/1618 [00:00<00:00, 20903.37it/s]


Processing chunk 673... (rows: 100000)


Chunk 673: 100%|██████████| 1461/1461 [00:00<00:00, 20365.23it/s]


Processing chunk 674... (rows: 100000)


Chunk 674: 100%|██████████| 1614/1614 [00:00<00:00, 21602.74it/s]


Processing chunk 675... (rows: 100000)


Chunk 675: 100%|██████████| 1429/1429 [00:00<00:00, 20412.91it/s]


Processing chunk 676... (rows: 100000)


Chunk 676: 100%|██████████| 1356/1356 [00:00<00:00, 20472.98it/s]


Processing chunk 677... (rows: 100000)


Chunk 677: 100%|██████████| 1329/1329 [00:00<00:00, 20444.56it/s]


Processing chunk 678... (rows: 100000)


Chunk 678: 100%|██████████| 1443/1443 [00:00<00:00, 21087.63it/s]


Processing chunk 679... (rows: 100000)


Chunk 679: 100%|██████████| 1558/1558 [00:00<00:00, 20161.56it/s]


Processing chunk 680... (rows: 100000)


Chunk 680: 100%|██████████| 1400/1400 [00:00<00:00, 21955.60it/s]


Processing chunk 681... (rows: 100000)


Chunk 681: 100%|██████████| 1224/1224 [00:00<00:00, 19834.37it/s]


Processing chunk 682... (rows: 100000)


Chunk 682: 100%|██████████| 1216/1216 [00:00<00:00, 20560.15it/s]


Processing chunk 683... (rows: 100000)


Chunk 683: 100%|██████████| 1233/1233 [00:00<00:00, 20737.07it/s]


Processing chunk 684... (rows: 100000)


Chunk 684: 100%|██████████| 1024/1024 [00:00<00:00, 20005.25it/s]


Processing chunk 685... (rows: 100000)


Chunk 685: 100%|██████████| 965/965 [00:00<00:00, 19920.68it/s]


Processing chunk 686... (rows: 100000)


Chunk 686: 100%|██████████| 1359/1359 [00:00<00:00, 20474.79it/s]


Processing chunk 687... (rows: 100000)


Chunk 687: 100%|██████████| 1347/1347 [00:00<00:00, 21132.48it/s]


Processing chunk 688... (rows: 100000)


Chunk 688: 100%|██████████| 1124/1124 [00:00<00:00, 20658.88it/s]


Processing chunk 689... (rows: 100000)


Chunk 689: 100%|██████████| 1333/1333 [00:00<00:00, 20925.68it/s]


Processing chunk 690... (rows: 100000)


Chunk 690: 100%|██████████| 1051/1051 [00:00<00:00, 20635.00it/s]


Processing chunk 691... (rows: 100000)


Chunk 691: 100%|██████████| 1255/1255 [00:00<00:00, 21095.06it/s]


Processing chunk 692... (rows: 100000)


Chunk 692: 100%|██████████| 1116/1116 [00:00<00:00, 20623.73it/s]


Processing chunk 693... (rows: 100000)


Chunk 693: 100%|██████████| 1282/1282 [00:00<00:00, 21343.45it/s]


Processing chunk 694... (rows: 100000)


Chunk 694: 100%|██████████| 1311/1311 [00:00<00:00, 21300.45it/s]


Processing chunk 695... (rows: 100000)


Chunk 695: 100%|██████████| 1312/1312 [00:00<00:00, 20691.12it/s]


Processing chunk 696... (rows: 100000)


Chunk 696: 100%|██████████| 1592/1592 [00:00<00:00, 21253.61it/s]


Processing chunk 697... (rows: 100000)


Chunk 697: 100%|██████████| 1328/1328 [00:00<00:00, 18988.52it/s]


Processing chunk 698... (rows: 100000)


Chunk 698: 100%|██████████| 1439/1439 [00:00<00:00, 21578.46it/s]


Processing chunk 699... (rows: 100000)


Chunk 699: 100%|██████████| 1224/1224 [00:00<00:00, 11286.12it/s]


Processing chunk 700... (rows: 100000)


Chunk 700: 100%|██████████| 1538/1538 [00:00<00:00, 18107.62it/s]


Processing chunk 701... (rows: 100000)


Chunk 701: 100%|██████████| 1052/1052 [00:00<00:00, 18250.13it/s]


Processing chunk 702... (rows: 100000)


Chunk 702: 100%|██████████| 1583/1583 [00:00<00:00, 21035.57it/s]


Processing chunk 703... (rows: 100000)


Chunk 703: 100%|██████████| 1638/1638 [00:00<00:00, 19152.39it/s]


Processing chunk 704... (rows: 100000)


Chunk 704: 100%|██████████| 1299/1299 [00:00<00:00, 20347.92it/s]


Processing chunk 705... (rows: 100000)


Chunk 705: 100%|██████████| 1102/1102 [00:00<00:00, 19475.18it/s]


Processing chunk 706... (rows: 100000)


Chunk 706: 100%|██████████| 1398/1398 [00:00<00:00, 21081.53it/s]


Processing chunk 707... (rows: 100000)


Chunk 707: 100%|██████████| 1223/1223 [00:00<00:00, 21075.43it/s]


Processing chunk 708... (rows: 100000)


Chunk 708: 100%|██████████| 1333/1333 [00:00<00:00, 21079.77it/s]


Processing chunk 709... (rows: 100000)


Chunk 709: 100%|██████████| 1379/1379 [00:00<00:00, 21160.34it/s]


Processing chunk 710... (rows: 100000)


Chunk 710: 100%|██████████| 1166/1166 [00:00<00:00, 21145.98it/s]


Processing chunk 711... (rows: 100000)


Chunk 711: 100%|██████████| 1367/1367 [00:00<00:00, 20243.59it/s]


Processing chunk 712... (rows: 100000)


Chunk 712: 100%|██████████| 1226/1226 [00:00<00:00, 20738.68it/s]


Processing chunk 713... (rows: 100000)


Chunk 713: 100%|██████████| 1321/1321 [00:00<00:00, 21508.67it/s]


Processing chunk 714... (rows: 100000)


Chunk 714: 100%|██████████| 1172/1172 [00:00<00:00, 21266.29it/s]


Processing chunk 715... (rows: 100000)


Chunk 715: 100%|██████████| 1140/1140 [00:00<00:00, 20883.95it/s]


Processing chunk 716... (rows: 100000)


Chunk 716: 100%|██████████| 1307/1307 [00:00<00:00, 20432.49it/s]


Processing chunk 717... (rows: 100000)


Chunk 717: 100%|██████████| 1211/1211 [00:00<00:00, 20231.59it/s]


Processing chunk 718... (rows: 100000)


Chunk 718: 100%|██████████| 1149/1149 [00:00<00:00, 20367.15it/s]


Processing chunk 719... (rows: 100000)


Chunk 719: 100%|██████████| 1274/1274 [00:00<00:00, 21475.97it/s]


Processing chunk 720... (rows: 100000)


Chunk 720: 100%|██████████| 1381/1381 [00:00<00:00, 21224.96it/s]


Processing chunk 721... (rows: 100000)


Chunk 721: 100%|██████████| 1558/1558 [00:00<00:00, 21368.86it/s]


Processing chunk 722... (rows: 100000)


Chunk 722: 100%|██████████| 1491/1491 [00:00<00:00, 20145.76it/s]


Processing chunk 723... (rows: 100000)


Chunk 723: 100%|██████████| 1422/1422 [00:00<00:00, 21916.94it/s]


Processing chunk 724... (rows: 100000)


Chunk 724: 100%|██████████| 1710/1710 [00:00<00:00, 20934.00it/s]


Processing chunk 725... (rows: 100000)


Chunk 725: 100%|██████████| 2150/2150 [00:00<00:00, 20934.71it/s]


Processing chunk 726... (rows: 100000)


Chunk 726: 100%|██████████| 2099/2099 [00:00<00:00, 19017.49it/s]


Processing chunk 727... (rows: 100000)


Chunk 727: 100%|██████████| 1714/1714 [00:00<00:00, 20441.58it/s]


Processing chunk 728... (rows: 100000)


Chunk 728: 100%|██████████| 1435/1435 [00:00<00:00, 20650.47it/s]


Processing chunk 729... (rows: 100000)


Chunk 729: 100%|██████████| 1679/1679 [00:00<00:00, 21636.40it/s]


Processing chunk 730... (rows: 100000)


Chunk 730: 100%|██████████| 1803/1803 [00:00<00:00, 21098.37it/s]


Processing chunk 731... (rows: 100000)


Chunk 731: 100%|██████████| 1778/1778 [00:00<00:00, 21905.01it/s]


Processing chunk 732... (rows: 100000)


Chunk 732: 100%|██████████| 1587/1587 [00:00<00:00, 20932.74it/s]


Processing chunk 733... (rows: 100000)


Chunk 733: 100%|██████████| 1784/1784 [00:00<00:00, 19937.96it/s]


Processing chunk 734... (rows: 100000)


Chunk 734: 100%|██████████| 1746/1746 [00:00<00:00, 19841.97it/s]


Processing chunk 735... (rows: 100000)


Chunk 735: 100%|██████████| 1710/1710 [00:00<00:00, 21510.28it/s]


Processing chunk 736... (rows: 100000)


Chunk 736: 100%|██████████| 1640/1640 [00:00<00:00, 21701.29it/s]


Processing chunk 737... (rows: 100000)


Chunk 737: 100%|██████████| 1444/1444 [00:00<00:00, 21586.60it/s]


Processing chunk 738... (rows: 100000)


Chunk 738: 100%|██████████| 1651/1651 [00:00<00:00, 21221.98it/s]


Processing chunk 739... (rows: 100000)


Chunk 739: 100%|██████████| 1888/1888 [00:00<00:00, 21764.16it/s]


Processing chunk 740... (rows: 100000)


Chunk 740: 100%|██████████| 1469/1469 [00:00<00:00, 21364.71it/s]


Processing chunk 741... (rows: 100000)


Chunk 741: 100%|██████████| 1217/1217 [00:00<00:00, 20952.24it/s]


Processing chunk 742... (rows: 100000)


Chunk 742: 100%|██████████| 1043/1043 [00:00<00:00, 20934.99it/s]


Processing chunk 743... (rows: 100000)


Chunk 743: 100%|██████████| 1731/1731 [00:00<00:00, 21480.68it/s]


Processing chunk 744... (rows: 100000)


Chunk 744: 100%|██████████| 1511/1511 [00:00<00:00, 20539.92it/s]


Processing chunk 745... (rows: 100000)


Chunk 745: 100%|██████████| 1316/1316 [00:00<00:00, 19005.61it/s]


Processing chunk 746... (rows: 100000)


Chunk 746: 100%|██████████| 1832/1832 [00:00<00:00, 19188.71it/s]


Processing chunk 747... (rows: 100000)


Chunk 747: 100%|██████████| 1865/1865 [00:00<00:00, 20053.88it/s]


Processing chunk 748... (rows: 100000)


Chunk 748: 100%|██████████| 1653/1653 [00:00<00:00, 20586.99it/s]


Processing chunk 749... (rows: 100000)


Chunk 749: 100%|██████████| 1595/1595 [00:00<00:00, 21147.19it/s]


Processing chunk 750... (rows: 100000)


Chunk 750: 100%|██████████| 1786/1786 [00:00<00:00, 21767.13it/s]


Processing chunk 751... (rows: 100000)


Chunk 751: 100%|██████████| 1878/1878 [00:00<00:00, 20040.10it/s]


Processing chunk 752... (rows: 100000)


Chunk 752: 100%|██████████| 1809/1809 [00:00<00:00, 20874.07it/s]


Processing chunk 753... (rows: 100000)


Chunk 753: 100%|██████████| 1387/1387 [00:00<00:00, 21631.30it/s]


Processing chunk 754... (rows: 100000)


Chunk 754: 100%|██████████| 1261/1261 [00:00<00:00, 19999.91it/s]


Processing chunk 755... (rows: 100000)


Chunk 755: 100%|██████████| 1809/1809 [00:00<00:00, 21593.17it/s]


Processing chunk 756... (rows: 100000)


Chunk 756: 100%|██████████| 1812/1812 [00:00<00:00, 21368.40it/s]


Processing chunk 757... (rows: 100000)


Chunk 757: 100%|██████████| 1656/1656 [00:00<00:00, 19228.95it/s]


Processing chunk 758... (rows: 100000)


Chunk 758: 100%|██████████| 2198/2198 [00:00<00:00, 18779.04it/s]


Processing chunk 759... (rows: 100000)


Chunk 759: 100%|██████████| 1766/1766 [00:00<00:00, 18450.88it/s]


Processing chunk 760... (rows: 100000)


Chunk 760: 100%|██████████| 2180/2180 [00:00<00:00, 15692.23it/s]


Processing chunk 761... (rows: 100000)


Chunk 761: 100%|██████████| 1962/1962 [00:00<00:00, 20742.27it/s]


Processing chunk 762... (rows: 100000)


Chunk 762: 100%|██████████| 1604/1604 [00:00<00:00, 21323.41it/s]


Processing chunk 763... (rows: 100000)


Chunk 763: 100%|██████████| 1332/1332 [00:00<00:00, 21276.45it/s]


Processing chunk 764... (rows: 100000)


Chunk 764: 100%|██████████| 1973/1973 [00:00<00:00, 21697.33it/s]


Processing chunk 765... (rows: 100000)


Chunk 765: 100%|██████████| 1646/1646 [00:00<00:00, 20704.54it/s]


Processing chunk 766... (rows: 100000)


Chunk 766: 100%|██████████| 2123/2123 [00:00<00:00, 21405.07it/s]


Processing chunk 767... (rows: 100000)


Chunk 767: 100%|██████████| 1709/1709 [00:00<00:00, 18398.67it/s]


Processing chunk 768... (rows: 100000)


Chunk 768: 100%|██████████| 1857/1857 [00:00<00:00, 20029.68it/s]


Processing chunk 769... (rows: 100000)


Chunk 769: 100%|██████████| 1819/1819 [00:00<00:00, 21694.88it/s]


Processing chunk 770... (rows: 100000)


Chunk 770: 100%|██████████| 1856/1856 [00:00<00:00, 21112.29it/s]


Processing chunk 771... (rows: 100000)


Chunk 771: 100%|██████████| 1547/1547 [00:00<00:00, 20743.17it/s]


Processing chunk 772... (rows: 100000)


Chunk 772: 100%|██████████| 1774/1774 [00:00<00:00, 20774.49it/s]


Processing chunk 773... (rows: 100000)


Chunk 773: 100%|██████████| 1994/1994 [00:00<00:00, 20037.72it/s]


Processing chunk 774... (rows: 100000)


Chunk 774: 100%|██████████| 1819/1819 [00:00<00:00, 21059.33it/s]


Processing chunk 775... (rows: 100000)


Chunk 775: 100%|██████████| 1196/1196 [00:00<00:00, 20427.44it/s]


Processing chunk 776... (rows: 100000)


Chunk 776: 100%|██████████| 1477/1477 [00:00<00:00, 20502.48it/s]


Processing chunk 777... (rows: 100000)


Chunk 777: 100%|██████████| 1950/1950 [00:00<00:00, 20845.27it/s]


Processing chunk 778... (rows: 100000)


Chunk 778: 100%|██████████| 1440/1440 [00:00<00:00, 20058.38it/s]


Processing chunk 779... (rows: 100000)


Chunk 779: 100%|██████████| 1402/1402 [00:00<00:00, 20646.43it/s]


Processing chunk 780... (rows: 100000)


Chunk 780: 100%|██████████| 1467/1467 [00:00<00:00, 19507.77it/s]


Processing chunk 781... (rows: 100000)


Chunk 781: 100%|██████████| 1660/1660 [00:00<00:00, 21234.26it/s]


Processing chunk 782... (rows: 100000)


Chunk 782: 100%|██████████| 1734/1734 [00:00<00:00, 17521.32it/s]


Processing chunk 783... (rows: 100000)


Chunk 783: 100%|██████████| 1312/1312 [00:00<00:00, 20103.56it/s]


Processing chunk 784... (rows: 100000)


Chunk 784: 100%|██████████| 1997/1997 [00:00<00:00, 20108.72it/s]


Processing chunk 785... (rows: 100000)


Chunk 785: 100%|██████████| 1310/1310 [00:00<00:00, 20667.19it/s]


Processing chunk 786... (rows: 100000)


Chunk 786: 100%|██████████| 1428/1428 [00:00<00:00, 20162.62it/s]


Processing chunk 787... (rows: 100000)


Chunk 787: 100%|██████████| 1411/1411 [00:00<00:00, 19599.23it/s]


Processing chunk 788... (rows: 100000)


Chunk 788: 100%|██████████| 1563/1563 [00:00<00:00, 18032.80it/s]


Processing chunk 789... (rows: 100000)


Chunk 789: 100%|██████████| 1840/1840 [00:00<00:00, 21373.67it/s]


Processing chunk 790... (rows: 100000)


Chunk 790: 100%|██████████| 1867/1867 [00:00<00:00, 21554.37it/s]


Processing chunk 791... (rows: 100000)


Chunk 791: 100%|██████████| 1474/1474 [00:00<00:00, 20207.50it/s]


Processing chunk 792... (rows: 100000)


Chunk 792: 100%|██████████| 1565/1565 [00:00<00:00, 19766.52it/s]


Processing chunk 793... (rows: 100000)


Chunk 793: 100%|██████████| 1472/1472 [00:00<00:00, 20336.22it/s]


Processing chunk 794... (rows: 100000)


Chunk 794: 100%|██████████| 1567/1567 [00:00<00:00, 20788.90it/s]


Processing chunk 795... (rows: 100000)


Chunk 795: 100%|██████████| 1360/1360 [00:00<00:00, 19776.56it/s]


Processing chunk 796... (rows: 100000)


Chunk 796: 100%|██████████| 1590/1590 [00:00<00:00, 21064.39it/s]


Processing chunk 797... (rows: 100000)


Chunk 797: 100%|██████████| 1347/1347 [00:00<00:00, 22456.36it/s]


Processing chunk 798... (rows: 100000)


Chunk 798: 100%|██████████| 1574/1574 [00:00<00:00, 20206.52it/s]


Processing chunk 799... (rows: 100000)


Chunk 799: 100%|██████████| 1614/1614 [00:00<00:00, 21065.75it/s]


Processing chunk 800... (rows: 100000)


Chunk 800: 100%|██████████| 1845/1845 [00:00<00:00, 21759.34it/s]


Processing chunk 801... (rows: 100000)


Chunk 801: 100%|██████████| 1586/1586 [00:00<00:00, 21220.86it/s]


Processing chunk 802... (rows: 100000)


Chunk 802: 100%|██████████| 1739/1739 [00:00<00:00, 21394.93it/s]


Processing chunk 803... (rows: 100000)


Chunk 803: 100%|██████████| 1373/1373 [00:00<00:00, 20580.15it/s]


Processing chunk 804... (rows: 100000)


Chunk 804: 100%|██████████| 1629/1629 [00:00<00:00, 20818.04it/s]


Processing chunk 805... (rows: 100000)


Chunk 805: 100%|██████████| 1402/1402 [00:00<00:00, 20347.95it/s]


Processing chunk 806... (rows: 100000)


Chunk 806: 100%|██████████| 1730/1730 [00:00<00:00, 17977.49it/s]


Processing chunk 807... (rows: 100000)


Chunk 807: 100%|██████████| 1515/1515 [00:00<00:00, 18903.49it/s]


Processing chunk 808... (rows: 100000)


Chunk 808: 100%|██████████| 1302/1302 [00:00<00:00, 21337.89it/s]


Processing chunk 809... (rows: 100000)


Chunk 809: 100%|██████████| 1393/1393 [00:00<00:00, 19420.66it/s]


Processing chunk 810... (rows: 100000)


Chunk 810: 100%|██████████| 1350/1350 [00:00<00:00, 20602.51it/s]


Processing chunk 811... (rows: 100000)


Chunk 811: 100%|██████████| 1672/1672 [00:00<00:00, 21706.85it/s]


Processing chunk 812... (rows: 100000)


Chunk 812: 100%|██████████| 2104/2104 [00:00<00:00, 20283.95it/s]


Processing chunk 813... (rows: 100000)


Chunk 813: 100%|██████████| 1323/1323 [00:00<00:00, 20304.23it/s]


Processing chunk 814... (rows: 100000)


Chunk 814: 100%|██████████| 1117/1117 [00:00<00:00, 19711.62it/s]


Processing chunk 815... (rows: 100000)


Chunk 815: 100%|██████████| 1346/1346 [00:00<00:00, 20441.35it/s]


Processing chunk 816... (rows: 100000)


Chunk 816: 100%|██████████| 1485/1485 [00:00<00:00, 20184.13it/s]


Processing chunk 817... (rows: 100000)


Chunk 817: 100%|██████████| 1366/1366 [00:00<00:00, 17054.38it/s]


Processing chunk 818... (rows: 100000)


Chunk 818: 100%|██████████| 1268/1268 [00:00<00:00, 19300.88it/s]


Processing chunk 819... (rows: 100000)


Chunk 819: 100%|██████████| 1018/1018 [00:00<00:00, 18864.71it/s]


Processing chunk 820... (rows: 100000)


Chunk 820: 100%|██████████| 903/903 [00:00<00:00, 20003.47it/s]


Processing chunk 821... (rows: 100000)


Chunk 821: 100%|██████████| 1227/1227 [00:00<00:00, 18279.50it/s]


Processing chunk 822... (rows: 100000)


Chunk 822: 100%|██████████| 1202/1202 [00:00<00:00, 21084.13it/s]


Processing chunk 823... (rows: 100000)


Chunk 823: 100%|██████████| 1042/1042 [00:00<00:00, 19383.63it/s]


Processing chunk 824... (rows: 100000)


Chunk 824: 100%|██████████| 1528/1528 [00:00<00:00, 19777.92it/s]


Processing chunk 825... (rows: 100000)


Chunk 825: 100%|██████████| 1430/1430 [00:00<00:00, 20194.59it/s]


Processing chunk 826... (rows: 100000)


Chunk 826: 100%|██████████| 1452/1452 [00:00<00:00, 16853.50it/s]


Processing chunk 827... (rows: 100000)


Chunk 827: 100%|██████████| 1613/1613 [00:00<00:00, 20518.41it/s]


Processing chunk 828... (rows: 100000)


Chunk 828: 100%|██████████| 1334/1334 [00:00<00:00, 19693.72it/s]


Processing chunk 829... (rows: 100000)


Chunk 829: 100%|██████████| 1255/1255 [00:00<00:00, 18571.90it/s]


Processing chunk 830... (rows: 100000)


Chunk 830: 100%|██████████| 1144/1144 [00:00<00:00, 20823.35it/s]


Processing chunk 831... (rows: 100000)


Chunk 831: 100%|██████████| 1306/1306 [00:00<00:00, 20937.85it/s]


Processing chunk 832... (rows: 100000)


Chunk 832: 100%|██████████| 939/939 [00:00<00:00, 19073.14it/s]


Processing chunk 833... (rows: 100000)


Chunk 833: 100%|██████████| 847/847 [00:00<00:00, 18688.52it/s]


Processing chunk 834... (rows: 100000)


Chunk 834: 100%|██████████| 886/886 [00:00<00:00, 20068.98it/s]


Processing chunk 835... (rows: 100000)


Chunk 835: 100%|██████████| 1105/1105 [00:00<00:00, 19933.36it/s]


Processing chunk 836... (rows: 100000)


Chunk 836: 100%|██████████| 898/898 [00:00<00:00, 18125.01it/s]


Processing chunk 837... (rows: 100000)


Chunk 837: 100%|██████████| 1069/1069 [00:00<00:00, 20541.47it/s]


Processing chunk 838... (rows: 100000)


Chunk 838: 100%|██████████| 1386/1386 [00:00<00:00, 20024.89it/s]


Processing chunk 839... (rows: 100000)


Chunk 839: 100%|██████████| 1453/1453 [00:00<00:00, 19197.68it/s]


Processing chunk 840... (rows: 100000)


Chunk 840: 100%|██████████| 802/802 [00:00<00:00, 19566.38it/s]


Processing chunk 841... (rows: 100000)


Chunk 841: 100%|██████████| 1263/1263 [00:00<00:00, 20831.16it/s]


Processing chunk 842... (rows: 100000)


Chunk 842: 100%|██████████| 1224/1224 [00:00<00:00, 20621.18it/s]


Processing chunk 843... (rows: 100000)


Chunk 843: 100%|██████████| 1108/1108 [00:00<00:00, 20353.74it/s]


Processing chunk 844... (rows: 100000)


Chunk 844: 100%|██████████| 1393/1393 [00:00<00:00, 20908.25it/s]


Processing chunk 845... (rows: 100000)


Chunk 845: 100%|██████████| 1216/1216 [00:00<00:00, 18633.38it/s]


Processing chunk 846... (rows: 100000)


Chunk 846: 100%|██████████| 1075/1075 [00:00<00:00, 19885.49it/s]


Processing chunk 847... (rows: 100000)


Chunk 847: 100%|██████████| 964/964 [00:00<00:00, 18351.57it/s]


Processing chunk 848... (rows: 100000)


Chunk 848: 100%|██████████| 1084/1084 [00:00<00:00, 20694.98it/s]


Processing chunk 849... (rows: 100000)


Chunk 849: 100%|██████████| 864/864 [00:00<00:00, 19353.26it/s]


Processing chunk 850... (rows: 100000)


Chunk 850: 100%|██████████| 970/970 [00:00<00:00, 17772.16it/s]


Processing chunk 851... (rows: 100000)


Chunk 851: 100%|██████████| 1000/1000 [00:00<00:00, 20541.18it/s]


Processing chunk 852... (rows: 100000)


Chunk 852: 100%|██████████| 1177/1177 [00:00<00:00, 19316.11it/s]


Processing chunk 853... (rows: 100000)


Chunk 853: 100%|██████████| 1657/1657 [00:00<00:00, 21561.30it/s]


Processing chunk 854... (rows: 100000)


Chunk 854: 100%|██████████| 1494/1494 [00:00<00:00, 19867.44it/s]


Processing chunk 855... (rows: 100000)


Chunk 855: 100%|██████████| 1017/1017 [00:00<00:00, 18734.36it/s]


Processing chunk 856... (rows: 100000)


Chunk 856: 100%|██████████| 1370/1370 [00:00<00:00, 21601.67it/s]


Processing chunk 857... (rows: 100000)


Chunk 857: 100%|██████████| 1273/1273 [00:00<00:00, 20490.17it/s]


Processing chunk 858... (rows: 100000)


Chunk 858: 100%|██████████| 1098/1098 [00:00<00:00, 16554.68it/s]


Processing chunk 859... (rows: 100000)


Chunk 859: 100%|██████████| 997/997 [00:00<00:00, 20267.74it/s]


Processing chunk 860... (rows: 100000)


Chunk 860: 100%|██████████| 899/899 [00:00<00:00, 20230.05it/s]


Processing chunk 861... (rows: 100000)


Chunk 861: 100%|██████████| 1120/1120 [00:00<00:00, 20306.92it/s]


Processing chunk 862... (rows: 100000)


Chunk 862: 100%|██████████| 944/944 [00:00<00:00, 17400.23it/s]


Processing chunk 863... (rows: 100000)


Chunk 863: 100%|██████████| 1163/1163 [00:00<00:00, 20507.41it/s]


Processing chunk 864... (rows: 100000)


Chunk 864: 100%|██████████| 860/860 [00:00<00:00, 20616.72it/s]


Processing chunk 865... (rows: 100000)


Chunk 865: 100%|██████████| 767/767 [00:00<00:00, 19329.17it/s]


Processing chunk 866... (rows: 100000)


Chunk 866: 100%|██████████| 1050/1050 [00:00<00:00, 18466.03it/s]


Processing chunk 867... (rows: 100000)


Chunk 867: 100%|██████████| 1159/1159 [00:00<00:00, 20809.12it/s]


Processing chunk 868... (rows: 100000)


Chunk 868: 100%|██████████| 1226/1226 [00:00<00:00, 18637.64it/s]


Processing chunk 869... (rows: 100000)


Chunk 869: 100%|██████████| 1017/1017 [00:00<00:00, 18525.02it/s]


Processing chunk 870... (rows: 100000)


Chunk 870: 100%|██████████| 1268/1268 [00:00<00:00, 19436.67it/s]


Processing chunk 871... (rows: 100000)


Chunk 871: 100%|██████████| 1281/1281 [00:00<00:00, 20765.73it/s]


Processing chunk 872... (rows: 100000)


Chunk 872: 100%|██████████| 1002/1002 [00:00<00:00, 21272.24it/s]


Processing chunk 873... (rows: 100000)


Chunk 873: 100%|██████████| 856/856 [00:00<00:00, 20086.85it/s]


Processing chunk 874... (rows: 100000)


Chunk 874: 100%|██████████| 1052/1052 [00:00<00:00, 21120.18it/s]


Processing chunk 875... (rows: 100000)


Chunk 875: 100%|██████████| 955/955 [00:00<00:00, 19763.66it/s]


Processing chunk 876... (rows: 100000)


Chunk 876: 100%|██████████| 946/946 [00:00<00:00, 16924.06it/s]


Processing chunk 877... (rows: 100000)


Chunk 877: 100%|██████████| 877/877 [00:00<00:00, 19795.85it/s]


Processing chunk 878... (rows: 100000)


Chunk 878: 100%|██████████| 828/828 [00:00<00:00, 19634.46it/s]


Processing chunk 879... (rows: 100000)


Chunk 879: 100%|██████████| 1171/1171 [00:00<00:00, 14837.07it/s]


Processing chunk 880... (rows: 100000)


Chunk 880: 100%|██████████| 783/783 [00:00<00:00, 18335.06it/s]


Processing chunk 881... (rows: 100000)


Chunk 881: 100%|██████████| 860/860 [00:00<00:00, 19883.92it/s]


Processing chunk 882... (rows: 100000)


Chunk 882: 100%|██████████| 883/883 [00:00<00:00, 19136.54it/s]


Processing chunk 883... (rows: 100000)


Chunk 883: 100%|██████████| 816/816 [00:00<00:00, 17881.30it/s]


Processing chunk 884... (rows: 100000)


Chunk 884: 100%|██████████| 964/964 [00:00<00:00, 19909.74it/s]


Processing chunk 885... (rows: 100000)


Chunk 885: 100%|██████████| 1023/1023 [00:00<00:00, 19221.05it/s]


Processing chunk 886... (rows: 100000)


Chunk 886: 100%|██████████| 899/899 [00:00<00:00, 20009.87it/s]


Processing chunk 887... (rows: 100000)


Chunk 887: 100%|██████████| 1063/1063 [00:00<00:00, 19774.10it/s]


Processing chunk 888... (rows: 100000)


Chunk 888: 100%|██████████| 964/964 [00:00<00:00, 20397.27it/s]


Processing chunk 889... (rows: 100000)


Chunk 889: 100%|██████████| 1060/1060 [00:00<00:00, 19158.10it/s]


Processing chunk 890... (rows: 100000)


Chunk 890: 100%|██████████| 820/820 [00:00<00:00, 18496.29it/s]


Processing chunk 891... (rows: 100000)


Chunk 891: 100%|██████████| 864/864 [00:00<00:00, 19086.51it/s]


Processing chunk 892... (rows: 100000)


Chunk 892: 100%|██████████| 1109/1109 [00:00<00:00, 18908.31it/s]


Processing chunk 893... (rows: 100000)


Chunk 893: 100%|██████████| 857/857 [00:00<00:00, 18683.31it/s]


Processing chunk 894... (rows: 100000)


Chunk 894: 100%|██████████| 816/816 [00:00<00:00, 20435.10it/s]


Processing chunk 895... (rows: 100000)


Chunk 895: 100%|██████████| 739/739 [00:00<00:00, 19399.36it/s]


Processing chunk 896... (rows: 100000)


Chunk 896: 100%|██████████| 855/855 [00:00<00:00, 18974.33it/s]


Processing chunk 897... (rows: 100000)


Chunk 897: 100%|██████████| 710/710 [00:00<00:00, 17983.80it/s]


Processing chunk 898... (rows: 100000)


Chunk 898: 100%|██████████| 640/640 [00:00<00:00, 19649.19it/s]


Processing chunk 899... (rows: 100000)


Chunk 899: 100%|██████████| 1192/1192 [00:00<00:00, 19757.48it/s]


Processing chunk 900... (rows: 100000)


Chunk 900: 100%|██████████| 778/778 [00:00<00:00, 19038.99it/s]


Processing chunk 901... (rows: 100000)


Chunk 901: 100%|██████████| 1165/1165 [00:00<00:00, 20403.46it/s]


Processing chunk 902... (rows: 100000)


Chunk 902: 100%|██████████| 824/824 [00:00<00:00, 18525.05it/s]


Processing chunk 903... (rows: 100000)


Chunk 903: 100%|██████████| 1074/1074 [00:00<00:00, 20190.32it/s]


Processing chunk 904... (rows: 100000)


Chunk 904: 100%|██████████| 1173/1173 [00:00<00:00, 18848.39it/s]


Processing chunk 905... (rows: 100000)


Chunk 905: 100%|██████████| 900/900 [00:00<00:00, 19596.70it/s]


Processing chunk 906... (rows: 100000)


Chunk 906: 100%|██████████| 1092/1092 [00:00<00:00, 19638.29it/s]


Processing chunk 907... (rows: 100000)


Chunk 907: 100%|██████████| 1160/1160 [00:00<00:00, 21415.43it/s]


Processing chunk 908... (rows: 100000)


Chunk 908: 100%|██████████| 1158/1158 [00:00<00:00, 20328.15it/s]


Processing chunk 909... (rows: 100000)


Chunk 909: 100%|██████████| 1043/1043 [00:00<00:00, 18767.79it/s]


Processing chunk 910... (rows: 100000)


Chunk 910: 100%|██████████| 1026/1026 [00:00<00:00, 20601.95it/s]


Processing chunk 911... (rows: 100000)


Chunk 911: 100%|██████████| 820/820 [00:00<00:00, 19128.43it/s]


Processing chunk 912... (rows: 100000)


Chunk 912: 100%|██████████| 958/958 [00:00<00:00, 18225.76it/s]


Processing chunk 913... (rows: 100000)


Chunk 913: 100%|██████████| 1069/1069 [00:00<00:00, 20962.11it/s]


Processing chunk 914... (rows: 100000)


Chunk 914: 100%|██████████| 1333/1333 [00:00<00:00, 21243.88it/s]


Processing chunk 915... (rows: 100000)


Chunk 915: 100%|██████████| 1306/1306 [00:00<00:00, 19839.77it/s]


Processing chunk 916... (rows: 100000)


Chunk 916: 100%|██████████| 828/828 [00:00<00:00, 20940.54it/s]


Processing chunk 917... (rows: 100000)


Chunk 917: 100%|██████████| 885/885 [00:00<00:00, 20565.10it/s]


Processing chunk 918... (rows: 100000)


Chunk 918: 100%|██████████| 760/760 [00:00<00:00, 18442.47it/s]


Processing chunk 919... (rows: 100000)


Chunk 919: 100%|██████████| 856/856 [00:00<00:00, 19581.06it/s]


Processing chunk 920... (rows: 100000)


Chunk 920: 100%|██████████| 682/682 [00:00<00:00, 17704.61it/s]


Processing chunk 921... (rows: 100000)


Chunk 921: 100%|██████████| 789/789 [00:00<00:00, 19395.88it/s]


Processing chunk 922... (rows: 100000)


Chunk 922: 100%|██████████| 596/596 [00:00<00:00, 19801.69it/s]


Processing chunk 923... (rows: 100000)


Chunk 923: 100%|██████████| 776/776 [00:00<00:00, 18576.56it/s]


Processing chunk 924... (rows: 100000)


Chunk 924: 100%|██████████| 714/714 [00:00<00:00, 20312.78it/s]


Processing chunk 925... (rows: 100000)


Chunk 925: 100%|██████████| 691/691 [00:00<00:00, 18856.38it/s]


Processing chunk 926... (rows: 100000)


Chunk 926: 100%|██████████| 659/659 [00:00<00:00, 16885.55it/s]


Processing chunk 927... (rows: 100000)


Chunk 927: 100%|██████████| 860/860 [00:00<00:00, 20891.84it/s]


Processing chunk 928... (rows: 100000)


Chunk 928: 100%|██████████| 649/649 [00:00<00:00, 18885.79it/s]


Processing chunk 929... (rows: 100000)


Chunk 929: 100%|██████████| 651/651 [00:00<00:00, 17747.98it/s]


Processing chunk 930... (rows: 100000)


Chunk 930: 100%|██████████| 531/531 [00:00<00:00, 19519.33it/s]


Processing chunk 931... (rows: 100000)


Chunk 931: 100%|██████████| 561/561 [00:00<00:00, 19172.20it/s]


Processing chunk 932... (rows: 100000)


Chunk 932: 100%|██████████| 678/678 [00:00<00:00, 17920.31it/s]


Processing chunk 933... (rows: 100000)


Chunk 933: 100%|██████████| 456/456 [00:00<00:00, 18911.01it/s]


Processing chunk 934... (rows: 100000)


Chunk 934: 100%|██████████| 545/545 [00:00<00:00, 18782.27it/s]


Processing chunk 935... (rows: 100000)


Chunk 935: 100%|██████████| 691/691 [00:00<00:00, 19403.12it/s]


Processing chunk 936... (rows: 100000)


Chunk 936: 100%|██████████| 690/690 [00:00<00:00, 18482.18it/s]


Processing chunk 937... (rows: 100000)


Chunk 937: 100%|██████████| 623/623 [00:00<00:00, 18991.30it/s]


Processing chunk 938... (rows: 100000)


Chunk 938: 100%|██████████| 725/725 [00:00<00:00, 18634.61it/s]


Processing chunk 939... (rows: 100000)


Chunk 939: 100%|██████████| 599/599 [00:00<00:00, 19267.22it/s]


Processing chunk 940... (rows: 100000)


Chunk 940: 100%|██████████| 677/677 [00:00<00:00, 18983.19it/s]


Processing chunk 941... (rows: 100000)


Chunk 941: 100%|██████████| 838/838 [00:00<00:00, 19054.27it/s]


Processing chunk 942... (rows: 100000)


Chunk 942: 100%|██████████| 757/757 [00:00<00:00, 20848.69it/s]


Processing chunk 943... (rows: 100000)


Chunk 943: 100%|██████████| 536/536 [00:00<00:00, 18632.54it/s]


Processing chunk 944... (rows: 100000)


Chunk 944: 100%|██████████| 761/761 [00:00<00:00, 19035.23it/s]


Processing chunk 945... (rows: 100000)


Chunk 945: 100%|██████████| 573/573 [00:00<00:00, 18680.04it/s]


Processing chunk 946... (rows: 100000)


Chunk 946: 100%|██████████| 570/570 [00:00<00:00, 17309.00it/s]


Processing chunk 947... (rows: 100000)


Chunk 947: 100%|██████████| 696/696 [00:00<00:00, 18314.93it/s]


Processing chunk 948... (rows: 100000)


Chunk 948: 100%|██████████| 747/747 [00:00<00:00, 18805.15it/s]


Processing chunk 949... (rows: 100000)


Chunk 949: 100%|██████████| 826/826 [00:00<00:00, 19260.89it/s]


Processing chunk 950... (rows: 100000)


Chunk 950: 100%|██████████| 720/720 [00:00<00:00, 19542.73it/s]


Processing chunk 951... (rows: 100000)


Chunk 951: 100%|██████████| 806/806 [00:00<00:00, 19511.43it/s]


Processing chunk 952... (rows: 12527)


Chunk 952: 100%|██████████| 122/122 [00:00<00:00, 15433.72it/s]


Step 3 Complete:
  Principals rows processed: 39,624,377
  Principals rows kept: 1,493,177
  Principals rows filtered out: 38,131,200
  Total persons: 957,242
  ACTED_IN triples: 1,493,177





In [7]:
# Step 4: Create entity and relation mappings
# Map string entities/relations to integer IDs

print("=" * 60)
print("Step 4: Creating entity and relation mappings")
print("=" * 60)

# Collect all entities
all_entities = set()
all_entities.update(movies)
all_entities.update(persons)
all_entities.update(genres)

# Create entity mapping (string -> integer ID)
entity_map = {entity: idx for idx, entity in enumerate(sorted(all_entities), start=1)}
# Reserve 0 for padding if needed (optional)

# Create relation mapping
relations = {'HAS_GENRE', 'DIRECTED', 'WROTE', 'ACTED_IN'}
relation_map = {rel: idx for idx, rel in enumerate(sorted(relations), start=1)}

print(f"Entity mapping created: {len(entity_map)} entities")
print(f"Relation mapping created: {len(relation_map)} relations")
print(f"\nRelations: {sorted(relations)}")


Step 4: Creating entity and relation mappings
Entity mapping created: 1401319 entities
Relation mapping created: 4 relations

Relations: ['ACTED_IN', 'DIRECTED', 'HAS_GENRE', 'WROTE']


In [8]:
# Step 5: Convert triples to integer IDs and create final dataset

print("=" * 60)
print("Step 5: Converting triples to integer IDs")
print("=" * 60)

# Filter triples to only include entities that exist in our entity map
# (should be all, but safety check)
valid_triples = []
missing_entities = set()

for rel, head, tail in tqdm(triples, desc="Converting triples"):
    if head in entity_map and tail in entity_map and rel in relation_map:
        valid_triples.append({
            'head': entity_map[head],
            'relation': relation_map[rel],
            'tail': entity_map[tail],
            'head_str': head,
            'relation_str': rel,
            'tail_str': tail
        })
    else:
        if head not in entity_map:
            missing_entities.add(head)
        if tail not in entity_map:
            missing_entities.add(tail)

if missing_entities:
    print(f"Warning: {len(missing_entities)} entities not found in mapping (should not happen)")

print(f"Valid triples: {len(valid_triples)}")
print(f"Original triples: {len(triples)}")
print(f"Filtered out: {len(triples) - len(valid_triples)}")


Step 5: Converting triples to integer IDs


Converting triples: 100%|██████████| 3262812/3262812 [00:05<00:00, 633795.17it/s]

Valid triples: 3262812
Original triples: 3262812
Filtered out: 0





In [9]:
# Step 6: Save outputs to Google Drive

print("=" * 60)
print("Step 6: Saving outputs")
print("=" * 60)

# Determine output filenames based on mode
if USE_FILTERED_MODE:
    triples_filename = 'triples_filtered.csv'
    entity_map_filename = 'entity_map_filtered.csv'
    relation_map_filename = 'relation_map_filtered.csv'
else:
    triples_filename = 'triples.csv'
    entity_map_filename = 'entity_map.csv'
    relation_map_filename = 'relation_map.csv'

# Save triples.csv (with integer IDs)
triples_df = pd.DataFrame(valid_triples)
triples_df[['head', 'relation', 'tail']].to_csv(
    os.path.join(OUTPUT_DIR, triples_filename),
    index=False
)
print(f"Saved: {OUTPUT_DIR}/{triples_filename}")
print(f"  Shape: {triples_df.shape}")

# Save entity_map.csv
entity_map_df = pd.DataFrame([
    {'entity_id': idx, 'entity': entity}
    for entity, idx in sorted(entity_map.items(), key=lambda x: x[1])
])
entity_map_df.to_csv(
    os.path.join(OUTPUT_DIR, entity_map_filename),
    index=False
)
print(f"Saved: {OUTPUT_DIR}/{entity_map_filename}")
print(f"  Shape: {entity_map_df.shape}")

# Save relation_map.csv
relation_map_df = pd.DataFrame([
    {'relation_id': idx, 'relation': rel}
    for rel, idx in sorted(relation_map.items(), key=lambda x: x[1])
])
relation_map_df.to_csv(
    os.path.join(OUTPUT_DIR, relation_map_filename),
    index=False
)
print(f"Saved: {OUTPUT_DIR}/{relation_map_filename}")
print(f"  Shape: {relation_map_df.shape}")

print("\nAll outputs saved successfully!")


Step 6: Saving outputs
Saved: /content/drive/MyDrive/kg_output_filtered/triples_filtered.csv
  Shape: (3262812, 6)
Saved: /content/drive/MyDrive/kg_output_filtered/entity_map_filtered.csv
  Shape: (1401319, 2)
Saved: /content/drive/MyDrive/kg_output_filtered/relation_map_filtered.csv
  Shape: (4, 2)

All outputs saved successfully!


In [10]:
# Step 7: Print statistics

print("=" * 60)
print("KNOWLEDGE GRAPH STATISTICS")
print("=" * 60)

print(f"\nMode: {'FILTERED KG' if USE_FILTERED_MODE else 'FULL KG'}")

print(f"\nEntities:")
print(f"  Movies: {len(movies):,}")
print(f"  Persons: {len(persons):,}")
print(f"  Genres: {len(genres)}")
print(f"  Total entities: {len(all_entities):,}")

print(f"\nRelations:")
for rel in sorted(relations):
    count = sum(1 for t in valid_triples if t['relation_str'] == rel)
    print(f"  {rel}: {count:,} triples")

print(f"\nTotal triples: {len(valid_triples):,}")

# Comparison with full KG if in filtered mode
if USE_FILTERED_MODE and FULL_KG_STATS.get('triples'):
    reduction_ratio = len(valid_triples) / FULL_KG_STATS['triples']
    print(f"\nReduction vs Full KG:")
    print(f"  Full KG triples: {FULL_KG_STATS['triples']:,}")
    print(f"  Filtered KG triples: {len(valid_triples):,}")
    print(f"  Reduction ratio: {reduction_ratio:.2%}")
    print(f"  Size reduction: {(1 - reduction_ratio):.2%}")

if USE_FILTERED_MODE and FULL_KG_STATS.get('movies'):
    movie_reduction = len(movies) / FULL_KG_STATS['movies']
    print(f"\n  Full KG movies: {FULL_KG_STATS['movies']:,}")
    print(f"  Filtered KG movies: {len(movies):,}")
    print(f"  Movie reduction ratio: {movie_reduction:.2%}")

print(f"\nOutput files:")
if USE_FILTERED_MODE:
    print(f"  {OUTPUT_DIR}/triples_filtered.csv")
    print(f"  {OUTPUT_DIR}/entity_map_filtered.csv")
    print(f"  {OUTPUT_DIR}/relation_map_filtered.csv")
else:
    print(f"  {OUTPUT_DIR}/triples.csv")
    print(f"  {OUTPUT_DIR}/entity_map.csv")
    print(f"  {OUTPUT_DIR}/relation_map.csv")

print("\n" + "=" * 60)
print("Pipeline complete!")
print("=" * 60)


KNOWLEDGE GRAPH STATISTICS

Mode: FILTERED KG

Entities:
  Movies: 444,051
  Persons: 957,242
  Genres: 26
  Total entities: 1,401,319

Relations:
  ACTED_IN: 1,493,177 triples
  DIRECTED: 493,393 triples
  HAS_GENRE: 689,724 triples
  WROTE: 586,518 triples

Total triples: 3,262,812

Reduction vs Full KG:
  Full KG triples: 81,000,000
  Filtered KG triples: 3,262,812
  Reduction ratio: 4.03%
  Size reduction: 95.97%

  Full KG movies: 16,500,000
  Filtered KG movies: 444,051
  Movie reduction ratio: 2.69%

Output files:
  /content/drive/MyDrive/kg_output_filtered/triples_filtered.csv
  /content/drive/MyDrive/kg_output_filtered/entity_map_filtered.csv
  /content/drive/MyDrive/kg_output_filtered/relation_map_filtered.csv

Pipeline complete!


# TransE Training Stage with PyKEEN

This section trains TransE embeddings on the filtered Knowledge Graph.

**Steps:**
1. Load filtered triples from Google Drive
2. Create PyKEEN TriplesFactory
3. Train/validation/test split
4. Train TransE model
5. Evaluate with ranking metrics
6. Save embeddings and metadata

In [11]:
# Install PyKEEN and dependencies
%pip install pykeen -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.3/730.3 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
# TransE Training Configuration
import pandas as pd
import numpy as np
import random
import json
import torch
from pathlib import Path
from pykeen.triples import TriplesFactory
from pykeen.evaluation import RankBasedEvaluator
from pykeen.pipeline import pipeline
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# ============================================================================
# TRANSE TRAINING CONFIGURATION
# ============================================================================
EMBEDDING_DIM = 128
NUM_EPOCHS = 30
BATCH_SIZE = 512  # Reasonable for Colab GPU
LEARNING_RATE = 0.001
MARGIN = 1.0  # Margin for MarginRankingLoss
NEGATIVE_SAMPLER_KWARGS = {'num_negs_per_pos': 1}  # Default negative sampling

# Train/validation/test split ratios
TRAIN_RATIO = 0.8
VALID_RATIO = 0.1
TEST_RATIO = 0.1

# Paths
FILTERED_KG_DIR = '/content/drive/MyDrive/kg_output_filtered'
TRIPLES_FILE = f'{FILTERED_KG_DIR}/triples_filtered.csv'
ENTITY_MAP_FILE = f'{FILTERED_KG_DIR}/entity_map_filtered.csv'
RELATION_MAP_FILE = f'{FILTERED_KG_DIR}/relation_map_filtered.csv'
OUTPUT_DIR = f'{FILTERED_KG_DIR}/pykeen_transe'

# Dry-run mode: limit triples for quick testing
DRY_RUN_LIMIT = None  # Set to integer (e.g., 10000) for quick testing

# Device configuration
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

print("=" * 60)
print("TRANSE TRAINING CONFIGURATION")
print("=" * 60)
print(f"Embedding dimension: {EMBEDDING_DIM}")
print(f"Epochs: {NUM_EPOCHS}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Margin: {MARGIN}")
print(f"Device: {DEVICE}")
print(f"Train/Valid/Test split: {TRAIN_RATIO:.0%}/{VALID_RATIO:.0%}/{TEST_RATIO:.0%}")
print(f"Dry-run limit: {DRY_RUN_LIMIT if DRY_RUN_LIMIT else 'None (full dataset)'}")
print("=" * 60)

INFO:pykeen.utils:Using opt_einsum


TRANSE TRAINING CONFIGURATION
Embedding dimension: 128
Epochs: 30
Batch size: 512
Learning rate: 0.001
Margin: 1.0
Device: cuda
Train/Valid/Test split: 80%/10%/10%
Dry-run limit: None (full dataset)


In [13]:
# Mount Google Drive (if not already mounted)
from google.colab import drive
try:
    drive.mount('/content/drive', force_remount=False)
except:
    print("Drive already mounted or not in Colab environment")

# Create output directory
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
print(f"Output directory ready: {OUTPUT_DIR}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Output directory ready: /content/drive/MyDrive/kg_output_filtered/pykeen_transe


In [14]:
# Helper function: Load triples from CSV
def load_triples(filepath, limit=None):
    """
    Load triples from CSV file.

    Args:
        filepath: Path to triples CSV file
        limit: Optional limit on number of rows to read (for dry-run)

    Returns:
        DataFrame with columns: head, relation, tail
    """
    print(f"Loading triples from: {filepath}")

    if limit:
        print(f"  Dry-run mode: limiting to {limit:,} rows")
        triples_df = pd.read_csv(filepath, nrows=limit)
    else:
        triples_df = pd.read_csv(filepath)

    print(f"  Loaded {len(triples_df):,} triples")
    print(f"  Columns: {list(triples_df.columns)}")

    # Ensure we have the expected columns
    required_cols = ['head', 'relation', 'tail']
    if not all(col in triples_df.columns for col in required_cols):
        raise ValueError(f"Expected columns {required_cols}, got {list(triples_df.columns)}")

    # Remove any rows with missing values
    initial_count = len(triples_df)
    triples_df = triples_df.dropna(subset=required_cols)
    if len(triples_df) < initial_count:
        print(f"  Removed {initial_count - len(triples_df):,} rows with missing values")

    # Convert to string and strip whitespace
    for col in required_cols:
        triples_df[col] = triples_df[col].astype(str).str.strip()

    # Remove empty strings
    triples_df = triples_df[
        (triples_df['head'] != '') &
        (triples_df['relation'] != '') &
        (triples_df['tail'] != '')
    ]

    print(f"  Final triples count: {len(triples_df):,}")
    return triples_df

# Load triples
triples_df = load_triples(TRIPLES_FILE, limit=DRY_RUN_LIMIT)

Loading triples from: /content/drive/MyDrive/kg_output_filtered/triples_filtered.csv
  Loaded 3,262,812 triples
  Columns: ['head', 'relation', 'tail']
  Final triples count: 3,262,812


In [15]:
# Helper function: Build TriplesFactory from labeled triples
def build_factory(triples_df, create_inverse_triples=False):
    """
    Create PyKEEN TriplesFactory from DataFrame with string labels.

    Args:
        triples_df: DataFrame with columns head, relation, tail (string labels)
        create_inverse_triples: Whether to create inverse triples

    Returns:
        TriplesFactory object
    """
    print("=" * 60)
    print("Building TriplesFactory")
    print("=" * 60)

    # Convert to numpy array of string triples
    triples_array = triples_df[['head', 'relation', 'tail']].values

    print(f"  Input triples shape: {triples_array.shape}")
    print(f"  Unique entities: {len(set(triples_array[:, 0]) | set(triples_array[:, 2])):,}")
    print(f"  Unique relations: {len(set(triples_array[:, 1]))}")

    # Create TriplesFactory from labeled triples
    factory = TriplesFactory.from_labeled_triples(
        triples=triples_array,
        create_inverse_triples=create_inverse_triples
    )

    print(f"  Factory created successfully")
    print(f"  Number of entities: {factory.num_entities:,}")
    print(f"  Number of relations: {factory.num_relations}")
    print(f"  Number of triples: {factory.num_triples:,}")

    return factory

# Build TriplesFactory
triples_factory = build_factory(triples_df, create_inverse_triples=False)

Building TriplesFactory
  Input triples shape: (3262812, 3)
  Unique entities: 1,401,319
  Unique relations: 4
  Factory created successfully
  Number of entities: 1,401,319
  Number of relations: 4
  Number of triples: 3,238,794


In [16]:
# Create train/validation/test split
print("=" * 60)
print("Creating train/validation/test split")
print("=" * 60)

training, validation, testing = triples_factory.split(
    ratios=[TRAIN_RATIO, VALID_RATIO, TEST_RATIO],
    random_state=RANDOM_SEED
)

print(f"Training triples: {training.num_triples:,} ({training.num_triples/triples_factory.num_triples:.1%})")
print(f"Validation triples: {validation.num_triples:,} ({validation.num_triples/triples_factory.num_triples:.1%})")
print(f"Test triples: {testing.num_triples:,} ({testing.num_triples/triples_factory.num_triples:.1%})")
print(f"Random seed: {RANDOM_SEED}")

Creating train/validation/test split


INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [1189743, 323879, 323880]


Training triples: 2,591,035 (80.0%)
Validation triples: 323,879 (10.0%)
Test triples: 323,880 (10.0%)
Random seed: 42


In [17]:
# Helper function: Train TransE model
def train_model(training, validation, testing, device='cuda'):
    """
    Train TransE model using PyKEEN pipeline.

    Args:
        training: Training TriplesFactory
        validation: Validation TriplesFactory
        testing: Testing TriplesFactory
        device: Device to use ('cuda' or 'cpu')

    Returns:
        Trained model and training results
    """
    print("=" * 60)
    print("Training TransE Model")
    print("=" * 60)
    print(f"Device: {device}")
    print(f"Embedding dimension: {EMBEDDING_DIM}")
    print(f"Epochs: {NUM_EPOCHS}")
    print(f"Batch size: {BATCH_SIZE}")
    print(f"Learning rate: {LEARNING_RATE}")
    print("=" * 60)

    # Use PyKEEN pipeline for training
    # PyKEEN requires both training and testing to be specified
    result = pipeline(
        training=training,
        validation=validation,
        testing=testing,
        model='TransE',
        model_kwargs=dict(
            embedding_dim=EMBEDDING_DIM,
        ),
        training_kwargs=dict(
            num_epochs=NUM_EPOCHS,
            batch_size=BATCH_SIZE,
            use_tqdm=True,
        ),
        training_loop='SLCWA',
        negative_sampler='basic',
        negative_sampler_kwargs=NEGATIVE_SAMPLER_KWARGS,
        loss='marginranking',
        loss_kwargs=dict(margin=MARGIN),
        optimizer='Adam',
        optimizer_kwargs={'lr': LEARNING_RATE},
        device=device,
        random_seed=RANDOM_SEED,
    )

    print("\nTraining completed!")
    return result

# Train the model
training_result = train_model(training, validation, testing, device=DEVICE)
model = training_result.model

INFO:pykeen.pipeline.api:Using device: cuda
INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


Training TransE Model
Device: cuda
Embedding dimension: 128
Epochs: 30
Batch size: 512
Learning rate: 0.001


INFO:pykeen.nn.representation:Inferred unique=False for Embedding()


Training epochs on cuda:0:   0%|          | 0/30 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0.00/5.06k [00:00<?, ?batch/s]

Evaluating on cuda:0:   0%|          | 0.00/324k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 6796.25s seconds



Training completed!


In [18]:
# Fast evaluation: evaluate on subset of test triples for faster runtime
FAST_EVAL = True  # Set to True to evaluate on random subset (20,000 triples)
FAST_EVAL_SIZE = 20000  # Number of triples to use for fast evaluation

# Helper function: Evaluate model with ranking metrics
def evaluate_model(model, testing, training, validation, device='cuda'):
    """
    Evaluate model on test set using ranking metrics.

    Args:
        model: Trained PyKEEN model
        testing: Test TriplesFactory
        training: Training TriplesFactory (for filtering)
        validation: Validation TriplesFactory (for filtering)
        device: Device to use

    Returns:
        Dictionary of evaluation metrics
    """
    print("=" * 60)
    print("Evaluating Model on Test Set")
    print("=" * 60)

    evaluator = RankBasedEvaluator(metrics=["hits@k", "mrr", "mr"])

    # Move model to device
    model = model.to(device)

    # Prepare test triples
    test_triples = testing.mapped_triples

    # Fast evaluation: use random subset if enabled
    if FAST_EVAL and len(test_triples) > FAST_EVAL_SIZE:
        print(f"Fast evaluation enabled: sampling {FAST_EVAL_SIZE:,} triples from {len(test_triples):,} test triples")
        # Set random seed for reproducibility
        generator = torch.Generator(device=device)
        generator.manual_seed(RANDOM_SEED)

        # Randomly sample indices
        indices = torch.randperm(len(test_triples))[:FAST_EVAL_SIZE]
        test_triples = test_triples[indices]
        print(f"  Using {len(test_triples):,} triples for evaluation")
    else:
        print(f"Evaluating on full test set: {len(test_triples):,} triples")

    # Prepare additional filter triples (training and validation sets)
    # These are used to filter out known triples during evaluation
    additional_filter_triples = [
        training.mapped_triples,
        validation.mapped_triples
    ]

    # Evaluate
    metrics = evaluator.evaluate(
        model=model,
        mapped_triples=test_triples,
        additional_filter_triples=additional_filter_triples,
        batch_size=BATCH_SIZE,
        device=device,
    )

    # Convert RankBasedMetricResults to dictionary
    metrics_dict = metrics.to_dict()

    print("\nRanking Metrics:")
    print(f"  Hits@1: {metrics_dict.get('hits_at_1', 0):.4f}")
    print(f"  Hits@3: {metrics_dict.get('hits_at_3', 0):.4f}")
    print(f"  Hits@10: {metrics_dict.get('hits_at_10', 0):.4f}")
    print(f"  Mean Rank: {metrics_dict.get('mean_rank', 0):.2f}")
    print(f"  Mean Reciprocal Rank: {metrics_dict.get('mean_reciprocal_rank', 0):.4f}")

    return metrics_dict

# Evaluate model
test_metrics = evaluate_model(model, testing, training, validation, device=DEVICE)

Evaluating Model on Test Set
Fast evaluation enabled: sampling 20,000 triples from 323,880 test triples
  Using 20,000 triples for evaluation


Evaluating on cuda:0:   0%|          | 0.00/20.0k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 427.21s seconds



Ranking Metrics:
  Hits@1: 0.0000
  Hits@3: 0.0000
  Hits@10: 0.0000
  Mean Rank: 0.00
  Mean Reciprocal Rank: 0.0000


In [20]:
m = test_metrics["both"]["realistic"]

print("Hits@1:", m["hits_at_1"])
print("Hits@3:", m["hits_at_3"])
print("Hits@10:", m["hits_at_10"])

# نزدیک‌ترین معادل MR/MRR در این خروجی‌ها:
print("Mean Rank (arithmetic):", m["arithmetic_mean_rank"])
print("Harmonic Mean Rank:", m["harmonic_mean_rank"])
print("Inverse Harmonic Mean Rank:", m["inverse_harmonic_mean_rank"])


Hits@1: 0.0005
Hits@3: 0.061375
Hits@10: 0.087825
Mean Rank (arithmetic): 171739.734375
Harmonic Mean Rank: 27.99739646911621
Inverse Harmonic Mean Rank: 0.03571760654449463


In [None]:
import torch, gc
torch.cuda.empty_cache()
gc.collect()

5692

In [21]:
# Helper function: Compute Precision@K, Recall@K, F1@K
def compute_precision_recall_f1(model, testing, k=10, device='cuda', batch_size=512):
    """
    Compute Precision@K, Recall@K, and F1@K for test triples.

    Args:
        model: Trained PyKEEN model
        testing: Test TriplesFactory
        k: Top-K for precision/recall
        device: Device to use
        batch_size: Batch size for evaluation

    Returns:
        Dictionary with precision@k, recall@k, f1@k
    """
    print(f"\nComputing Precision@{k}, Recall@{k}, F1@{k}...")

    model = model.to(device)
    model.eval()

    test_triples = testing.mapped_triples.to(device)
    num_triples = len(test_triples)

    true_positives = 0
    total_positive = num_triples

    # Process in batches
    with torch.no_grad():
        for i in range(0, num_triples, batch_size):
            batch = test_triples[i:i+batch_size]

            # For each triple, get top-k predictions for tail
            h, r, t = batch[:, 0], batch[:, 1], batch[:, 2]

            # Get scores for all possible tails
            scores = model.score_t(batch)

            # Get top-k predictions
            _, top_k_indices = torch.topk(scores, k=k, dim=1)

            # Check if true tail is in top-k
            true_tails = t.unsqueeze(1)
            matches = (top_k_indices == true_tails).any(dim=1)
            true_positives += matches.sum().item()

    precision_k = true_positives / (num_triples * k) if num_triples > 0 else 0.0
    recall_k = true_positives / total_positive if total_positive > 0 else 0.0
    f1_k = 2 * (precision_k * recall_k) / (precision_k + recall_k) if (precision_k + recall_k) > 0 else 0.0

    return {
        f'precision_at_{k}': precision_k,
        f'recall_at_{k}': recall_k,
        f'f1_at_{k}': f1_k
    }

# Compute Precision@10, Recall@10, F1@10
prf_metrics = compute_precision_recall_f1(model, testing, k=10, device=DEVICE, batch_size=BATCH_SIZE)
print(f"  Precision@10: {prf_metrics['precision_at_10']:.4f}")
print(f"  Recall@10: {prf_metrics['recall_at_10']:.4f}")
print(f"  F1@10: {prf_metrics['f1_at_10']:.4f}")

# Merge with test_metrics
test_metrics.update(prf_metrics)


Computing Precision@10, Recall@10, F1@10...


OutOfMemoryError: CUDA out of memory. Tried to allocate 342.12 GiB. GPU 0 has a total capacity of 14.74 GiB of which 2.59 GiB is free. Process 2932 has 12.15 GiB memory in use. Of the allocated memory 2.21 GiB is allocated by PyTorch, and 9.83 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Step 1: MovieLens to IMDb Linkage

This section links MovieLens 20M movies to IMDb titles (movieId -> tconst).

**Goal:** Create a high-coverage mapping using title/year matching.

**Approach:**
1. Parse MovieLens titles to extract year and normalize titles
2. Build an index from IMDb title.basics.tsv (filtered for movies)
3. Match using normalized title + year, with fallback strategies
4. Output linkage CSV and summary statistics

In [26]:
# MovieLens-IMDb Linkage Configuration
import pandas as pd
import numpy as np
import json
import re
from collections import defaultdict
from pathlib import Path
from tqdm import tqdm

# Set random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# ============================================================================
# LINKAGE CONFIGURATION
# ============================================================================
DRIVE_ROOT = '/content/drive/MyDrive'

# Input paths
ML_MOVIES_FILE = f'{DRIVE_ROOT}/Knowledge Graph/movies.csv'  # MovieLens movies file
IMDB_BASICS_FILE = f'{DRIVE_ROOT}/Knowledge Graph/title.basics.tsv'  # IMDb title.basics.tsv

# Output path
OUTPUT_DIR = f'{DRIVE_ROOT}/ml_imdb_linkage'

# Processing configuration
IMDB_CHUNKSIZE = 100000  # Chunk size for reading IMDb TSV

print("=" * 60)
print("MOVIELENS-IMDB LINKAGE CONFIGURATION")
print("=" * 60)
print(f"MovieLens movies file: {ML_MOVIES_FILE}")
print(f"IMDb basics file: {IMDB_BASICS_FILE}")
print(f"Output directory: {OUTPUT_DIR}")
print("=" * 60)

MOVIELENS-IMDB LINKAGE CONFIGURATION
MovieLens movies file: /content/drive/MyDrive/Knowledge Graph/movies.csv
IMDb basics file: /content/drive/MyDrive/Knowledge Graph/title.basics.tsv
Output directory: /content/drive/MyDrive/ml_imdb_linkage


In [27]:
# Mount Google Drive (if not already mounted)
from google.colab import drive
try:
    drive.mount('/content/drive', force_remount=False)
except:
    print("Drive already mounted or not in Colab environment")

# Create output directory
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
print(f"Output directory ready: {OUTPUT_DIR}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Output directory ready: /content/drive/MyDrive/ml_imdb_linkage


In [36]:
# Helper function: Process MovieLens title (handle articles and alternate titles)
def process_ml_title(ml_title):
    """
    Process MovieLens title to handle:
    - Trailing articles: "X, The" -> "The X"
    - Alternate titles: remove "(a.k.a. ...)" or "(...)" before extracting year
    - Extract main title and year

    Args:
        ml_title: MovieLens title string

    Returns:
        Tuple of (main_title, year_int_or_None, original_title)
    """
    import unicodedata

    if pd.isna(ml_title) or ml_title == '':
        return '', None, str(ml_title) if not pd.isna(ml_title) else ''

    original_title = str(ml_title).strip()
    title_str = original_title

    # Step 1: Handle trailing articles before processing parentheses
    # Pattern: "X, The" -> "The X", "X, A" -> "A X", "X, An" -> "An X"
    article_patterns = [
        (r'^(.+?),\s+The\s*$', r'The \1'),
        (r'^(.+?),\s+A\s*$', r'A \1'),
        (r'^(.+?),\s+An\s*$', r'An \1')
    ]

    for pattern, replacement in article_patterns:
        if re.match(pattern, title_str, re.IGNORECASE):
            title_str = re.sub(pattern, replacement, title_str, flags=re.IGNORECASE).strip()
            break

    # Step 2: Remove alternate titles (everything from first "(" that's not the year)
    # Keep only the main title part before any parentheses
    # But we need to preserve the final "(YYYY)" if it exists
    # Strategy: extract year first, then strip everything from first "("

    # Extract year from last "(YYYY)" at end
    year = None
    year_match = re.search(r'\((\d{4})\)\s*$', title_str)
    if year_match:
        year = int(year_match.group(1))
        # Temporarily remove year for processing
        title_str_no_year = re.sub(r'\s*\(\d{4}\)\s*$', '', title_str).strip()
    else:
        title_str_no_year = title_str

    # Remove everything from first "(" onward (alternate titles, a.k.a., etc.)
    # Example: "Arena, The (a.k.a. Naked Warriors)" -> "Arena, The"
    # But we already moved "The" to front, so: "The Arena (a.k.a. Naked Warriors)" -> "The Arena"
    main_title = title_str_no_year
    first_paren_idx = main_title.find('(')
    if first_paren_idx >= 0:
        main_title = main_title[:first_paren_idx].strip()

    # If we removed the year parentheses, add it back conceptually
    # (year is already extracted above)

    return main_title, year, original_title

# Helper function: Extract title and year from MovieLens title
def extract_title_year(ml_title):
    """
    Extract title and year from MovieLens title format.
    Uses process_ml_title() to handle articles and alternate titles.

    Args:
        ml_title: MovieLens title string

    Returns:
        Tuple of (title_without_year, year_int_or_None)
    """
    main_title, year, _ = process_ml_title(ml_title)
    return main_title, year

# Helper function: Robust title normalization with unicode normalization
def normalize_title(text):
    """
    Robust title normalization for matching:
    - Unicode normalization (remove accents)
    - Lowercase
    - Strip leading/trailing whitespace
    - Convert "&" to "and"
    - Remove punctuation/symbols (keep alphanumerics and spaces only)
    - Collapse multiple spaces into one
    - Remove common stop words at beginning: "the", "a", "an"

    Args:
        text: Title string

    Returns:
        Normalized title string
    """
    import unicodedata

    if pd.isna(text) or text == '':
        return ''

    # Unicode normalization: remove accents (NFD + remove combining marks)
    # Convert to unicode string and normalize
    text_str = str(text)
    # Normalize to NFD (decomposed), then remove combining characters
    text_str = unicodedata.normalize('NFD', text_str)
    text_str = ''.join(c for c in text_str if unicodedata.category(c) != 'Mn')

    # Convert to string and lowercase
    title_str = text_str.lower().strip()

    # Convert "&" to "and"
    title_str = title_str.replace('&', 'and')

    # Remove punctuation/symbols (keep alphanumeric and spaces only)
    title_str = re.sub(r'[^\w\s]', ' ', title_str)

    # Collapse multiple spaces into one
    title_str = re.sub(r'\s+', ' ', title_str)

    # Strip leading/trailing spaces
    title_str = title_str.strip()

    # Remove common stop words at beginning only
    stop_words = ['the', 'a', 'an']
    words = title_str.split()
    if words and words[0] in stop_words:
        title_str = ' '.join(words[1:]).strip()

    return title_str

# Test normalization and extraction
test_titles = [
    "Toy Story (1995)",
    "The Shawshank Redemption (1994)",
    "Forrest Gump",
    "Pulp Fiction (1994)",
    "The Matrix (1999)",
    "Men in Black (1997)",
    "Men & Women (2000)",
    "Arena, The (a.k.a. Naked Warriors) (1974)",
    "Brute (Bandyta) (1998)",
    "Shakespeare, The (1996)",
    "Café Lumière (2003)"  # Test unicode normalization
]
print("Title extraction and normalization examples:")
for title in test_titles:
    title_clean, year = extract_title_year(title)
    norm = normalize_title(title_clean)
    print(f"  '{title}' -> clean: '{title_clean}', year: {year}, normalized: '{norm}'")

Title extraction and normalization examples:
  'Toy Story (1995)' -> clean: 'Toy Story', year: 1995, normalized: 'toy story'
  'The Shawshank Redemption (1994)' -> clean: 'The Shawshank Redemption', year: 1994, normalized: 'shawshank redemption'
  'Forrest Gump' -> clean: 'Forrest Gump', year: None, normalized: 'forrest gump'
  'Pulp Fiction (1994)' -> clean: 'Pulp Fiction', year: 1994, normalized: 'pulp fiction'
  'The Matrix (1999)' -> clean: 'The Matrix', year: 1999, normalized: 'matrix'
  'Men in Black (1997)' -> clean: 'Men in Black', year: 1997, normalized: 'men in black'
  'Men & Women (2000)' -> clean: 'Men & Women', year: 2000, normalized: 'men and women'
  'Arena, The (a.k.a. Naked Warriors) (1974)' -> clean: 'Arena, The', year: 1974, normalized: 'arena the'
  'Brute (Bandyta) (1998)' -> clean: 'Brute', year: 1998, normalized: 'brute'
  'Shakespeare, The (1996)' -> clean: 'Shakespeare, The', year: 1996, normalized: 'shakespeare the'
  'Café Lumière (2003)' -> clean: 'Café Lum

In [37]:
# Helper function: Load MovieLens movies
def load_movielens_movies(filepath):
    """
    Load MovieLens movies file and extract year/normalize titles.
    Handles articles and alternate titles using process_ml_title().

    Args:
        filepath: Path to movies.csv

    Returns:
        DataFrame with columns: movieId, title, genres, ml_title_clean, ml_title_norm, ml_year
    """
    print(f"Loading MovieLens movies from: {filepath}")

    df = pd.read_csv(filepath)
    print(f"  Loaded {len(df):,} movies")

    # Extract title and year using helper function (handles articles and alternates)
    title_year_pairs = df['title'].apply(extract_title_year)
    df['ml_title_clean'] = [pair[0] for pair in title_year_pairs]
    df['ml_year'] = [pair[1] for pair in title_year_pairs]
    df['ml_year'] = df['ml_year'].astype('Int64')  # Int64 allows NaN

    # Normalize cleaned titles (without year)
    df['ml_title_norm'] = df['ml_title_clean'].apply(normalize_title)

    # Count statistics
    movies_with_year = df['ml_year'].notna().sum()
    print(f"  Movies with year: {movies_with_year:,} ({movies_with_year/len(df)*100:.1f}%)")
    print(f"  Movies without year: {(df['ml_year'].isna().sum()):,}")
    print(f"  Sample processed titles:")
    for idx, row in df.head(5).iterrows():
        print(f"    '{row['title']}' -> clean: '{row['ml_title_clean']}' -> norm: '{row['ml_title_norm']}' (year: {row['ml_year']})")

    return df

# Load MovieLens movies
ml_movies = load_movielens_movies(ML_MOVIES_FILE)
print(f"\nSample MovieLens movies:")
print(ml_movies[['movieId', 'title', 'ml_year', 'ml_title_norm']].head(10))

Loading MovieLens movies from: /content/drive/MyDrive/Knowledge Graph/movies.csv
  Loaded 27,278 movies
  Movies with year: 27,255 (99.9%)
  Movies without year: 23
  Sample processed titles:
    'Toy Story (1995)' -> clean: 'Toy Story' -> norm: 'toy story' (year: 1995)
    'Jumanji (1995)' -> clean: 'Jumanji' -> norm: 'jumanji' (year: 1995)
    'Grumpier Old Men (1995)' -> clean: 'Grumpier Old Men' -> norm: 'grumpier old men' (year: 1995)
    'Waiting to Exhale (1995)' -> clean: 'Waiting to Exhale' -> norm: 'waiting to exhale' (year: 1995)
    'Father of the Bride Part II (1995)' -> clean: 'Father of the Bride Part II' -> norm: 'father of the bride part ii' (year: 1995)

Sample MovieLens movies:
   movieId                               title  ml_year  \
0        1                    Toy Story (1995)     1995   
1        2                      Jumanji (1995)     1995   
2        3             Grumpier Old Men (1995)     1995   
3        4            Waiting to Exhale (1995)     1995   

In [38]:
# Helper function: Stream IMDb basics and build index
def build_imdb_index_streaming(filepath, chunksize=100000):
    """
    Stream IMDb title.basics.tsv and build index for fast matching.

    Filter criteria:
    - titleType == "movie"
    - isAdult == 0

    Index structure:
    {
        normalized_title: [
            {'tconst': 'tt...', 'year': int or None, 'imdb_title_used': 'original title'},
            ...
        ]
    }

    Important: Include BOTH primaryTitle and originalTitle, avoid duplicates.

    Args:
        filepath: Path to title.basics.tsv
        chunksize: Chunk size for reading TSV

    Returns:
        Dictionary mapping normalized_title -> list of candidates
    """
    print("=" * 60)
    print("Building IMDb Index")
    print("=" * 60)
    print(f"Reading: {filepath}")
    print(f"Chunk size: {chunksize:,} rows")

    index = defaultdict(list)
    seen_tconst_title = set()  # Track (tconst, normalized_title) to avoid duplicates
    total_rows = 0
    filtered_rows = 0

    # Read in chunks
    chunks = pd.read_csv(filepath, sep='\t', chunksize=chunksize, low_memory=False)

    for chunk_idx, chunk in enumerate(chunks):
        total_rows += len(chunk)

        # Filter early: titleType == "movie" and isAdult == 0
        chunk = chunk[
            (chunk['titleType'] == 'movie') &
            (chunk['isAdult'].isin([0, '0']) | chunk['isAdult'].isna())
        ]
        filtered_rows += len(chunk)

        # Process each row
        for _, row in tqdm(chunk.iterrows(), total=len(chunk), desc=f"Chunk {chunk_idx + 1}", leave=False):
            tconst = str(row['tconst']).strip()
            if not tconst or tconst == '\\N':
                continue

            # Extract year (handle "\N" as missing)
            year = None
            start_year = row.get('startYear')
            if pd.notna(start_year) and str(start_year) != '\\N':
                try:
                    year = int(float(str(start_year)))
                except:
                    pass

            # Process primaryTitle
            if pd.notna(row.get('primaryTitle')):
                primary_title = str(row['primaryTitle']).strip()
                if primary_title:
                    title_norm = normalize_title(primary_title)
                    if title_norm:
                        key = (tconst, title_norm)
                        if key not in seen_tconst_title:
                            seen_tconst_title.add(key)
                            index[title_norm].append({
                                'tconst': tconst,
                                'year': year,
                                'imdb_title_used': primary_title
                            })

            # Process originalTitle (if different from primaryTitle)
            if pd.notna(row.get('originalTitle')):
                original_title = str(row['originalTitle']).strip()
                if original_title:
                    orig_title_norm = normalize_title(original_title)
                    if orig_title_norm:
                        key = (tconst, orig_title_norm)
                        if key not in seen_tconst_title:
                            seen_tconst_title.add(key)
                            index[orig_title_norm].append({
                                'tconst': tconst,
                                'year': year,
                                'imdb_title_used': original_title
                            })

        if (chunk_idx + 1) % 10 == 0:
            print(f"  Processed {chunk_idx + 1} chunks, {total_rows:,} total rows, {filtered_rows:,} movies")

    print(f"\nIndex built:")
    print(f"  Total rows processed: {total_rows:,}")
    print(f"  Movies filtered: {filtered_rows:,}")
    print(f"  Unique normalized titles in index: {len(index):,}")
    print(f"  Total index entries: {sum(len(v) for v in index.values()):,}")

    return dict(index)

# Build IMDb index
imdb_index = build_imdb_index_streaming(IMDB_BASICS_FILE, chunksize=IMDB_CHUNKSIZE)

Building IMDb Index
Reading: /content/drive/MyDrive/Knowledge Graph/title.basics.tsv
Chunk size: 100,000 rows




  Processed 10 chunks, 1,000,000 total rows, 220,833 movies




  Processed 20 chunks, 2,000,000 total rows, 261,783 movies




  Processed 30 chunks, 3,000,000 total rows, 299,779 movies




  Processed 40 chunks, 4,000,000 total rows, 339,379 movies




  Processed 50 chunks, 5,000,000 total rows, 378,827 movies




  Processed 60 chunks, 6,000,000 total rows, 426,129 movies




  Processed 70 chunks, 7,000,000 total rows, 471,946 movies




  Processed 80 chunks, 8,000,000 total rows, 517,916 movies




  Processed 90 chunks, 9,000,000 total rows, 568,267 movies




  Processed 100 chunks, 10,000,000 total rows, 627,273 movies




  Processed 110 chunks, 11,000,000 total rows, 680,031 movies




  Processed 120 chunks, 11,955,435 total rows, 718,637 movies

Index built:
  Total rows processed: 11,955,435
  Movies filtered: 718,637
  Unique normalized titles in index: 682,749
  Total index entries: 808,356


In [39]:
# Helper function: Relaxed title match (fallback)
def relaxed_match(norm_title, imdb_index):
    """
    Attempt relaxed matching strategies:
    1. Remove spaces from normalized title
    2. Remove subtitle after ":" and retry

    Simple deterministic approach without fuzzy matching.

    Args:
        norm_title: Normalized title
        imdb_index: IMDb index dictionary

    Returns:
        List of candidates or None
    """
    # Strategy 1: Remove all spaces
    title_no_spaces = norm_title.replace(' ', '')
    if title_no_spaces:
        candidates = []
        for key, values in imdb_index.items():
            key_no_spaces = key.replace(' ', '')
            if key_no_spaces == title_no_spaces:
                candidates.extend(values)
        if candidates:
            return candidates

    # Strategy 2: Remove subtitle after ":"
    if ':' in norm_title:
        title_no_subtitle = norm_title.split(':', 1)[0].strip()
        if title_no_subtitle:
            candidates = imdb_index.get(title_no_subtitle)
            if candidates:
                return candidates

    return None

# Helper function: Match MovieLens movies to IMDb
def match_movielens_to_imdb(ml_movies, imdb_index):
    """
    Match MovieLens movies to IMDb titles using two-stage strategy.

    Stage A: Title-only match
    - Lookup candidates by normalized_title
    - If exactly 1 candidate: match_type="title_only_unique"
    - If multiple candidates:
       - If ML year available: exact year match -> "title_year_exact"
       - Else: closest year (diff <= 1) -> "title_year_closest"
       - Else: ambiguous

    Stage B: Fallback (if still unmatched)
    - Attempt relaxed match (remove spaces)

    Args:
        ml_movies: DataFrame with MovieLens movies
        imdb_index: Dictionary mapping normalized_title -> candidates

    Returns:
        DataFrame with linkage results, match_type_counts, unmatched_samples, ambiguous_samples
    """
    print("=" * 60)
    print("Matching Movies")
    print("=" * 60)

    results = []
    match_type_counts = defaultdict(int)
    unmatched_samples = []
    ambiguous_samples = []

    # Track indices for random sampling
    unmatched_indices = []
    ambiguous_indices = []

    for idx, ml_row in tqdm(ml_movies.iterrows(), total=len(ml_movies), desc="Matching"):
        movie_id = ml_row['movieId']
        ml_title_raw = ml_row['title']
        ml_title_norm = ml_row['ml_title_norm']
        ml_year = ml_row['ml_year']

        # Initialize result
        result = {
            'movieId': movie_id,
            'ml_title_raw': ml_title_raw,
            'ml_title_norm': ml_title_norm,
            'ml_year': ml_year if pd.notna(ml_year) else None,
            'tconst': None,
            'imdb_title': None,
            'imdb_year': None,
            'match_type': 'unmatched'
        }

        # Skip if normalized title is empty
        if not ml_title_norm:
            results.append(result)
            match_type_counts['unmatched'] += 1
            unmatched_indices.append(idx)
            continue

        # STAGE A: Title-only match
        candidates = imdb_index.get(ml_title_norm, [])

        if not candidates:
            # No candidates found - try Stage B fallback
            relaxed_candidates = relaxed_match(ml_title_norm, imdb_index)
            if relaxed_candidates:
                candidates = relaxed_candidates
                # Use relaxed match logic
                if len(candidates) == 1:
                    match = candidates[0]
                    result['tconst'] = match['tconst']
                    result['imdb_title'] = match['imdb_title_used']
                    result['imdb_year'] = match['year']
                    result['match_type'] = 'title_only_unique'  # Relaxed match
                    results.append(result)
                    match_type_counts['title_only_unique'] += 1
                    continue
                # For multiple relaxed candidates, continue to main logic below
            else:
                # Still no candidates
                results.append(result)
                match_type_counts['unmatched'] += 1
                unmatched_indices.append(idx)
                continue

        # Process candidates
        if len(candidates) == 1:
            # Single candidate - match
            match = candidates[0]
            result['tconst'] = match['tconst']
            result['imdb_title'] = match['imdb_title_used']
            result['imdb_year'] = match['year']
            result['match_type'] = 'title_only_unique'
            results.append(result)
            match_type_counts['title_only_unique'] += 1
            continue

        # Multiple candidates - use year to disambiguate
        if pd.notna(ml_year):
            # Try exact year match first
            year_matches = [c for c in candidates if c['year'] == ml_year]
            if year_matches:
                match = year_matches[0]  # Take first if multiple
                result['tconst'] = match['tconst']
                result['imdb_title'] = match['imdb_title_used']
                result['imdb_year'] = match['year']
                result['match_type'] = 'title_year_exact'
                results.append(result)
                match_type_counts['title_year_exact'] += 1
                continue

            # Try closest year (only if difference <= 1)
            candidates_with_year = [c for c in candidates if c['year'] is not None]
            if candidates_with_year:
                best_match = min(candidates_with_year, key=lambda c: abs(c['year'] - ml_year))
                year_diff = abs(best_match['year'] - ml_year)
                if year_diff <= 1:
                    result['tconst'] = best_match['tconst']
                    result['imdb_title'] = best_match['imdb_title_used']
                    result['imdb_year'] = best_match['year']
                    result['match_type'] = 'title_year_closest'
                    results.append(result)
                    match_type_counts['title_year_closest'] += 1
                    continue

        # Multiple candidates but can't disambiguate
        result['match_type'] = 'ambiguous'
        results.append(result)
        match_type_counts['ambiguous'] += 1
        ambiguous_indices.append(idx)

    # Create DataFrame
    linkage_df = pd.DataFrame(results)

    # Print statistics
    print(f"\nMatching complete:")
    print(f"  Total movies: {len(linkage_df):,}")
    print(f"  Match type distribution:")
    for match_type, count in sorted(match_type_counts.items(), key=lambda x: -x[1]):
        pct = count / len(linkage_df) * 100
        print(f"    {match_type}: {count:,} ({pct:.1f}%)")

    matched_count = len(linkage_df[linkage_df['tconst'].notna()])
    coverage_pct = matched_count / len(linkage_df) * 100
    print(f"\n  Matched movies: {matched_count:,}")
    print(f"  Coverage: {coverage_pct:.2f}%")

    # Collect random samples for debugging (20 unmatched, 20 ambiguous)
    np.random.seed(RANDOM_SEED)
    if len(unmatched_indices) > 0:
        sample_unmatched_idx = np.random.choice(unmatched_indices, size=min(20, len(unmatched_indices)), replace=False)
        for idx in sample_unmatched_idx:
            ml_row = ml_movies.loc[idx]
            unmatched_samples.append((
                ml_row['movieId'],
                ml_row['title'],
                ml_row['ml_title_clean'],
                ml_row['ml_year']
            ))

    if len(ambiguous_indices) > 0:
        sample_ambiguous_idx = np.random.choice(ambiguous_indices, size=min(20, len(ambiguous_indices)), replace=False)
        for idx in sample_ambiguous_idx:
            ml_row = ml_movies.loc[idx]
            norm_title = ml_row['ml_title_norm']
            candidates = imdb_index.get(norm_title, [])
            candidate_years = [c['year'] for c in candidates if c['year'] is not None]
            ambiguous_samples.append((
                ml_row['movieId'],
                ml_row['title'],
                ml_row['ml_title_clean'],
                ml_row['ml_year'],
                candidate_years
            ))

    return linkage_df, match_type_counts, unmatched_samples, ambiguous_samples

# Match movies
linkage_df, match_type_counts, unmatched_samples, ambiguous_samples = match_movielens_to_imdb(ml_movies, imdb_index)

Matching Movies


Matching: 100%|██████████| 27278/27278 [25:38<00:00, 17.73it/s]


Matching complete:
  Total movies: 27,278
  Match type distribution:
    title_only_unique: 11,622 (42.6%)
    unmatched: 8,041 (29.5%)
    title_year_exact: 6,836 (25.1%)
    ambiguous: 484 (1.8%)
    title_year_closest: 295 (1.1%)

  Matched movies: 18,753
  Coverage: 68.75%





In [40]:
# Print debugging information
print("=" * 60)
print("DEBUGGING: Random Unmatched Samples")
print("=" * 60)
print(f"Showing {len(unmatched_samples)} random unmatched titles:")
for i, (movie_id, raw_title, processed_title, ml_year) in enumerate(unmatched_samples, 1):
    print(f"  {i}. movieId: {movie_id}, Raw: '{raw_title}'")
    print(f"      Processed: '{processed_title}', Year: {ml_year}")

print("\n" + "=" * 60)
print("DEBUGGING: Random Ambiguous Samples")
print("=" * 60)
print(f"Showing {len(ambiguous_samples)} random ambiguous cases:")
for i, (movie_id, raw_title, processed_title, ml_year, candidate_years) in enumerate(ambiguous_samples, 1):
    print(f"  {i}. movieId: {movie_id}, Raw: '{raw_title}'")
    print(f"      Processed: '{processed_title}', ML Year: {ml_year}, Candidate Years: {candidate_years}")

# Helper function: Save outputs
def save_outputs(linkage_df, match_type_counts, output_dir):
    """
    Save linkage results to CSV and summary to JSON.

    Args:
        linkage_df: DataFrame with linkage results
        match_type_counts: Dictionary with match type counts
        output_dir: Output directory path
    """
    print("\n" + "=" * 60)
    print("Saving Outputs")
    print("=" * 60)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Save linkage CSV
    linkage_file = output_path / 'movielens_imdb_linkage.csv'
    linkage_df.to_csv(linkage_file, index=False)
    print(f"Saved: {linkage_file}")
    print(f"  Shape: {linkage_df.shape}")

    # Calculate summary statistics
    total_movies = len(linkage_df)
    matched_movies = len(linkage_df[linkage_df['tconst'].notna()])
    coverage_pct = matched_movies / total_movies * 100 if total_movies > 0 else 0.0

    # Create summary dictionary
    summary = {
        'total_movies': int(total_movies),
        'matched_movies': int(matched_movies),
        'coverage_pct': float(coverage_pct),
        'counts_by_match_type': {k: int(v) for k, v in match_type_counts.items()}
    }

    # Save summary JSON
    summary_file = output_path / 'linkage_summary.json'
    with open(summary_file, 'w') as f:
        json.dump(summary, f, indent=2)
    print(f"Saved: {summary_file}")

    print("\nSummary:")
    print(f"  Total movies: {summary['total_movies']:,}")
    print(f"  Matched movies: {summary['matched_movies']:,}")
    print(f"  Coverage: {summary['coverage_pct']:.2f}%")

    return summary

# Save outputs
summary = save_outputs(linkage_df, match_type_counts, OUTPUT_DIR)

DEBUGGING: Random Unmatched Samples
Showing 20 random unmatched titles:
  1. movieId: 41025, Raw: 'Good Woman, A (2004)'
      Processed: 'Good Woman, A', Year: 2004
  2. movieId: 3005, Raw: 'Bone Collector, The (1999)'
      Processed: 'Bone Collector, The', Year: 1999
  3. movieId: 56945, Raw: 'Perfect Holiday, The (2007)'
      Processed: 'Perfect Holiday, The', Year: 2007
  4. movieId: 116030, Raw: 'Children of the Corn 666: Isaac's Return (1999)'
      Processed: 'Children of the Corn 666: Isaac's Return', Year: 1999
  5. movieId: 5565, Raw: 'Dogwalker, The (2002)'
      Processed: 'Dogwalker, The', Year: 2002
  6. movieId: 96430, Raw: 'Odd Life of Timothy Green, The (2012)'
      Processed: 'Odd Life of Timothy Green, The', Year: 2012
  7. movieId: 95756, Raw: 'Cat in the Hat, The (1971)'
      Processed: 'Cat in the Hat, The', Year: 1971
  8. movieId: 1997, Raw: 'Exorcist, The (1973)'
      Processed: 'Exorcist, The', Year: 1973
  9. movieId: 121879, Raw: 'Method to the Madness 

In [41]:
# Print final summary and sample matches
print("=" * 60)
print("MOVIELENS-IMDB LINKAGE SUMMARY")
print("=" * 60)

print(f"\nCoverage Statistics:")
print(f"  Total MovieLens movies: {summary['total_movies']:,}")
print(f"  Successfully matched: {summary['matched_movies']:,}")
print(f"  Coverage: {summary['coverage_pct']:.2f}%")

print(f"\nMatch Type Breakdown:")
for match_type, count in sorted(summary['counts_by_match_type'].items(), key=lambda x: -x[1]):
    pct = count / summary['total_movies'] * 100
    print(f"  {match_type}: {count:,} ({pct:.1f}%)")

print(f"\nOutput Files:")
print(f"  {OUTPUT_DIR}/movielens_imdb_linkage.csv")
print(f"  {OUTPUT_DIR}/linkage_summary.json")

print(f"\nSample Matches:")
sample_matched = linkage_df[linkage_df['tconst'].notna()].head(10)
print(sample_matched[['movieId', 'ml_title_raw', 'ml_year', 'tconst', 'imdb_title', 'imdb_year', 'match_type']].to_string(index=False))

print("\n" + "=" * 60)
print("Linkage Complete!")
print("=" * 60)

MOVIELENS-IMDB LINKAGE SUMMARY

Coverage Statistics:
  Total MovieLens movies: 27,278
  Successfully matched: 18,753
  Coverage: 68.75%

Match Type Breakdown:
  title_only_unique: 11,622 (42.6%)
  unmatched: 8,041 (29.5%)
  title_year_exact: 6,836 (25.1%)
  ambiguous: 484 (1.8%)
  title_year_closest: 295 (1.1%)

Output Files:
  /content/drive/MyDrive/ml_imdb_linkage/movielens_imdb_linkage.csv
  /content/drive/MyDrive/ml_imdb_linkage/linkage_summary.json

Sample Matches:
 movieId                       ml_title_raw  ml_year    tconst                  imdb_title  imdb_year        match_type
       1                   Toy Story (1995)   1995.0 tt0114709                   Toy Story     1995.0 title_only_unique
       2                     Jumanji (1995)   1995.0 tt0113497                     Jumanji     1995.0 title_only_unique
       3            Grumpier Old Men (1995)   1995.0 tt0113228            Grumpier Old Men     1995.0 title_only_unique
       4           Waiting to Exhale (1995)  

In [42]:
# Helper function: Save embeddings and metadata
def save_artifacts(model, triples_factory, training, validation, testing, test_metrics, output_dir):
    """
    Save model embeddings, mappings, and metadata to disk.

    Args:
        model: Trained PyKEEN model
        triples_factory: TriplesFactory used for training
        training_result: Training result object
        testing: Test TriplesFactory
        test_metrics: Test set evaluation metrics
        output_dir: Output directory path
    """
    print("=" * 60)
    print("Saving Artifacts")
    print("=" * 60)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Extract embeddings
    entity_embeddings = model.entity_representations[0](indices=None).detach().cpu().numpy()
    relation_embeddings = model.relation_representations[0](indices=None).detach().cpu().numpy()

    print(f"Entity embeddings shape: {entity_embeddings.shape}")
    print(f"Relation embeddings shape: {relation_embeddings.shape}")

    # Save embeddings as numpy arrays
    np.save(output_path / 'entity_embeddings.npy', entity_embeddings)
    print(f"  Saved: {output_path / 'entity_embeddings.npy'}")

    np.save(output_path / 'relation_embeddings.npy', relation_embeddings)
    print(f"  Saved: {output_path / 'relation_embeddings.npy'}")

    # Save entity and relation mappings
    entity_to_id = {entity: int(idx) for entity, idx in triples_factory.entity_to_id.items()}
    relation_to_id = {relation: int(idx) for relation, idx in triples_factory.relation_to_id.items()}

    with open(output_path / 'entity_to_id.json', 'w') as f:
        json.dump(entity_to_id, f, indent=2)
    print(f"  Saved: {output_path / 'entity_to_id.json'}")

    with open(output_path / 'relation_to_id.json', 'w') as f:
        json.dump(relation_to_id, f, indent=2)
    print(f"  Saved: {output_path / 'relation_to_id.json'}")

    # Create metadata dictionary
    metadata = {
        'model': 'TransE',
        'embedding_dim': EMBEDDING_DIM,
        'num_epochs': NUM_EPOCHS,
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE,
        'margin': MARGIN,
        'random_seed': RANDOM_SEED,
        'device': DEVICE,
        'dataset_sizes': {
            'total_triples': int(triples_factory.num_triples),
            'train_triples': int(training.num_triples),
            'valid_triples': int(validation.num_triples),
            'test_triples': int(testing.num_triples),
        },
        'num_entities': int(triples_factory.num_entities),
        'num_relations': int(triples_factory.num_relations),
        'evaluation_metrics': {k: float(v) if isinstance(v, (int, float, np.number)) else str(v)
                               for k, v in test_metrics.items()},
        'training_loss': float(training_result.losses[-1]) if hasattr(training_result, 'losses') and training_result.losses else None,
    }

    # Save metadata
    with open(output_path / 'metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)
    print(f"  Saved: {output_path / 'metadata.json'}")

    # Try to save model checkpoint if available
    try:
        checkpoint_path = output_path / 'model_checkpoint.pt'
        torch.save({
            'model_state_dict': model.state_dict(),
            'model_kwargs': {'embedding_dim': EMBEDDING_DIM},
            'entity_to_id': entity_to_id,
            'relation_to_id': relation_to_id,
        }, checkpoint_path)
        print(f"  Saved: {checkpoint_path}")
    except Exception as e:
        print(f"  Warning: Could not save model checkpoint: {e}")

    print("\nAll artifacts saved successfully!")
    return metadata

# Save all artifacts
metadata = save_artifacts(model, triples_factory, training, validation, testing, test_metrics, OUTPUT_DIR)

Saving Artifacts
Entity embeddings shape: (1401319, 128)
Relation embeddings shape: (4, 128)
  Saved: /content/drive/MyDrive/ml_imdb_linkage/entity_embeddings.npy
  Saved: /content/drive/MyDrive/ml_imdb_linkage/relation_embeddings.npy
  Saved: /content/drive/MyDrive/ml_imdb_linkage/entity_to_id.json
  Saved: /content/drive/MyDrive/ml_imdb_linkage/relation_to_id.json
  Saved: /content/drive/MyDrive/ml_imdb_linkage/metadata.json
  Saved: /content/drive/MyDrive/ml_imdb_linkage/model_checkpoint.pt

All artifacts saved successfully!


In [43]:
# Print final summary
print("=" * 60)
print("TRANSE TRAINING SUMMARY")
print("=" * 60)

print(f"\nModel Configuration:")
print(f"  Model: TransE")
print(f"  Embedding dimension: {EMBEDDING_DIM}")
print(f"  Epochs: {NUM_EPOCHS}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Learning rate: {LEARNING_RATE}")

print(f"\nDataset:")
print(f"  Total entities: {triples_factory.num_entities:,}")
print(f"  Total relations: {triples_factory.num_relations}")
print(f"  Total triples: {triples_factory.num_triples:,}")
print(f"  Train: {training.num_triples:,} ({training.num_triples/triples_factory.num_triples:.1%})")
print(f"  Valid: {validation.num_triples:,} ({validation.num_triples/triples_factory.num_triples:.1%})")
print(f"  Test: {testing.num_triples:,} ({testing.num_triples/triples_factory.num_triples:.1%})")

print(f"\nTest Set Evaluation:")
for metric_name, metric_value in test_metrics.items():
    if isinstance(metric_value, (int, float, np.number)):
        print(f"  {metric_name}: {metric_value:.4f}")

print(f"\nSaved Artifacts:")
print(f"  {OUTPUT_DIR}/entity_embeddings.npy")
print(f"  {OUTPUT_DIR}/relation_embeddings.npy")
print(f"  {OUTPUT_DIR}/entity_to_id.json")
print(f"  {OUTPUT_DIR}/relation_to_id.json")
print(f"  {OUTPUT_DIR}/metadata.json")

print("\n" + "=" * 60)
print("TransE Training Complete!")
print("=" * 60)

TRANSE TRAINING SUMMARY

Model Configuration:
  Model: TransE
  Embedding dimension: 128
  Epochs: 30
  Batch size: 512
  Learning rate: 0.001

Dataset:
  Total entities: 1,401,319
  Total relations: 4
  Total triples: 3,238,794
  Train: 2,591,035 (80.0%)
  Valid: 323,879 (10.0%)
  Test: 323,880 (10.0%)

Test Set Evaluation:

Saved Artifacts:
  /content/drive/MyDrive/ml_imdb_linkage/entity_embeddings.npy
  /content/drive/MyDrive/ml_imdb_linkage/relation_embeddings.npy
  /content/drive/MyDrive/ml_imdb_linkage/entity_to_id.json
  /content/drive/MyDrive/ml_imdb_linkage/relation_to_id.json
  /content/drive/MyDrive/ml_imdb_linkage/metadata.json

TransE Training Complete!


## movieId → kg_embedding