### TF-IDF Baseline: Regression on merged_train

We build a simple baseline using TF-IDF features to predict `production_date` (year). The notebook first trains a year-bin classifier on the TF-IDF features, then a Ridge regression that predicts the exact year. We evaluate on a held-out split and then predict missing years to compare later with BERT.


In [None]:
# IMPORT ALL THE NEEDED FILES AND LIBRARIES
import pickle
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import torch
from datasets import Dataset
from IPython.display import display, HTML
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import pipeline,BertTokenizer, TrainingArguments, Trainer, BertForSequenceClassification

# Project-local star imports stay last so any name they define keeps precedence, as before.
from scripts.helpers import *
from scripts.loading import *
from scripts.formating import *

%matplotlib inline
%load_ext autoreload
%autoreload 2

ModuleNotFoundError: No module named 'scripts.helpers'

In [None]:
# Load the raw records; the stray trailing characters after the call were a SyntaxError.
raw_dataset = load_wellcome_data(n_samples=100000)

In [None]:
# Maximum tolerated fraction of missing values per column.
threshold = 0.80
missing_pct = raw_dataset.isnull().sum().div(len(raw_dataset))

# Keep the columns whose missing fraction is at or under the threshold.
cols_to_keep = [name for name, fraction in missing_pct.items() if fraction <= threshold]

# "description" is kept even when it exceeds the threshold, because of its relevance.
description_was_dropped = (
    "description" in raw_dataset.columns and "description" not in cols_to_keep
)
if description_was_dropped:
    cols_to_keep.append("description")

filtered = raw_dataset[cols_to_keep]

In [None]:
# List the columns that survived the missing-value filter, one per line.
cols = list(filtered.columns)
print(f'We now have {len(cols)} columns:')
print(*cols, sep='\n')

In [None]:
# Report the size of the filtered frame.
n_rows, n_cols = filtered.shape
print(f"\nOur filtered dataset has: {n_cols} columns and {n_rows} rows.")

# Count of missing values in each remaining column, drawn as a bar chart.
missing_values = filtered.isna().sum()

fig, ax = plt.subplots(figsize=(12, 6))
missing_values.plot(kind='bar', ax=ax)
ax.set_title("Missing Values per Column")
ax.set_ylabel("Number of Missing Values")
ax.set_xlabel("Columns")
plt.show()

print_one_random_record(filtered)

In [None]:
# Remove non-predictable columns and those with no correlation to the target
columns_to_remove = [
    # System IDs - just internal database identifiers, not predictable
    'id', 'workType_id', 'contributor_ids', 'subject_ids', 'language_ids',
    'identifiers', 'sierra_system_number',

    # Counts - database metadata, not intrinsic properties of the work
    'items_count', 'parts_count', 'holdings_count', 'images_count',

    # Operational data - library-specific, not about the work itself
    'has_digitized_items', 'availability_status',

    # Less useful duplicates - keep the main version, remove auxiliary
    'note_types'  # Keep 'notes', remove the types since notes contain more info
]
# Which columns reach this cell depends on the data-driven threshold filter, so
# a listed column may already be gone; errors="ignore" avoids a KeyError then.
# drop() already returns a new frame, so no explicit copy is needed.
necessary_df = filtered.drop(columns=columns_to_remove, errors="ignore")

# Display percentage of missing values per column with a custom styled table
null_pct = necessary_df.isnull().mean().sort_values(ascending=False) * 100
train_null = pd.DataFrame({
    "Column": null_pct.index,
    "% Null": [f"{v:.2f}%" for v in null_pct.values]
})

css = """
<style> 
.table-fixed {border-collapse: collapse; width: 70%; max-width: 900px;} 
.table-fixed th, 
.table-fixed td {border: 1px solid #ddd; padding: 6px 10px; text-align: left;} 
.table-fixed th {background:#808080; font-weight:600;} 
</style>
"""
print('We finally have only those columns left')
display(HTML(css + train_null.to_html(index=False, classes="table-fixed", escape=False)))

In [None]:
# Spot-check the pruned frame. The helper is not defined in this notebook
# (presumably it comes from the scripts.* star imports and prints one sampled row — verify).
print_one_random_record(necessary_df)

In [None]:
# Reproducible random split: sample the training rows, the remainder is the test set.
train_test_ratio = 0.8
train = necessary_df.sample(frac=train_test_ratio, random_state=42)
test = necessary_df.drop(index=train.index)
print(
    f"We decide to use {train_test_ratio*100}% of the data for the training.\n"
    f"Therefore we have a training sample of size {train.shape} and the test one of size {test.shape}"
)

In [None]:
# For each categorical column, count how many of the most frequent values stay
# within the given cumulative share of the rows.
for column, coverage in (("production_places", 0.80), ("languages", 0.95), ("workType", 0.95)):
    cumsum = train[column].value_counts(normalize=True).cumsum()
    print(f"We need the top {(cumsum <= coverage).sum()} of the values to span {coverage:.0%} of all the {column} inside the dataset")


In [None]:
# Show which Python types actually occur in every column.
for col in train.columns:
    types = train[col].map(lambda cell: type(cell).__name__).unique()
    print(col, "->", types)

In [None]:
# Configuration for the year-bin classification baseline.
BIN_SIZE = 30  # Width of a year bin; bins start at multiples of BIN_SIZE (e.g., 30 -> 1980-2009, 2010-2039)
MODEL_TYPE = 'logistic'  # Options: 'logistic' or 'random_forest'
USE_NUMERICAL_FEATURES = False  # Set to True to add the numerical text features to the TF-IDF matrix

# Output paths for the pickled model, vectorizer and (optional) scaler.
MODEL_SAVE_PATH = "./bin_classification_model.pkl"
VECTORIZER_SAVE_PATH = "./bin_classification_vectorizer.pkl"
SCALER_SAVE_PATH = "./bin_classification_scaler.pkl"

# NOTE(review): `merged_train` is not defined in any cell of this notebook that is
# visible here (only `train` and `test` are) — confirm where it is created,
# otherwise this cell fails on Restart & Run All.
print(f"\nTraining Data:")
print(f"  Shape: {merged_train.shape}")
print(f"  Columns: {list(merged_train.columns)}")

print(f"\nTest Data:")
print(f"  Shape: {test.shape}")
print(f"  Columns: {list(test.columns)}")

In [None]:

def format_row(row):
    """
    Format row to match training data format: [COLUMN: {value}]
    """
    parts = []
    for col in row.index:
        col_upper = col.upper()
        value = row[col]
        if pd.isna(value):
            value = ""
        parts.append(f"[{col_upper}: {{{value}}}]")
    return " ".join(parts)


def create_merged_column_matching_train(df):
    """
    Build a frame holding the MERGED text column and the numeric target.

    Every column except ``production_date``, ``thumbnail_url`` and ``year_bin``
    is serialised by ``format_row`` into one ``[COLUMN: {value}]`` string per row.

    Args:
        df: source DataFrame; must contain ``production_date``.

    Returns:
        DataFrame with ``MERGED``, ``production_date`` (coerced to numeric,
        unparseable values become NaN) and, when present in ``df``,
        ``year_bin``. The index is reset to 0..n-1.
    """
    print("\nCreating MERGED column with format: [COLUMN: {value}]...")

    # Target and auxiliary columns must not end up in the text.
    excluded = {"production_date", "thumbnail_url", "year_bin"}
    columns_to_merge = [name for name in df.columns if name not in excluded]
    print(f"  Columns to merge: {columns_to_merge}")

    result = pd.DataFrame()
    result["MERGED"] = df[columns_to_merge].apply(format_row, axis=1)
    result["production_date"] = pd.to_numeric(df["production_date"], errors="coerce")
    if "year_bin" in df.columns:
        result["year_bin"] = df["year_bin"]
    result = result.reset_index(drop=True)

    print(f"MERGED column created")
    print(f" Result shape: {result.shape}")

    # Preview the start of the first merged string.
    if len(result) > 0:
        first_text = result['MERGED'].iloc[0]
        print(f"\n  Sample MERGED text (first 200 chars):")
        print(f"  {first_text[:200]}...")

    return result


# Build the MERGED text column for the test split once; the guard keeps the
# cell harmless when it is re-run after `test` has already been replaced.
if 'MERGED' not in test.columns:
    print("\nMERGED column not found in test data")
    test_with_merged = create_merged_column_matching_train(test)

    # From here on `test` is the merged representation.
    test = test_with_merged

    print(f"  Final test shape: {test.shape}")
    print(f"  Columns: {list(test.columns)}")


In [None]:

def preprocess_dataset(df, is_train=True, min_year=1465, max_year=2025):
    """
    Clean the target column and add simple text-length features.

    Steps (each only if the relevant column exists):
      1. ``production_date`` is coerced to numeric, missing values are filled
         with the column median, and rows outside ``[min_year, max_year]``
         are dropped.
      2. From ``MERGED``: ``text_length`` (characters), ``word_count``
         (whitespace-separated tokens) and ``avg_word_length``
         (``text_length / (word_count + 1)``).

    Args:
        df: input DataFrame; it is not modified.
        is_train: only selects the word used in the log line.
        min_year: lowest year kept (inclusive).
        max_year: highest year kept (inclusive).

    Returns:
        A new, independent DataFrame with the original index labels of the
        rows that were kept.
    """
    print(f"\nPreprocessing {'training' if is_train else 'test'} data...")
    df = df.copy()

    if 'production_date' in df.columns:
        # Same coercion as create_merged_column_matching_train, so string
        # years do not break the median or the range comparison.
        df['production_date'] = pd.to_numeric(df['production_date'], errors='coerce')

        # NOTE(review): the target itself is imputed here, so downstream
        # metrics include rows whose "true" year is the median — confirm intended.
        missing_count = df['production_date'].isna().sum()
        if missing_count > 0:
            median_year = df['production_date'].median()
            df['production_date'] = df['production_date'].fillna(median_year)
            print(f"Filled {missing_count} missing dates with median: {median_year}")

        before = len(df)
        # .copy() detaches the filtered rows, so the column assignments below
        # (and in later cells) do not trigger SettingWithCopyWarning.
        df = df[df['production_date'].between(min_year, max_year)].copy()
        removed = before - len(df)
        if removed > 0:
            print(f"Filtered to years {min_year}-{max_year}: removed {removed} rows ({removed/before*100:.2f}%)")
        print(f"Year range: {df['production_date'].min():.0f} - {df['production_date'].max():.0f}")

    if 'MERGED' in df.columns:
        df['text_length'] = df['MERGED'].str.len()
        df['word_count'] = df['MERGED'].str.split().str.len()
        # +1 keeps the division defined for empty strings.
        df['avg_word_length'] = df['text_length'] / (df['word_count'] + 1)
        print(f" Created text features")

    return df


# Run the shared preprocessing on both splits and report the resulting sizes.
train_processed = preprocess_dataset(merged_train, is_train=True)
test_processed = preprocess_dataset(test, is_train=False)

for split_name, split_df in (("Train", train_processed), ("Test", test_processed)):
    print(f"{split_name} shape: {split_df.shape}")


In [None]:

def create_bins(df, bin_size):
    """
    Add year-bin columns derived from ``production_date``.

    Args:
        df: DataFrame with a numeric, non-missing ``production_date`` column.
        bin_size: width of a bin in years.

    Returns:
        A copy of ``df`` with two extra columns:
        ``year_bin`` — the first year of the bin, i.e. the year floored to a
        multiple of ``bin_size``; and ``year_bin_label`` — the string
        ``"<first>-<last>"`` with ``last = first + bin_size - 1``.
    """
    bin_start = (df['production_date'] // bin_size) * bin_size
    bin_end = bin_start + bin_size - 1
    labels = bin_start.astype(int).astype(str) + '-' + bin_end.astype(int).astype(str)

    binned = df.copy()
    binned['year_bin'] = bin_start
    binned['year_bin_label'] = labels
    return binned

# Bin both splits with the configured bin width.
train_binned = create_bins(train_processed, BIN_SIZE)
test_binned = create_bins(test_processed, BIN_SIZE)

print(f"\nBin size: {BIN_SIZE} years")

# Same three summary lines for each split.
for split_title, split_df in (("Training", train_binned), ("Test", test_binned)):
    first_year = split_df['production_date'].min()
    last_year = split_df['production_date'].max()
    print(f"\n{split_title} data:")
    print(f"Samples: {len(split_df)}")
    print(f"Unique bins: {split_df['year_bin'].nunique()}")
    print(f"Year range: {first_year:.0f} - {last_year:.0f}")

print(f"\nTop 10 bins (training):")
print(train_binned['year_bin_label'].value_counts().head(10))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Uni- and bi-gram TF-IDF over the MERGED text, capped at 10k features.
# Terms seen in fewer than 3 documents or in more than 80% of them are dropped.
vectorizer = TfidfVectorizer(
    max_features=10000,
    min_df=3,
    max_df=0.8,
    ngram_range=(1, 2),
    strip_accents='unicode',
    lowercase=True,
    stop_words='english',
    sublinear_tf=True
)

# The vocabulary is learned on the training split only, then reused for test.
print("\nFitting TF-IDF on training data...")
X_train_text = vectorizer.fit_transform(train_binned['MERGED'].fillna(''))

print("Transforming test data...")
X_test_text = vectorizer.transform(test_binned['MERGED'].fillna(''))

# The check mark was mis-encoded (UTF-8 bytes read as cp1252) in the original string.
print(f"\n✓ TF-IDF complete!")
print(f"  Train shape: {X_train_text.shape}")
print(f"  Test shape: {X_test_text.shape}")
print(f"  Vocabulary: {len(vectorizer.vocabulary_)} features")
print("="*70)

In [None]:

print("STEP: PREPARE FEATURES")
if USE_NUMERICAL_FEATURES:
    print("\nAdding numerical features...")
    numerical_features = ['text_length', 'word_count', 'avg_word_length']

    # Scale with statistics from the training split only.
    scaler = StandardScaler()
    X_train_numerical = scaler.fit_transform(train_binned[numerical_features].fillna(0))
    X_test_numerical = scaler.transform(test_binned[numerical_features].fillna(0))

    # Append the scaled dense columns to the sparse TF-IDF matrix.
    from scipy.sparse import hstack
    X_train = hstack([X_train_text, X_train_numerical])
    X_test = hstack([X_test_text, X_test_numerical])

    # The check mark was mis-encoded in the original string.
    print(f"  ✓ Combined features, shape: {X_train.shape}")
else:
    print("\nUsing TF-IDF only")
    X_train = X_train_text
    X_test = X_test_text
    scaler = None

# Classification target: the first year of each record's bin.
y_train = train_binned['year_bin'].values
y_test = test_binned['year_bin'].values

print(f"\nFeatures ready:")
print(f"  Train: {X_train.shape}")
print(f"  Test: {X_test.shape}")
print(f"  Classes: {len(np.unique(y_train))}")
print("="*70)

In [None]:


# Build the classifier selected in the configuration cell.
if MODEL_TYPE == 'logistic':
    model = LogisticRegression(
        max_iter=1000,
        random_state=42,
        n_jobs=-1,
        C=1.0,
        solver='lbfgs',
        verbose=1
    )
elif MODEL_TYPE == 'random_forest':
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        random_state=42,
        n_jobs=-1,
        verbose=1
    )
else:
    # Without this branch a typo in MODEL_TYPE would silently reuse whatever
    # `model` is left in the kernel, or fail later with a NameError.
    raise ValueError(
        f"Unknown MODEL_TYPE {MODEL_TYPE!r}; expected 'logistic' or 'random_forest'"
    )

# The emoji and check mark were mis-encoded in the original strings.
print(f"\n🔄 Training {MODEL_TYPE} model...")
model.fit(X_train, y_train)
print("✓ Training complete!")

# Accuracy on the training data itself (optimistic; the test metrics come later).
train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_pred)
print(f"\nTraining Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print("="*70)

In [None]:
# ============================================================================
# CELL 10: SAVE MODEL
# ============================================================================

print("="*70)
print("STEP: SAVE MODEL")
print("="*70)

# Bundle the settings needed to interpret the predictions with the model.
model_info = {
    'model': model,
    'model_type': MODEL_TYPE,
    'bin_size': BIN_SIZE,
    'use_numerical': USE_NUMERICAL_FEATURES
}

# The check marks below were mis-encoded in the original strings.
with open(MODEL_SAVE_PATH, 'wb') as f:
    pickle.dump(model_info, f)
print(f"✓ Model saved: {MODEL_SAVE_PATH}")

with open(VECTORIZER_SAVE_PATH, 'wb') as f:
    pickle.dump(vectorizer, f)
print(f"✓ Vectorizer saved: {VECTORIZER_SAVE_PATH}")

# Explicit None check instead of relying on the truthiness of the scaler object.
if USE_NUMERICAL_FEATURES and scaler is not None:
    with open(SCALER_SAVE_PATH, 'wb') as f:
        pickle.dump(scaler, f)
    print(f"✓ Scaler saved: {SCALER_SAVE_PATH}")

print("="*70)

In [None]:
# ============================================================================
# CELL 11: PREDICT
# ============================================================================

print("="*70)
print("STEP: PREDICT")
print("="*70)

# The emoji and check mark were mis-encoded in the original strings.
print("\n🔄 Making predictions...")
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

# Attach the predictions to the test frame.
test_binned['predicted_year_bin'] = y_pred
test_binned['predicted_year_bin_label'] = (y_pred.astype(int).astype(str) + '-' +
                                           (y_pred + BIN_SIZE - 1).astype(int).astype(str))
# Confidence = probability of the most likely bin.
test_binned['prediction_confidence'] = y_pred_proba.max(axis=1)
# Point estimate for the year: bin start plus half the bin width.
test_binned['predicted_year'] = y_pred + (BIN_SIZE / 2)
test_binned['error_years'] = abs(test_binned['production_date'] - test_binned['predicted_year'])

print(f"✓ Predictions complete: {len(y_pred)} samples")
print("="*70)

In [None]:
# Classification metrics
test_accuracy = accuracy_score(y_test, y_pred)
exact_match = (y_test == y_pred).sum()
# Bin starts are multiples of BIN_SIZE, so a difference of at most BIN_SIZE
# means the same bin or a directly neighbouring one.
within_one = (np.abs(y_test - y_pred) <= BIN_SIZE).sum()

# The "±" and "²" characters below were mis-encoded in the original strings.
print(f"\nClassification Metrics:")
print(f"  Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"  Exact match: {exact_match}/{len(y_test)} ({exact_match/len(y_test)*100:.2f}%)")
print(f"  Within ±{BIN_SIZE}y: {within_one}/{len(y_test)} ({within_one/len(y_test)*100:.2f}%)")

# Regression metrics: bin-derived year estimate against the actual year
mae = mean_absolute_error(test_binned['production_date'], test_binned['predicted_year'])
rmse = np.sqrt(mean_squared_error(test_binned['production_date'], test_binned['predicted_year']))
r2 = r2_score(test_binned['production_date'], test_binned['predicted_year'])

print(f"\nRegression Metrics:")
print(f"  MAE: {mae:.2f} years")
print(f"  RMSE: {rmse:.2f} years")
print(f"  R²: {r2:.4f}")

print(f"\nConfidence:")
print(f"  Mean: {test_binned['prediction_confidence'].mean():.3f}")
print(f"  High (>0.8): {(test_binned['prediction_confidence'] > 0.8).sum()} ({(test_binned['prediction_confidence'] > 0.8).sum()/len(test_binned)*100:.1f}%)")


In [None]:
# Evaluation metrics and visualizations
print("="*70)
print("EVALUATION METRICS")
print("="*70)

# Calculate metrics
test_accuracy = accuracy_score(y_test, y_pred)
exact_match = (y_test == y_pred).sum()
# Same bin or a directly neighbouring bin.
within_bin = (np.abs(y_test - y_pred) <= BIN_SIZE).sum()

mae = mean_absolute_error(test_binned['production_date'], test_binned['predicted_year'])
rmse = np.sqrt(mean_squared_error(test_binned['production_date'], test_binned['predicted_year']))
r2 = r2_score(test_binned['production_date'], test_binned['predicted_year'])

# The "±" and "²" characters below were mis-encoded in the original strings.
print(f"\nClassification Metrics:")
print(f"  Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print(f"  Exact match: {exact_match}/{len(y_test)} ({exact_match/len(y_test)*100:.2f}%)")
print(f"  Within ±{BIN_SIZE}y: {within_bin}/{len(y_test)} ({within_bin/len(y_test)*100:.2f}%)")

print(f"\nRegression Metrics:")
print(f"  MAE: {mae:.2f} years")
print(f"  RMSE: {rmse:.2f} years")
print(f"  R²: {r2:.4f}")

print(f"\nConfidence:")
print(f"  Mean: {test_binned['prediction_confidence'].mean():.3f}")
print(f"  High (>0.8): {(test_binned['prediction_confidence'] > 0.8).sum()} ({(test_binned['prediction_confidence'] > 0.8).sum()/len(test_binned)*100:.1f}%)")

print("="*70)

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Actual vs Predicted
axes[0, 0].scatter(test_binned['production_date'], test_binned['predicted_year'], alpha=0.3, s=10)
axes[0, 0].plot([test_binned['production_date'].min(), test_binned['production_date'].max()],
                [test_binned['production_date'].min(), test_binned['production_date'].max()],
                'r--', lw=2, label='Perfect prediction')
axes[0, 0].set_xlabel('Actual Year')
axes[0, 0].set_ylabel('Predicted Year')
axes[0, 0].set_title('Actual vs Predicted Years')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Error distribution
axes[0, 1].hist(test_binned['error_years'], bins=50, edgecolor='black', alpha=0.7)
axes[0, 1].axvline(mae, color='r', linestyle='--', linewidth=2, label=f'MAE: {mae:.2f}')
axes[0, 1].set_xlabel('Absolute Error (years)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Distribution of Prediction Errors')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Confidence distribution
axes[1, 0].hist(test_binned['prediction_confidence'], bins=30, edgecolor='black', alpha=0.7, color='green')
axes[1, 0].set_xlabel('Confidence Score')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Distribution of Prediction Confidence')
axes[1, 0].grid(True, alpha=0.3)

# 4. Bin distribution
# The panel is titled "Predictions by Year Bin", so count the predicted labels;
# the previous version counted the actual labels ('year_bin_label').
bin_counts = test_binned['predicted_year_bin_label'].value_counts().sort_index()
axes[1, 1].bar(range(len(bin_counts)), bin_counts.values, alpha=0.7, color='orange')
axes[1, 1].set_xticks(range(len(bin_counts)))
axes[1, 1].set_xticklabels(bin_counts.index, rotation=45, ha='right')
axes[1, 1].set_ylabel('Count')
axes[1, 1].set_title('Distribution of Predictions by Year Bin')
axes[1, 1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print(f"\nTop predictions by confidence:")
print(test_binned[['production_date', 'predicted_year', 'prediction_confidence', 'error_years']].nlargest(10, 'prediction_confidence'))

In [None]:
# Distribution of the production year in the raw training split.
year_numeric = pd.to_numeric(train["production_date"], errors="coerce")

# The previous expression, int(year_numeric.min() > 1000), converted a boolean
# and therefore always reported 0 or 1. Report the smallest year above 1000.
plausible_years = year_numeric[year_numeric > 1000]
if plausible_years.empty:
    oldest_year = None
    print("No production year above 1000 found in the training set")
else:
    oldest_year = int(plausible_years.min())
    print(f"The oldest production year in the training set is: {oldest_year}")

# Restrict the histogram to the year range used for modelling.
year_filtered = year_numeric[year_numeric.between(1465, 2025)]
print(year_filtered.max())
plt.figure(figsize=(10, 5))
sns.histplot(year_filtered.dropna(), bins=80, kde=True, color="steelblue")
plt.title("Production year distribution (1465 to 2025)")
plt.xlabel("Year")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

## REGRESSION

In [None]:

# Configuration for the exact-year regression baseline.
print("TF-IDF REGRESSION PIPELINE - EXACT YEAR PREDICTION")


# Inclusive year range kept for training and evaluation.
MIN_YEAR = 1465
MAX_YEAR = 2025
# These two names were already set in the classification config cell; from here
# on they point at the regression artefacts.
MODEL_SAVE_PATH = "./tfidf_regression_model.pkl"
VECTORIZER_SAVE_PATH = "./tfidf_regression_vectorizer.pkl"

print(f"Year range: {MIN_YEAR}-{MAX_YEAR}")


In [None]:

print("STEP: PREPROCESS FOR REGRESSION")

def preprocess_for_regression(df, min_year=1465, max_year=2025):
    """
    Prepare a frame for exact-year regression.

    ``production_date`` is coerced to numeric, missing values are filled with
    the column median, and rows outside ``[min_year, max_year]`` are dropped.

    Args:
        df: DataFrame with a ``production_date`` column; it is not modified.
        min_year: lowest year kept (inclusive).
        max_year: highest year kept (inclusive).

    Returns:
        A new, independent DataFrame containing the rows that were kept, with
        their original index labels.
    """
    df = df.copy()

    # Same coercion as create_merged_column_matching_train, so string years do
    # not break the median or the range comparison.
    df['production_date'] = pd.to_numeric(df['production_date'], errors='coerce')

    # Handle missing dates
    missing = df['production_date'].isna().sum()
    if missing > 0:
        median = df['production_date'].median()
        df['production_date'] = df['production_date'].fillna(median)
        print(f"  Filled {missing} missing dates with median: {median:.0f}")

    # Filter year range. .copy() detaches the result so that later cells can
    # add prediction columns without SettingWithCopyWarning.
    before = len(df)
    df = df[df['production_date'].between(min_year, max_year)].copy()
    removed = before - len(df)
    # Guard the percentage against an empty input (previously ZeroDivisionError).
    removed_pct = removed / before * 100 if before else 0.0
    print(f"  Filtered {min_year}-{max_year}: removed {removed} rows ({removed_pct:.2f}%)")
    print(f"  Final: {len(df)} samples, years {df['production_date'].min():.0f}-{df['production_date'].max():.0f}")

    return df

# Apply the regression preprocessing to both splits.
train_reg = preprocess_for_regression(merged_train, MIN_YEAR, MAX_YEAR)
test_reg = preprocess_for_regression(test, MIN_YEAR, MAX_YEAR)

# The check mark was mis-encoded in the original string.
print(f"\n✓ Train: {train_reg.shape}, Test: {test_reg.shape}")


In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Uni- and bi-gram TF-IDF for the regression model. This replaces the
# classification `vectorizer`, `X_train`, `X_test`, `y_train` and `y_test`,
# so the classification evaluation cells must not be re-run after this cell.
vectorizer = TfidfVectorizer(
    max_features=10000,
    min_df=3,
    max_df=0.8,
    ngram_range=(1, 2),
    stop_words='english',
    sublinear_tf=True
)

print("Fitting TF-IDF...")
X_train = vectorizer.fit_transform(train_reg['MERGED'].fillna(''))
X_test = vectorizer.transform(test_reg['MERGED'].fillna(''))

# Regression target: the production year itself.
y_train = train_reg['production_date'].values
y_test = test_reg['production_date'].values

# The check mark was mis-encoded in the original string.
print(f"✓ Train: {X_train.shape}, Test: {X_test.shape}")
print("="*70)

In [None]:
# ============================================================================
# CELL 4: TRAIN REGRESSION MODEL
# ============================================================================

from sklearn.linear_model import Ridge

# L2-regularised linear regression on the TF-IDF features.
# This replaces the classification `model` defined earlier.
model = Ridge(alpha=1.0, random_state=42)

print("Training...")
model.fit(X_train, y_train)

# Mean absolute error on the training data itself (optimistic).
train_pred = model.predict(X_train)
train_mae = np.abs(train_pred - y_train).mean()

# The check mark was mis-encoded in the original string.
print(f"✓ Training MAE: {train_mae:.2f} years")


In [None]:

import pickle

# Persist the fitted regressor and its vectorizer to the regression paths.
# The check marks were mis-encoded in the original strings.
with open(MODEL_SAVE_PATH, 'wb') as f:
    pickle.dump(model, f)
print(f"✓ Model: {MODEL_SAVE_PATH}")

with open(VECTORIZER_SAVE_PATH, 'wb') as f:
    pickle.dump(vectorizer, f)
print(f"✓ Vectorizer: {VECTORIZER_SAVE_PATH}")


In [None]:


# Predict the year for every test row and record the absolute error.
y_pred = model.predict(X_test)

test_reg['predicted_year_regression'] = y_pred
test_reg['error_regression'] = np.abs(test_reg['production_date'] - y_pred)

# The check mark was mis-encoded in the original string.
print(f"✓ Predictions: {len(y_pred)} samples")


In [None]:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Summary metrics for the exact-year predictions.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
median_ae = np.median(np.abs(y_test - y_pred))

# The "²" character was mis-encoded in the original string.
print(f"\nMetrics:")
print(f"  MAE:    {mae:.2f} years")
print(f"  Median: {median_ae:.2f} years")
print(f"  RMSE:   {rmse:.2f} years")
print(f"  R²:     {r2:.4f}")

# Number and share of test rows in each absolute-error band.
abs_error = test_reg['error_regression']
error_bands = (
    ("<15y: ", abs_error < 15),
    ("<30y: ", abs_error < 30),
    ("<50y: ", abs_error < 50),
    (">100y:", abs_error > 100),
)
print(f"\nError Distribution:")
for band_label, in_band in error_bands:
    print(f"  {band_label} {in_band.sum()} ({in_band.mean()*100:.1f}%)")


In [None]:
# ============================================================================
# CELL 8: SAMPLE RESULTS
# ============================================================================

print("="*70)
print("SAMPLE PREDICTIONS")
print("="*70)

# First 15 test rows: text shortened to 70 characters, numbers rounded to one decimal.
sample_columns = ['MERGED', 'production_date', 'predicted_year_regression', 'error_regression']
sample = test_reg[sample_columns].head(15).assign(
    MERGED=lambda rows: rows['MERGED'].str[:70] + '...',
    predicted_year_regression=lambda rows: rows['predicted_year_regression'].round(1),
    error_regression=lambda rows: rows['error_regression'].round(1),
)

print("\n" + sample.to_string(index=False))
print("="*70)

In [None]:
# Quick sanity check of the target column in the training frame.
train_dates = merged_train['production_date']

print("\n=== TRAINING DATA DIAGNOSTICS ===")
print(train_dates.describe())
print(f"NaN count: {train_dates.isna().sum()}")
print(f"Data type: {train_dates.dtype}")
print(f"Sample values: {train_dates.head()}")