In [None]:
! pip install tabpfn #pip install "tabpfn @ git+https://github.com/PriorLabs/TabPFN.git"

# Setting Up Data Science Environment for Regression


This code imports essential tools for working with public datasets and evaluating regression models:

### Data Acquisition
- `fetch_openml`: Fetches datasets from the OpenML repository, which hosts thousands of public datasets
  - Unlike fixed scikit-learn datasets, this allows access to a wide variety of real-world data
  - Datasets can be specified by name or ID number
  - Example: `fetch_openml(name='boston', version=1)` or `fetch_openml(data_id=42)`

### Model Evaluation
- `mean_squared_error`: Calculates the average squared difference between predicted and actual values
  - Lower values indicate better model performance
  - Formula: MSE = (1/n) * Σ(y_true - y_pred)²
  - Units are squared units of the target variable

- `r2_score`: Coefficient of determination (R²)
  - Measures the proportion of variance in the dependent variable predictable from the independent variables
  - Range: (-∞, 1]; typically between 0 and 1 for a reasonable model, where 1 indicates perfect prediction
  - Can be negative if the model is worse than a horizontal line

### Data Splitting
- `train_test_split`: Divides datasets into random training and testing subsets
  - Essential for proper model validation
  - Typical usage: `X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)`
  - Parameters control split ratio, stratification, and randomization

These tools are commonly used together in a regression workflow to fetch datasets, split the data appropriately, and evaluate model performance using standard metrics.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


from sklearn.datasets import fetch_openml
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Notebook-wide display settings: never truncate DataFrame columns when
# printing, and draw every figure with seaborn's white-grid style.
pd.options.display.max_columns = None
sns.set(style="whitegrid")

# Load the Boston Housing dataset

In [None]:
# OpenML dataset 531 is the Boston Housing data; as_frame=True asks for
# pandas objects rather than numpy arrays.
data_dict = fetch_openml(
    data_id=531,
    as_frame=True,
)

# Get the feature names

In [None]:
# Column labels of the feature frame, kept as a plain Python list
feature_names = data_dict.data.columns.tolist()

# Create a DataFrame for easier exploration

In [None]:
# Reuse the dataset that was already fetched into `data_dict` above instead of
# requesting it from OpenML a second time.
X = data_dict.data
y = data_dict.target.astype(float)  # Ensure target is float for regression

# Exploration frame = all feature columns plus the target as 'MEDV'.
# assign() returns a new DataFrame, so X itself is left untouched and this
# cell can be re-run safely.
df = X.assign(MEDV=y)


# Basic dataset information

In [None]:
# Size of the exploration frame as (rows, columns)
print("Dataset Shape:", df.shape)

# Numbered list of the feature columns, counting from 1
print("\nFeature Names:")
for position, column in enumerate(feature_names, start=1):
    print(f"{position}. {column}")

# Quick look at the first records
print("\nFirst 5 rows:", df.head(), sep="\n")

# Statistical summary

In [None]:
# Descriptive statistics, transposed so that each column of df becomes a row
summary_table = df.describe().transpose()
print("\nStatistical Summary:", summary_table, sep="\n")

# Check for missing values

In [None]:
# Number of null entries per column
missing_values = df.isnull().sum()

print("\nMissing Values:")
if missing_values.sum() > 0:
    # Report only the columns that actually contain nulls
    print(missing_values[missing_values > 0])
else:
    print("No missing values")


# Target variable distribution

In [None]:
# Histogram of the target with a KDE overlay, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['MEDV'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of House Prices (MEDV)')
ax.set_xlabel('Median Value of Homes ($1000s)')
ax.set_ylabel('Frequency')
ax.grid(True, alpha=0.3)
plt.show()

# Print target statistics

In [None]:
# Descriptive statistics of the target column only
target_stats = df['MEDV'].describe()
print("\nTarget (MEDV) Statistics:", target_stats, sep="\n")

# Correlations with target

In [None]:
# Correlation of every column with the target, sorted from the largest
# (MEDV with itself) down to the most negative value.
correlations = df.corr().loc[:, 'MEDV'].sort_values(ascending=False)
print("\nCorrelations with Target (MEDV):", correlations, sep="\n")

# Visualize correlations

In [None]:
# Annotated heatmap of the full pairwise correlation matrix
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
# Tilt the column labels and keep the row labels horizontal
plt.setp(ax.get_xticklabels(), rotation=45)
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()

# Top correlated features with target

In [None]:
# Pick the five features most strongly related to MEDV. Ranking uses the
# absolute correlation so that strong negative relationships are kept;
# a signed descending sort would only surface the largest positive values.
# MEDV itself (correlation 1.0) is dropped before ranking.
top_corr_features = (
    correlations.drop('MEDV')
    .abs()
    .sort_values(ascending=False)
    .index[:5]
)

plt.figure(figsize=(15, 10))
for i, feature in enumerate(top_corr_features):
    plt.subplot(2, 3, i+1)
    plt.scatter(df[feature], df['MEDV'], alpha=0.6)
    # The title reports the signed correlation so the direction stays visible
    plt.title(f'MEDV vs {feature} (corr: {correlations[feature]:.2f})')
    plt.xlabel(feature)
    plt.ylabel('MEDV')
    plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Pairplot of important features

In [None]:
# Pairwise scatter plots of the selected features together with the target.
important_features = list(top_corr_features) + ['MEDV']
# sns.pairplot builds its own figure, so no plt.figure() call is made first;
# a preceding plt.figure() would only leave an empty figure in the output.
sns.pairplot(df[important_features], height=2.5)
# The pairplot figure is the current figure here, so suptitle lands on it
plt.suptitle('Pairplot of Important Features', y=1.02)
plt.show()

# Check for Skewness

In [None]:
# Skewness of every numeric column, sorted from largest to smallest
df_numeric = df.select_dtypes(include=['number'])
skewness = df_numeric.skew().sort_values(ascending=False)
print("\nSkewness of Features:", skewness, sep="\n")

# Bar chart of the same values, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
skewness.plot.bar(ax=ax)
ax.set_title('Skewness of Features')
ax.grid(True, alpha=0.3)
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# Outlier detection

In [None]:
plt.figure(figsize=(15, 10))
# Only the first nine numeric columns are drawn, one per cell of the 3x3 grid
for panel, feature in enumerate(df_numeric.columns[:9], start=1):
    plt.subplot(3, 3, panel)
    sns.boxplot(y=df[feature])
    plt.title(f'Boxplot of {feature}')
plt.tight_layout()
plt.show()

# Distribution of important feature variables

In [None]:
plt.figure(figsize=(15, 10))
# Only the first nine features are drawn, one per cell of the 3x3 grid
for panel, feature in enumerate(feature_names[:9], start=1):
    plt.subplot(3, 3, panel)
    sns.histplot(df[feature], kde=True)
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()


# Using TabPFN for Regression on Boston Housing Data

This code sets up a regression task using TabPFN on the Boston Housing dataset:

### TabPFN for Regression
- `TabPFNRegressor`: A specialized regression variant of TabPFN (Tabular Prior-Data Fitted Network)
  - Unlike the classifier version, this is designed for predicting continuous values
  - Leverages the same transformer-based architecture with pre-training on synthetic tabular data
  - Typically requires minimal hyperparameter tuning for good performance

### Data Loading
- `fetch_openml(data_id=531, as_frame=True)`: Retrieves the Boston Housing dataset from OpenML
  - `data_id=531`: Specifies the Boston Housing dataset by its unique identifier
  - `as_frame=True`: Returns the data as a pandas DataFrame instead of a numpy array
  - This dataset contains information about housing in Boston suburbs and is a classic benchmark for regression

### Data Preparation
- `X`: The feature matrix, taken from the `.data` attribute of the object returned by `fetch_openml`
  - Contains attributes like CRIM (crime rate), ZN (proportion of residential land), INDUS (proportion of non-retail business acres), etc.
- `y`: The target variable, taken from the `.target` attribute and converted with `.astype(float)` to ensure it's in float format
  - The target is MEDV (Median value of owner-occupied homes in $1000s)
  - Converting to float is important for regression tasks to ensure proper calculations

This setup prepares for applying TabPFN's regression capabilities to predict housing prices based on neighborhood characteristics.

In [None]:
from tabpfn import TabPFNRegressor

# Load Boston Housing data for modelling.
# The fetched object is stored under its own name so that the exploration
# DataFrame `df` built earlier is not overwritten by a non-DataFrame object,
# which would break the earlier cells if they were run again.
boston = fetch_openml(data_id=531, as_frame=True)  # Boston Housing dataset
X = boston.data
y = boston.target.astype(float)  # Ensure target is float for regression

# Splitting Data for Regression Analysis


### Function Parameters
- `X`: Feature matrix containing housing attributes (like crime rate, number of rooms, etc.)
- `y`: Target vector containing housing prices (MEDV)
- `test_size=0.5`: Allocates 50% of the data for testing and 50% for training
  - This is a larger test set than typical (usually 20-30%)
  - A 50/50 split provides ample data for both training and thorough evaluation
- `random_state=42`: Sets a specific random seed for reproducibility
  - Ensures the same split will occur each time the code runs
  - The value 42 is commonly used (a reference to "The Hitchhiker's Guide to the Galaxy")

### Resulting Datasets
- `X_train`: Features for training the model (approximately 253 samples)
- `X_test`: Features for evaluating the model (approximately 253 samples)
- `y_train`: Housing prices for the training set
- `y_test`: Housing prices for the test set

### Purpose
This data split allows you to:
1. Train the TabPFN regressor on one subset of the data
2. Test its performance on unseen data to evaluate generalization
3. Get a realistic estimate of how the model would perform on new housing data

The equal split between training and testing provides a balanced assessment of the model's predictive capabilities for this regression task.

In [None]:
# Hold out half of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# Training the TabPFN Regressor

This code initializes and trains a TabPFN regressor on the Boston Housing dataset:

### Model Initialization
- `TabPFNRegressor()`: Creates an instance of TabPFN's regression model with default parameters
  - Unlike traditional regression models, TabPFN comes pre-trained on synthetic tabular data
  - The default configuration typically works well without extensive hyperparameter tuning
  - Optional parameters include `n_estimators` (number of ensemble members) and `device` (CPU/GPU)

### Model Training
- `regressor.fit(X_train, y_train)`: Adapts the regressor to the housing price prediction task
  - `X_train`: Feature matrix containing ~253 samples with housing attributes
  - `y_train`: Target vector containing corresponding housing prices (MEDV values)
  - Fitting is typically much faster than training a traditional model because no gradient-based training takes place: TabPFN relies on in-context learning
  - The pre-trained network conditions on the Boston Housing training samples when predicting, rather than updating its weights on them

### What Happens During Fitting
During the `fit()` operation, TabPFN:
1. Normalizes the input features internally
2. Adapts its pre-trained transformer architecture to the regression task
3. Optimizes for continuous value prediction rather than classification
4. May create an ensemble of models if configured to do so

TabPFN's approach is especially advantageous for tabular regression tasks like housing price prediction, often achieving competitive performance with minimal configuration and training time.

In [None]:
# Create the model with its default settings, then fit it on the training split
regressor = TabPFNRegressor()
regressor.fit(X=X_train, y=y_train)

# Making Predictions with TabPFN Regressor


This code generates housing price predictions using the trained TabPFN regressor:

### Prediction Process
- `regressor.predict(X_test)`: Applies the trained model to make predictions on unseen test data
  - Takes the feature matrix `X_test` as input
  - Returns an array of predicted housing prices (MEDV values in $1000s)
  - No probability values are returned since this is a regression task (unlike classification)

### Behind the Scenes
When you call `predict()`, TabPFN:
1. Processes the input features through its transformer-based architecture
2. Converts the network outputs to continuous values appropriate for the regression task
3. Returns point estimates for each sample in the test set

### Next Steps After Prediction
After generating these predictions, you would typically:
- Evaluate model performance using metrics like MSE, RMSE, or R²
- Compare the predictions against actual values (y_test)
- Visualize the predictions vs. actual values to identify patterns or areas for improvement

TabPFN's unique approach often leads to competitive regression results with minimal configuration, making it an excellent choice for tabular regression tasks like housing price prediction.

In [None]:
# Predicted MEDV values for the held-out rows
predictions = regressor.predict(X=X_test)

In [None]:
# Preview the first few predictions instead of dumping the whole array
print(predictions[:10])

# Evaluating the TabPFN Regressor


This code assesses the performance of the TabPFN regressor using two standard regression metrics:

### Mean Squared Error (MSE)
- `mean_squared_error(y_test, predictions)`: Calculates the average squared difference between predicted and actual housing prices
  - Formula: MSE = (1/n) * Σ(y_true - y_pred)²
  - Lower values indicate better model performance
  - Units are squared dollars (in thousands), making it scale-dependent
  - Penalizes larger errors more heavily due to the squaring operation
  - Useful for comparing models on the same dataset

### R² Score (Coefficient of Determination)
- `r2_score(y_test, predictions)`: Measures the proportion of variance in housing prices that the model explains
  - Formula: R² = 1 - (Σ(y_true - y_pred)² / Σ(y_true - y_mean)²)
  - Scale-free metric ranging from -∞ to 1
  - R² = 1 indicates perfect prediction
  - R² = 0 indicates the model performs no better than simply predicting the mean value
  - R² < 0 indicates the model performs worse than predicting the mean
  - Generally, R² > 0.7 is considered good for real estate price prediction

### Interpretation
- These metrics together provide complementary insights:
  - MSE gives an absolute measure of prediction error in squared units
  - R² provides a relative measure of how well the model captures the variance in housing prices
  - A good model should have low MSE and high R² values

For the Boston Housing dataset, these metrics help assess how accurately TabPFN can predict home values based on neighborhood characteristics.

In [None]:
# Score the held-out predictions with both metrics, then report them
mse, r2 = (
    metric(y_test, predictions)
    for metric in (mean_squared_error, r2_score)
)
for label, value in (("Mean Squared Error (MSE):", mse), ("R² Score:", r2)):
    print(label, value)