In [1]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def preprocess_for_transformer(file_path):
    # Load the dataset
    df = pd.read_csv(file_path, delimiter='|')
    
    # Handle missing values
    df.fillna(df.mean(), inplace=True)
    
    # Drop unwanted columns
    df.drop(columns=['period_duration', 'city_y', 'date', 'merge_key', 'region_type_id'], inplace=True)
    
    # Rename 'city_x' to 'city'
    df.rename(columns={'city_x': 'city'}, inplace=True)
    
    # Convert 'period_begin' to datetime and sort
    df['period_begin'] = pd.to_datetime(df['period_begin'])
    df.sort_values(by='period_begin', inplace=True)
    
    # Extract year and month from 'period_begin'
    df['year'] = df['period_begin'].dt.year
    df['month'] = df['period_begin'].dt.month
    
    # One-hot encode categorical variables
    categorical_columns = ['city', 'property_type', 'region', 'division', 'state', 'measurement']
    df = pd.get_dummies(df, columns=categorical_columns)
    
    # Normalize the numerical features
    numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
    scaler = StandardScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    
    # Extract the target variable and convert the DataFrame to a 2D array
    target = df['median_sale_price'].values
    df.drop(columns=['median_sale_price', 'period_begin'], inplace=True)
    features = df.values
    
    return features, target




In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# For our purposes, we might use a model like DistilBERT or BERT, which has been pretrained on text data.
# We will not use the tokenizer since we're dealing with numerical data, not text.
model_name = "distilbert-base-uncased" 
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)  # We have one regression target


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from torch.utils.data import DataLoader, Dataset
import torch

class TimeSeriesDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return {
            'input': torch.tensor(self.features[idx], dtype=torch.float),
            'label': torch.tensor(self.targets[idx], dtype=torch.float)
        }

# Load and preprocess the training and validation data
train_features, train_targets = preprocess_for_transformer('train_housing.csv')
val_features, val_targets = preprocess_for_transformer('validation_housing.csv')

# Create the datasets
train_dataset = TimeSeriesDataset(train_features, train_targets)
val_dataset = TimeSeriesDataset(val_features, val_targets)

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


  df.fillna(df.mean(), inplace=True)
  df.fillna(df.mean(), inplace=True)


In [None]:
from transformers import AdamW
from tqdm import tqdm

# Define the loss function and optimizer
loss_fn = torch.nn.MSELoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define the training and evaluation steps
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model.train()
    losses = []
    for batch in tqdm(data_loader):
        inputs, targets = batch['input'].to(device), batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs).logits.squeeze(-1)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return np.mean(losses)

def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    losses = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            inputs, targets = batch['input'].to(device), batch['label'].to(device)
            outputs = model(inputs).logits.squeeze(-1)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())
    return np.mean(losses)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device)
    val_loss = eval_model(model, val_loader, loss_fn, device)
    print(f'Train loss: {train_loss}, Val loss: {val_loss}')


In [1]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def preprocess_for_transformer(file_path):
    # Load the dataset
    df = pd.read_csv(file_path, delimiter='|')
    
    # Handle missing values
    df.fillna(df.mean(), inplace=True)
    
    # Drop unwanted columns
    df.drop(columns=['period_duration', 'city_y', 'date', 'merge_key', 'region_type_id'], inplace=True)
    
    # Rename 'city_x' to 'city'
    df.rename(columns={'city_x': 'city'}, inplace=True)
    
    # Convert 'period_begin' to datetime and sort
    df['period_begin'] = pd.to_datetime(df['period_begin'])
    df.sort_values(by='period_begin', inplace=True)
    
    # Extract year and month from 'period_begin'
    df['year'] = df['period_begin'].dt.year
    df['month'] = df['period_begin'].dt.month
    
    # One-hot encode categorical variables
    categorical_columns = ['city', 'property_type', 'region', 'division', 'state', 'measurement']
    df = pd.get_dummies(df, columns=categorical_columns)
    
    # Normalize the numerical features
    numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
    scaler = StandardScaler()
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    
    # Extract the target variable and convert the DataFrame to a 2D array
    target = df['median_sale_price'].values
    df.drop(columns=['median_sale_price', 'period_begin'], inplace=True)
    features = df.values
    
    return features, target




In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# For our purposes, we might use a model like DistilBERT or BERT, which has been pretrained on text data.
# We will not use the tokenizer since we're dealing with numerical data, not text.
model_name = "distilbert-base-uncased" 
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)  # We have one regression target


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from torch.utils.data import DataLoader, Dataset
import torch

class TimeSeriesDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return {
            'input': torch.tensor(self.features[idx], dtype=torch.float),
            'label': torch.tensor(self.targets[idx], dtype=torch.float)
        }

# Load and preprocess the training and validation data
train_features, train_targets = preprocess_for_transformer('train_housing.csv')
val_features, val_targets = preprocess_for_transformer('validation_housing.csv')

# Create the datasets
train_dataset = TimeSeriesDataset(train_features, train_targets)
val_dataset = TimeSeriesDataset(val_features, val_targets)

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


  df.fillna(df.mean(), inplace=True)
  df.fillna(df.mean(), inplace=True)


In [4]:
from transformers import AdamW
from tqdm import tqdm

# Define the loss function and optimizer
loss_fn = torch.nn.MSELoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define the training and evaluation steps
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model.train()
    losses = []
    for batch in tqdm(data_loader):
        inputs, targets = batch['input'].to(device), batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs).logits.squeeze(-1)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return np.mean(losses)

def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    losses = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            inputs, targets = batch['input'].to(device), batch['label'].to(device)
            outputs = model(inputs).logits.squeeze(-1)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())
    return np.mean(losses)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device)
    val_loss = eval_model(model, val_loader, loss_fn, device)
    print(f'Train loss: {train_loss}, Val loss: {val_loss}')




Epoch 1/3


  0%|          | 0/5804 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
  0%|          | 0/5804 [00:00<?, ?it/s]


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)