In [1]:
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

In [2]:
df = pd.read_csv("car_prices.csv", chunksize=10000)

cat_columns = [
    'make', 
    'model', 
    'trim', 
    'body', 
    'transmission', 
    'state', 
    'color', 
    'interior', 
]

numeric_column = [
    'year', 
    'condition', 
    'odometer', 
    'mmr', 
    'sellingprice',
    'days_since_sale'
]

# drop a column




In [3]:
def one_hot_encode(df, columns):
    """
    One-hot encodes specified columns in a pandas DataFrame.
    """
    for column in columns:
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df, dummies], axis=1)
        df.drop(column, axis=1, inplace=True)
    return df

def normalize(df, columns):
    """
    Normalizes specified columns in a pandas DataFrame between 0 and 1.
    """
    for column in columns:
        df[column] = (df[column] - df[column].min()) / (df[column].max() - df[column].min())
    return df

In [4]:




import pandas as pd

total = 0

chunksize = 10000
one_hot_encoded_data = pd.DataFrame()

for chunk in df:

    # Drop the 'vin' column from the chunk
    chunk.drop('vin', axis=1, inplace=True)

    # Convert the 'saledate' column to datetime with timezone
    chunk['saledate'] = pd.to_datetime(chunk['saledate'], utc=True, errors='coerce')

    # Remove the timezone information
    chunk['saledate'] = chunk['saledate'].dt.tz_localize(None)
    
    # Calculate the time since the sale using timezone-naive current datetime
    chunk['days_since_sale'] = pd.Timestamp.now(tz=None) - chunk['saledate']

    # Convert the timedelta to days
    chunk['days_since_sale'] = chunk['days_since_sale'].dt.days

    # Drop the 'saledate' column
    chunk.drop('saledate', axis=1, inplace=True)
    chunk.drop('seller', axis=1, inplace=True)

    # Continue with the rest of the processing
    
    chunk = normalize(chunk, numeric_column)
    chunk = one_hot_encode(chunk, cat_columns)
    
    chunk.fillna(0.0, inplace=True)
    
    one_hot_encoded_data = pd.concat([one_hot_encoded_data, chunk], ignore_index=True)
    
    # print dimensions of the one-hot encoded data
    
    total += len(chunk)
    
    if total > 100000:
        break

one_hot_encoded_data.fillna(0.0, inplace=True)
print(one_hot_encoded_data)

KeyboardInterrupt: 

In [None]:
training_data = one_hot_encoded_data.sample(frac=0.8, random_state=1)

In [None]:
validation_data = one_hot_encoded_data.drop(training_data.index)

In [None]:
import torch
tensor_x_training = torch.tensor(training_data.drop(columns=['sellingprice'], axis=1).values, dtype=torch.float32)
tensor_y_training = torch.tensor(training_data['sellingprice'].values, dtype=torch.float32)

print(tensor_x_training.shape)
print(tensor_y_training.shape)

KeyboardInterrupt: 

In [None]:
import torch

tensor_x_validation = torch.tensor(validation_data.drop(columns=['sellingprice'], axis=1).values, dtype=torch.float32)
tensor_y_validation = torch.tensor(validation_data['sellingprice'].values, dtype=torch.float32)

In [None]:
import torch.optim as optim
from torch import nn
from torchsummary import summary
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch.optim as optim

parameters = [32, 64, 128]
learning_rate = [0.0001, 0.001, 0.01]
patiences = [0, 1, 2]
epochs = [100, 200, 300]
r2_min = {"r2": -1, "parameter": 0, "lr": 0, "patience": 0, "ep": 0}

for parameter in parameters:
    for lr in learning_rate:
        for patience in patiences:
            for ep in epochs:
                
                print(r2_min)
                # sequential layer
                model = nn.Sequential(

                    nn.Linear(2804, parameter),

                    nn.ReLU(),
                    
                    nn.Linear(parameter, parameter),
                    
                    nn.ReLU(),
                    nn.Linear(parameter, parameter),
                    
                    nn.ReLU(),
                    nn.Linear(parameter, parameter),
                    
                    nn.ReLU(),

                    nn.Linear(parameter, parameter),
                    
                    nn.ReLU(),

                    nn.Linear(parameter, parameter),
                    
                    nn.ReLU(),

                    nn.Linear(parameter, parameter),
                    
                    nn.ReLU(),



                    nn.Linear(parameter, 1),

                )

                criterion = nn.MSELoss()

                optimizer = optim.Adam(model.parameters(), lr=lr)

                best_validation_loss = float('inf')
                counter = 0

                for epoch in range(ep):  # maximum number of epochs
                    optimizer.zero_grad()
                    outputs = model(tensor_x_training)
                    loss = criterion(outputs, tensor_y_training.unsqueeze(1))
                    loss.backward()
                    optimizer.step()

                    # Calculate validation loss
                    with torch.no_grad():
                        validation_outputs = model(tensor_x_validation)
                        validation_loss = criterion(validation_outputs, tensor_y_validation.unsqueeze(1))

                    # Check for early stopping
                    if validation_loss < best_validation_loss:
                        best_validation_loss = validation_loss
                        counter = 0
                    else:
                        counter += 1
                        if counter >= patience:
                            break

                #plt.plot(losses)

                from sklearn.metrics import r2_score

                with torch.no_grad():
                    validation_outputs = model(tensor_x_validation)
                    r2 = r2_score(tensor_y_validation.numpy(), validation_outputs.numpy())

                    if r2 > r2_min['r2']:
                        r2_min['r2'] = r2
                        r2_min['parameter'] = parameter
                        r2_min['lr'] = lr
                        r2_min['patience'] = patience
                        r2_min['ep'] = ep

                #print(f"R^2 Score: {r2}")
        
print("stopping")
print(r2_min)


{'r2': -1, 'parameter': 0, 'lr': 0, 'patience': 0, 'ep': 0}
{'r2': -1, 'parameter': 0, 'lr': 0, 'patience': 0, 'ep': 0}
{'r2': -1, 'parameter': 0, 'lr': 0, 'patience': 0, 'ep': 0}
{'r2': -1, 'parameter': 0, 'lr': 0, 'patience': 0, 'ep': 0}
{'r2': -1, 'parameter': 0, 'lr': 0, 'patience': 0, 'ep': 0}
{'r2': -1, 'parameter': 0, 'lr': 0, 'patience': 0, 'ep': 0}
{'r2': -1, 'parameter': 0, 'lr': 0, 'patience': 0, 'ep': 0}
{'r2': -1, 'parameter': 0, 'lr': 0, 'patience': 0, 'ep': 0}
{'r2': -1, 'parameter': 0, 'lr': 0, 'patience': 0, 'ep': 0}
{'r2': -1, 'parameter': 0, 'lr': 0, 'patience': 0, 'ep': 0}
{'r2': -1, 'parameter': 0, 'lr': 0, 'patience': 0, 'ep': 0}
{'r2': -1, 'parameter': 0, 'lr': 0, 'patience': 0, 'ep': 0}
{'r2': -0.7892859600056417, 'parameter': 4, 'lr': 0.0001, 'patience': 2, 'ep': 2}
{'r2': -0.7892859600056417, 'parameter': 4, 'lr': 0.0001, 'patience': 2, 'ep': 2}
{'r2': -0.7892859600056417, 'parameter': 4, 'lr': 0.0001, 'patience': 2, 'ep': 2}
{'r2': -0.07678851099158268, 'para

KeyboardInterrupt: 