In [1]:
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

In [2]:
df = pd.read_csv("car_prices.csv", chunksize=10000)

cat_columns = [
    'make', 
    'model', 
    'trim', 
    'body', 
    'transmission', 
    'state', 
    'color', 
    'interior', 
]

numeric_column = [
    'year', 
    'condition', 
    'odometer', 
    'mmr', 
    'sellingprice',
    'days_since_sale'
]

In [3]:
def one_hot_encode(df, columns):
    """
    One-hot encodes specified columns in a pandas DataFrame.
    """
    for column in columns:
        dummies = pd.get_dummies(df[column], prefix=column)
        df = pd.concat([df, dummies], axis=1)
        df.drop(column, axis=1, inplace=True)
    return df

def normalize(df, columns):
    """
    Normalizes specified columns in a pandas DataFrame between 0 and 1.
    """
    for column in columns:
        df[column] = (df[column] - df[column].min()) / (df[column].max() - df[column].min())
    return df

In [4]:
import pandas as pd

chunksize = 10000
one_hot_encoded_data = pd.DataFrame()

for chunk in df:

    # Drop the 'vin' column from the chunk
    chunk.drop('vin', axis=1, inplace=True)

    # Convert the 'saledate' column to datetime with timezone
    chunk['saledate'] = pd.to_datetime(chunk['saledate'], utc=True, errors='coerce')

    # Remove the timezone information
    chunk['saledate'] = chunk['saledate'].dt.tz_localize(None)
    
    # Calculate the time since the sale using timezone-naive current datetime
    chunk['days_since_sale'] = pd.Timestamp.now(tz=None) - chunk['saledate']

    # Convert the timedelta to days
    chunk['days_since_sale'] = chunk['days_since_sale'].dt.days

    # Drop the 'saledate' column
    chunk.drop('saledate', axis=1, inplace=True)
    chunk.drop('seller', axis=1, inplace=True)

    # Continue with the rest of the processing
    
    # get sum of all values in each column
    
    for column in cat_columns:
    
        make_frequency = chunk[column].value_counts().sort_values(ascending=False)

        # Calculate the cumulative frequency distribution
        cumulative_frequency = make_frequency.cumsum() / make_frequency.sum()

        # Select categories in the top 80% of the frequency distribution
        selected_categories = cumulative_frequency[cumulative_frequency <= 0.8].index

        # Replace the other categories in the 'make' column with "other"
        chunk[column] = chunk[column].apply(lambda x: x if x in selected_categories else 'other')
    
    chunk = normalize(chunk, numeric_column)
    chunk = one_hot_encode(chunk, cat_columns)
    
    chunk.fillna(0.0, inplace=True)
    
    one_hot_encoded_data = pd.concat([one_hot_encoded_data, chunk], ignore_index=True)
    
    # print dimensions of the one-hot encoded data
    
one_hot_encoded_data.fillna(0.0, inplace=True)
print(one_hot_encoded_data)

            year  condition  odometer       mmr  sellingprice  days_since_sale  make_BMW  make_Chevrolet  make_Chrysler  make_Dodge  make_Ford  make_Honda  make_Hyundai  make_Infiniti  make_Kia  make_Mazda  make_Mercedes-Benz  make_Nissan  make_Toyota  make_Volkswagen  make_other  model_1500  model_200  model_2500  model_3 Series  model_300  model_4Runner  model_5 Series  model_6 Series  model_7 Series  model_A4  model_A6  model_Acadia  model_Accent  model_Accord  model_Altima  model_Avalanche  model_Avalon  model_Avenger  model_C-Class  model_CC  model_CLK-Class  model_CR-V  model_CTS  model_CX-5  model_CX-9  model_Caliber  model_Camaro  model_Camry  model_Challenger  ...  model_Juke  model_MKS  trim_E350 Sport  trim_Two  model_Savana Cargo  trim_EL King Ranch  trim_GLK350  trim_GT Premium  state_wa  trim_SLE-1  model_Outlander Sport  trim_TDI  model_Versa Note  trim_King Ranch  trim_LX Hybrid  trim_PreRunner V6  trim_SR  model_Mazda5  trim_LTZ 1500  trim_E350 Luxury 4MATIC  \
0      

In [5]:
training_data = one_hot_encoded_data.sample(frac=0.8, random_state=1)

In [6]:
validation_data = one_hot_encoded_data.drop(training_data.index)

In [7]:
import torch
tensor_x_training = torch.tensor(training_data.drop(columns=['sellingprice'], axis=1).values, dtype=torch.float32)
tensor_y_training = torch.tensor(training_data['sellingprice'].values, dtype=torch.float32)

print(tensor_x_training.shape[1])

599


In [8]:
import torch
tensor_x_validation = torch.tensor(validation_data.drop(columns=['sellingprice'], axis=1).values, dtype=torch.float32)
tensor_y_validation = torch.tensor(validation_data['sellingprice'].values, dtype=torch.float32)

In [12]:
from torch import nn
from torchsummary import summary
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import torch.optim as optim
# import time
import time

# parameters = [8, 16, 32, 64]
# learning_rates = [0.0001, 0.001, 0.01]
# 
# r2_min = {"r2": -1, "parameter": 0, "learning_rate": 0}

#for parameter in parameters:
#    for learning_rate in learning_rates:

parameter = 32
learning_rate = 0.0001
        
# print("current r2 min : ", r2_min)

# start timer
start = time.time()

model = nn.Sequential(

    nn.Linear(tensor_x_training.shape[1], parameter),

    nn.ReLU(),
    nn.Linear(parameter, parameter),

    nn.ReLU(),
    nn.Linear(parameter, parameter),

    nn.ReLU(),
    nn.Linear(parameter, 1),

)

loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

best_validation_loss = float('inf')

while True:
    optimizer.zero_grad()
    outputs = model(tensor_x_training)
    loss = loss_fn(outputs, tensor_y_training.unsqueeze(1))
    loss.backward()
    optimizer.step()

    # Calculate validation loss
    with torch.no_grad():
        validation_outputs = model(tensor_x_validation)
        validation_loss = loss_fn(validation_outputs, tensor_y_validation.unsqueeze(1))

        r2 = r2_score(tensor_y_validation.numpy(), validation_outputs.numpy())

#        if r2 > r2_min["r2"]:
#            r2_min["r2"] = r2
#            r2_min["parameter"] = parameter
#            r2_min["learning_rate"] = learning_rate

    # check for early stopping
    if validation_loss >= best_validation_loss:
        break 
    
    # stop if the timer has reached 9 minutes
    if time.time() - start > 540:
        print("timer has reached 9 minutes")
        print("r2 min : ", r2)
        break

    best_validation_loss = validation_loss
            

# print("r2 min : ", r2_min)

testing out parameter:  32  and learning rate:  0.0001


In [None]:
# plot the validation set against the output of the model

import matplotlib.pyplot as plt

import torch

# Check if GPU is available and use it, otherwise use the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Generate model predictions on the validation set
with torch.no_grad():
    tensor_x_validation = tensor_x_validation.to(device)
    model.eval()  # Set the model to evaluation mode
    validation_outputs = model(tensor_x_validation).cpu().numpy()

# Convert the target tensor to a NumPy array
validation_targets = tensor_y_validation.cpu().numpy()

# Create a scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(range(validation_targets.shape[0]), validation_targets, label='Actual')
plt.scatter(range(validation_outputs.shape[0]), validation_outputs, label='Predicted', marker='x')
plt.xlabel('Validation Data Index')
plt.ylabel('Target Value')
plt.legend()
plt.title('Actual vs. Predicted Values for the Validation Set')
plt.show()



In [None]:
# model = nn.Sequential(

#     nn.Linear(tensor_x_training.shape[1], parameter),

#     nn.ReLU(),
#     nn.Linear(parameter, parameter),

#     nn.ReLU(),
#     nn.Linear(parameter, 1),
# )

# after 23 minutes, 8 parameters
# r2:  0.9022029945690702
# best validation loss:  tensor(0.0007)  validation loss:  tensor(0.0007)

# after 13 minutes, 16 parameters
# r2:  0.9024288425404208
# best validation loss:  tensor(0.0007)  validation loss:  tensor(0.0007)

# after 9 minutes, 32 parameters
# r2:  0.9110110570893579
# best validation loss:  tensor(0.0006)  validation loss:  tensor(0.0006)

# after 9 minutes, 64 parameters
#r2:  0.8424793580733347
#best validation loss:  tensor(0.0011)  validation loss:  tensor(0.0011)

In [None]:
# model = nn.Sequential(

#     nn.Linear(tensor_x_training.shape[1], parameter),

#     nn.ReLU(),
#     nn.Linear(parameter, parameter),

#     nn.ReLU(),
#     nn.Linear(parameter, parameter),

#     nn.ReLU(),
#     nn.Linear(parameter, 1),

# )

# after 9 minutes, 8 parameters

# timer has reached 9 minutes
# r2 min :  0.8209289690997417

# current r2 min :  {'r2': 0.872825849225731, 'parameter': 8, 'learning_rate': 0.001}

# testing out parameter:  8  and learning rate:  0.01
# current r2 min :  {'r2': 0.872825849225731, 'parameter': 8, 'learning_rate': 0.001}
# testing out parameter:  16  and learning rate:  0.0001
# current r2 min :  {'r2': 0.872825849225731, 'parameter': 8, 'learning_rate': 0.001}
# testing out parameter:  16  and learning rate:  0.001
# current r2 min :  {'r2': 0.9359466184869851, 'parameter': 16, 'learning_rate': 0.0001}
# testing out parameter:  16  and learning rate:  0.01
# current r2 min :  {'r2': 0.9359466184869851, 'parameter': 16, 'learning_rate': 0.0001}
# testing out parameter:  32  and learning rate:  0.0001
# current r2 min :  {'r2': 0.9359466184869851, 'parameter': 16, 'learning_rate': 0.0001}
# testing out parameter:  32  and learning rate:  0.001
# current r2 min :  {'r2': 0.9362101278204871, 'parameter': 32, 'learning_rate': 0.0001}


# r2 min :  {'r2': 0.9362101278204871, 'parameter': 32, 'learning_rate': 0.0001}
