In [29]:
import pandas as pd
# Widen pandas' text rendering so wide frames are printed without column truncation.
pd.options.display.max_columns = 100
pd.options.display.width = 1000

In [30]:
# Because `chunksize` is set, read_csv does not return a DataFrame here: `df` is an
# iterator that yields DataFrames of up to 10000 rows each.
# NOTE(review): the iterator is consumed by the first loop over it, so this cell has
# to be re-run before the chunk-processing loop can be run again.
df = pd.read_csv("car_prices.csv", chunksize=10000)

# Columns treated as categorical: rare values are bucketed and the column is one-hot encoded.
cat_columns = ['make', 'model', 'trim', 'body', 'transmission', 'state', 'color', 'interior']

# Columns treated as numeric: each is min-max scaled ('sellingprice' is later used as the target).
numeric_column = ['year', 'condition', 'odometer', 'mmr', 'sellingprice', 'days_since_sale']

In [31]:
def one_hot_encode(df, columns):
    """
    Replace each listed column of a DataFrame with its one-hot indicator columns.

    Indicator columns are named "<column>_<value>" and are appended after the
    remaining original columns, grouped in the order the columns were listed.
    The frame passed in is left untouched; the encoded frame is returned.
    """
    if not columns:
        return df

    pieces = [df.drop(columns, axis=1)]
    for name in columns:
        pieces.append(pd.get_dummies(df[name], prefix=name))
    return pd.concat(pieces, axis=1)

def normalize(df, columns):
    """
    Min-max scales the specified columns of a pandas DataFrame into [0, 1].

    Each listed column is replaced by (value - min) / (max - min), using the
    minimum and maximum of that column within `df`.

    Parameters:
        df: DataFrame to scale. It is modified in place.
        columns: iterable of column names to scale.

    Returns:
        The same DataFrame object, with the listed columns scaled.

    A column whose values are all equal has zero range; it is mapped to 0.0
    instead of NaN. Missing values stay missing.
    """
    for column in columns:
        lowest = df[column].min()
        span = df[column].max() - lowest
        # A constant column gives span == 0, and 0/0 would turn every row into NaN.
        # Dividing by 1 instead leaves the already-zero numerator as 0.0.
        if span == 0:
            span = 1
        df[column] = (df[column] - lowest) / span
    return df

In [32]:
import pandas as pd

# NOTE(review): this value is not read by this cell; the actual chunk size is the one
# given to read_csv where `df` is created.
chunksize = 10000

# Take "now" once, in UTC, so that every chunk is measured against the same instant
# and in the same timezone as the parsed sale dates.
reference_time = pd.Timestamp.now(tz='UTC')

# Processed chunks are collected here and concatenated once after the loop; calling
# pd.concat on a growing frame inside the loop re-copies all earlier rows every time.
processed_chunks = []

# NOTE(review): category selection and min-max scaling below use the statistics of
# each chunk separately, so the same raw value can be encoded differently from one
# chunk to the next (this includes the 'sellingprice' target). Computing them over
# the whole dataset would be more consistent - confirm the intended behaviour.
for chunk in df:

    # Parse sale dates as UTC timestamps; values that cannot be parsed become NaT.
    sale_dates = pd.to_datetime(chunk['saledate'], utc=True, errors='coerce')

    # Whole days between the sale and the reference instant (NaN where the date is NaT).
    chunk['days_since_sale'] = (reference_time - sale_dates).dt.days

    # 'vin' and 'seller' are not used as features; 'saledate' is replaced by the day count.
    chunk = chunk.drop(columns=['vin', 'saledate', 'seller'])

    for column in cat_columns:

        # value_counts returns the counts already ordered from most to least frequent.
        category_counts = chunk[column].value_counts()

        # Running share of rows covered by the categories seen so far.
        cumulative_share = category_counts.cumsum() / category_counts.sum()

        # Keep the most frequent categories whose running share stays within 80%.
        # NOTE(review): if one category alone covers more than 80% of the chunk,
        # nothing is selected and the whole column becomes "other".
        selected_categories = cumulative_share[cumulative_share <= 0.8].index

        # Everything else, including missing values, is bucketed as "other".
        chunk[column] = chunk[column].where(chunk[column].isin(selected_categories), 'other')

    chunk = normalize(chunk, numeric_column)
    chunk = one_hot_encode(chunk, cat_columns)

    # Missing numeric values become 0.0. Casting to float32 keeps every column numeric
    # whatever dtype get_dummies produced, so the frame converts cleanly to a float tensor.
    processed_chunks.append(chunk.fillna(0.0).astype('float32'))

# An exhausted reader yields no chunks; fall back to an empty frame in that case.
if processed_chunks:
    one_hot_encoded_data = pd.concat(processed_chunks, ignore_index=True)
else:
    one_hot_encoded_data = pd.DataFrame()

# Indicator columns that did not occur in a given chunk are NaN after the concat; they mean "absent".
one_hot_encoded_data = one_hot_encoded_data.fillna(0.0)
print(one_hot_encoded_data)

            year  condition  odometer       mmr  sellingprice  days_since_sale  make_BMW  make_Chevrolet  make_Chrysler  make_Dodge  make_Ford  make_Honda  make_Hyundai  make_Infiniti  make_Kia  make_Mazda  make_Mercedes-Benz  make_Nissan  make_Toyota  make_Volkswagen  make_other  model_1500  model_200  model_2500  model_3 Series  model_300  model_4Runner  model_5 Series  model_6 Series  model_7 Series  model_A4  model_A6  model_Acadia  model_Accent  model_Accord  model_Altima  model_Avalanche  model_Avalon  model_Avenger  model_C-Class  model_CC  model_CLK-Class  model_CR-V  model_CTS  model_CX-5  model_CX-9  model_Caliber  model_Camaro  model_Camry  model_Challenger  ...  model_Juke  model_MKS  trim_E350 Sport  trim_Two  model_Savana Cargo  trim_EL King Ranch  trim_GLK350  trim_GT Premium  state_wa  trim_SLE-1  model_Outlander Sport  trim_TDI  model_Versa Note  trim_King Ranch  trim_LX Hybrid  trim_PreRunner V6  trim_SR  model_Mazda5  trim_LTZ 1500  trim_E350 Luxury 4MATIC  \
0      

In [33]:
# Randomly draw 80% of the rows for training; the fixed random_state makes the draw repeatable.
training_data = one_hot_encoded_data.sample(frac=0.8, random_state=1)

In [34]:
# Validation set: every row whose index label was not drawn into the training sample.
validation_data = one_hot_encoded_data[~one_hot_encoded_data.index.isin(training_data.index)]

In [35]:
import torch
# Training inputs: every column except the target, as a float32 matrix (rows x features).
tensor_x_training = torch.tensor(
    training_data.drop(columns='sellingprice').to_numpy(),
    dtype=torch.float32,
)
# Training target: the 'sellingprice' column as a 1-D float32 vector.
tensor_y_training = torch.tensor(
    training_data['sellingprice'].to_numpy(),
    dtype=torch.float32,
)

breadth:  347198
length:  599
torch.Size([347198, 599])
torch.Size([347198])


In [36]:
import torch
# Validation inputs: every column except the target, as a float32 matrix (rows x features).
tensor_x_validation = torch.tensor(
    validation_data.drop(columns='sellingprice').to_numpy(),
    dtype=torch.float32,
)
# Validation target: the 'sellingprice' column as a 1-D float32 vector.
tensor_y_validation = torch.tensor(
    validation_data['sellingprice'].to_numpy(),
    dtype=torch.float32,
)

In [None]:
from torch import nn
from torchsummary import summary
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch.optim as optim

# Grid search over the hidden-layer width (`parameters`) and the Adam learning rate.
parameters = [8, 16, 32]
learning_rates = [0.000001, 0.00001, 0.0001]

# One entry per (width, learning rate) pair, so the outcome of the search can be
# inspected after the loops finish instead of being discarded.
search_results = []

for parameter in parameters:
    for learning_rate in learning_rates:

        # Two hidden layers of `parameter` units each and a single regression output.
        model = nn.Sequential(
            # `.shape[1]` is the number of input features. `.size` is a method, so
            # subscripting it (`.size[1]`) raises a TypeError.
            nn.Linear(tensor_x_training.shape[1], parameter),
            nn.ReLU(),
            nn.Linear(parameter, parameter),
            nn.ReLU(),
            nn.Linear(parameter, 1),
        )

        loss_fn = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        best_validation_loss = float('inf')
        epochs_run = 0

        # Full-batch training: each pass is one optimizer step on the whole training set.
        while True:
            optimizer.zero_grad()
            # The model returns shape (N, 1) while the targets are (N,). Without the
            # squeeze, MSELoss broadcasts the pair to (N, N) and computes the wrong loss.
            outputs = model(tensor_x_training).squeeze(1)
            loss = loss_fn(outputs, tensor_y_training)
            loss.backward()
            optimizer.step()
            epochs_run += 1

            # Calculate validation loss (no gradients needed); keep it as a plain float.
            with torch.no_grad():
                validation_outputs = model(tensor_x_validation).squeeze(1)
                validation_loss = loss_fn(validation_outputs, tensor_y_validation).item()

            # Early stopping: continue only while the validation loss keeps improving.
            # NOTE(review): there is no patience and no epoch cap, so a run ends at the
            # first non-improving step and may take long at the smallest learning rates.
            if validation_loss < best_validation_loss:
                best_validation_loss = validation_loss
            else:
                break

        search_results.append({
            'parameter': parameter,
            'lr': learning_rate,
            'best_validation_loss': best_validation_loss,
            'epochs': epochs_run,
        })

In [None]:
# {'r2': 0.9330620578253821, 'parameter': 32, 'lr': 0.0001}

# model = nn.Sequential(
#     nn.Linear(tensor_x_training.shape[1], parameter),
#     nn.ReLU(),
#     nn.Linear(parameter, parameter),
#     nn.ReLU(),
#     nn.Linear(parameter, parameter),
#     nn.ReLU(),
#     nn.Linear(parameter, parameter),
#     nn.ReLU(),
#     nn.Linear(parameter, parameter),
#     nn.ReLU(),
#     nn.Linear(parameter, parameter),
#     nn.ReLU(),
#     nn.Linear(parameter, parameter),
#     nn.ReLU(),
#     nn.Linear(parameter, 1),
# )