In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [17]:
# Load the raw dataset from disk into a DataFrame.
df = pd.read_csv('final_test-1.csv')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
train_data, test_data = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Separate the regression target ('sellingprice') from the predictors,
# first for the training partition ...
train_features = train_data.drop(columns='sellingprice')
training_label = train_data['sellingprice']

# ... and then for the held-out partition.
test_features = test_data.drop(columns='sellingprice')
test_label = test_data['sellingprice']

In [None]:
# Baseline: ordinary least squares on the training partition.
# NOTE(review): this cell treats 'sellingprice' as log1p(price) (it applies
# expm1 below) and needs purely numeric features, but in file order the
# split above is taken from the raw CSV, before the cleaning / log1p cell --
# confirm the intended execution order.
lin_reg = LinearRegression().fit(train_features, training_label)

# Task 2.2. predict on the rows the model was trained on
logSalePrice_train_linear = pd.Series(
    lin_reg.predict(train_features),
    name="logSalePrice_train_linear",
)

# Task 2.3. training-set error metrics
lin_mse_train = mean_squared_error(training_label, logSalePrice_train_linear)
lin_r2_train = r2_score(training_label, logSalePrice_train_linear)
print(f"The mean squared error: {lin_mse_train}")
print(f"The R^2 value is: {lin_r2_train}")

# Task 2.4 / 2.5. undo the log1p transform for predicted and observed
# training prices
predict_price_train_linear = np.expm1(logSalePrice_train_linear)
observed_price_train = np.expm1(training_label)

# Optional plot (helper not defined in this notebook):
# obs_vs_pred_train(observed_price_train, predict_price_train_linear)

# Task 2.7. predict on the held-out rows, with the columns put in the same
# order the model saw during fitting
test_features = test_features[train_features.columns]
logSalePrice_test_linear = lin_reg.predict(test_features)

# Task 2.8. test-set error metrics
lin_mse_test = mean_squared_error(test_label, logSalePrice_test_linear)
lin_r2_test = r2_score(test_label, logSalePrice_test_linear)
print(f"The mean squared error: {lin_mse_test}")
print(f"The R^2 value is: {lin_r2_test}")

# Task 2.9 / 2.10. undo the log1p transform for predicted and observed
# test prices
predict_price_test_linear = np.expm1(logSalePrice_test_linear)
observed_price_test = np.expm1(test_label)

# Optional plot (helper not defined in this notebook):
# obs_vs_pred_test(observed_price_test, predict_price_test_linear)

In [14]:
# Remove the colour, market-value ('mmr') and trim columns, add the
# year x odometer interaction term, then discard every row that still has
# a missing value.
df = (
    df.drop(columns=['color', 'mmr', 'trim'])
      .assign(age_odometer=lambda frame: frame['year'] * frame['odometer'])
      .dropna()
)

# Keep only plausible records: model year 2000 or later, odometer within
# [1,000, 300,000] and selling price within [1,000, 100,000] (bounds
# inclusive). No NaNs remain at this point, so these comparisons are the
# exact complement of the "drop if below / above" rules.
in_range = (
    df['year'].ge(2000)
    & df['odometer'].between(1000, 300000)
    & df['sellingprice'].between(1000, 100000)
)
df = df.loc[in_range].copy()

# Model the price on a log scale: sellingprice becomes log1p(price).
df['sellingprice'] = np.log1p(df['sellingprice'])

# drop invalid vins
def vectorized_check_digit(vins):
    """Return a boolean mask telling which VINs carry a correct check digit.

    Every VIN is lower-cased and its 17 characters are transliterated to
    numbers, multiplied by the positional weights and summed. The sum
    modulo 11 (with 10 written as 'x') must equal the character in the 9th
    position (index 8).

    Malformed entries never raise; they are reported as invalid. That covers
    values that are not strings, strings that are not exactly 17 characters
    long, and strings containing a character that has no transliteration
    value (the letters i, o and q, punctuation, whitespace, ...).

    Parameters
    ----------
    vins : iterable of str
        VINs to validate (list, numpy array, pandas Series, ...).

    Returns
    -------
    numpy.ndarray of bool
        One entry per input VIN, in input order; empty for empty input.
    """
    # Positional weights; the check-digit position itself (index 8) weighs 0.
    weights = np.array([8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2])

    # Value of every character that may legally appear in a VIN. Digits map
    # to themselves; i, o and q are deliberately absent.
    char_values = {str(digit): digit for digit in range(10)}
    char_values.update({
        'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8,
        'j': 1, 'k': 2, 'l': 3, 'm': 4, 'n': 5, 'p': 7, 'r': 9, 's': 2,
        't': 3, 'u': 4, 'v': 5, 'w': 6, 'x': 7, 'y': 8, 'z': 9
    })

    # Normalise case; anything that is not a string becomes '' and is
    # therefore rejected by the length check below.
    lowered = [vin.lower() if isinstance(vin, str) else '' for vin in vins]
    is_valid = np.zeros(len(lowered), dtype=bool)

    # Only 17-character entries can be stacked into a rectangular array.
    well_formed = np.array([len(vin) == 17 for vin in lowered], dtype=bool)
    if not well_formed.any():
        return is_valid

    vin_array = np.array(
        [list(vin) for vin, ok in zip(lowered, well_formed) if ok]
    )

    # Transliterate all positions at once, remembering which cells held a
    # recognised character.
    numeric_values = np.zeros(vin_array.shape, dtype=int)
    recognised = np.zeros(vin_array.shape, dtype=bool)
    for char, value in char_values.items():
        char_mask = vin_array == char
        numeric_values[char_mask] = value
        recognised |= char_mask

    # Weighted sum modulo 11 gives the expected check digit (10 -> 'x').
    check_digits = (numeric_values * weights).sum(axis=1) % 11
    expected_check = np.where(check_digits == 10, 'x', check_digits.astype(str))

    # A VIN is valid when every character is legal and the 9th character
    # equals the computed check digit.
    all_legal = recognised.all(axis=1)
    is_valid[well_formed] = all_legal & (vin_array[:, 8] == expected_check)
    return is_valid

# Keep only the rows whose VIN passes the check-digit validation. The
# explicit copy makes the result an independent frame, so the column
# assignments below do not write into a slice of the previous frame.
valid_vins = vectorized_check_digit(df['vin'])
df = df[valid_vins].copy()

# Fill any missing transmission values with the most frequent one. mode()
# is empty when the column has no non-missing values (for example when no
# row survived the VIN filter); in that case there is nothing to fill with.
transmission_mode = df['transmission'].mode()
if not transmission_mode.empty:
    df['transmission'] = df['transmission'].fillna(transmission_mode.iloc[0])

# Replace each transmission label by an integer code;
# unique_transmissions[code] gives the original label back.
df['transmission_code'], unique_transmissions = pd.factorize(df['transmission'])
df = df.drop(columns=['transmission'])

# Drop every row with two or more missing values, i.e. require at most one
# missing cell per row. Deriving the threshold from the current column
# count keeps the rule correct if the set of columns changes.
df = df.dropna(thresh=df.shape[1] - 1)

# Normalise body types to lower case before grouping them.
body = df['body'].str.lower()

# Collapse every variant that mentions 'convertible' or 'pickup' into the
# plain category name.
body = body.mask(body.str.contains('convertible', regex=False, na=False), 'convertible')
body = body.mask(body.str.contains('pickup', regex=False, na=False), 'pickup')

# Anything outside the recognised categories (including missing values)
# becomes 'other'.
valid_types = ['sedan', 'hatchback', 'wagon', 'coupe', 'convertible', 'suv', 'pickup']
df['body'] = body.where(body.isin(valid_types), 'other')

# Replace the label by an integer code; unique_bodies[code] is the label.
df['body_code'], unique_bodies = pd.factorize(df['body'])
df = df.drop(columns=['body'])

# Drop rows whose interior colour is the placeholder '—'. The copy makes
# the result an independent frame that is safe to assign into.
df = df[df['interior'] != '—'].copy()

# Keep the five most popular interior colours and fold everything else
# into 'other'. The comparison AND the stored value both use the
# lower-cased text, so spellings that differ only in case (e.g. 'Black'
# and 'black') end up in the same category and share one code.
top_5_interiors = ('black', 'gray', 'beige', 'tan', 'brown')
interior_lower = df['interior'].astype(str).str.lower()
df['interior'] = interior_lower.where(interior_lower.isin(top_5_interiors), 'other')

# Replace the label by an integer code; unique_interiors[code] is the label.
df['interior_code'], unique_interiors = pd.factorize(df['interior'])
df = df.drop(columns=['interior'])

def _has_more_than_500_rows(group):
    """Group predicate: True when the group holds more than 500 rows."""
    return len(group) > 500


# Replace each seller by an integer code; unique_sellers[code] is the name.
df['seller_code'], unique_sellers = pd.factorize(df['seller'])
df = df.drop(columns=['seller'])

# Keep only sellers with more than 500 sales (500 or fewer are dropped).
df = df.groupby('seller_code').filter(_has_more_than_500_rows)

# Lower-case the make, then replace it by an integer code;
# unique_makes[code] is the (lower-cased) make.
df['make'] = df['make'].str.lower()
df['make_code'], unique_makes = pd.factorize(df['make'])
df = df.drop(columns=['make'])

# Keep only models with more than 500 sales (500 or fewer are dropped).
df = df.groupby('model').filter(_has_more_than_500_rows)

# Replace each model by an integer code; unique_models[code] is the name.
df['model_code'], unique_models = pd.factorize(df['model'])
df = df.drop(columns=['model'])

# Bucket the odometer into 1000-mile bands. Exact multiples of 1000 are
# placed in the band below (e.g. 5000 -> 4, 5001 -> 5).
odometer = df['odometer']
on_band_edge = (odometer % 1000 == 0).astype(int)
df['odometer_category'] = odometer.astype(int) // 1000 - on_band_edge
df = df.drop(columns=['odometer'])

# Derive the sale quarter from the month abbreviation found in the
# lower-cased sale date text. Rows matching none of the patterns are left
# without a quarter value.
sale_dates_lower = df['saledate'].str.lower()
quarter_patterns = (r'jan|feb|mar', r'apr|may|jun', r'jul|aug|sep', r'oct|nov|dec')
for quarter_number, month_pattern in enumerate(quarter_patterns, start=1):
    df.loc[sale_dates_lower.str.contains(month_pattern), "quarter"] = quarter_number
df = df.drop(columns=['saledate'])

# Keep only states with more than 500 sales (500 or fewer are dropped).
df = df.groupby('state').filter(lambda state_rows: len(state_rows) > 500)

# Replace each state by an integer code; unique_states[code] is the state.
df['state_code'], unique_states = pd.factorize(df['state'])
df = df.drop(columns=['state'])

# The tenth VIN character (index 9) is stored as 'modelyear', turned into
# an integer code (unique_modelyears[code] is the character) and the
# temporary text column is removed again.
df['modelyear'] = df['vin'].str.get(9)
df['modelyear_code'], unique_modelyears = pd.factorize(df['modelyear'])
df = df.drop(columns=['modelyear'])

In [15]:
# 2. Feature Selection
from xgboost import XGBRegressor
def select_features(X, y, threshold=0.01):
    """Keep the columns of ``X`` a gradient-boosting model finds important.

    A ``GradientBoostingRegressor`` (random_state=42) is fitted on ``X`` and
    ``y``; only columns whose feature importance is strictly greater than
    ``threshold`` are retained.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Target values aligned with the rows of ``X``.
    threshold : float, default 0.01
        Minimum (exclusive) importance a column needs to be kept.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the retained columns, in their original order.
    """
    ranker = GradientBoostingRegressor(random_state=42)
    ranker.fit(X, y)

    # One importance score per column, labelled with the column name.
    importances = pd.Series(ranker.feature_importances_, index=X.columns)
    kept_columns = importances[importances > threshold].index
    return X[kept_columns]

# 3. Model Creation and Training
def create_models():
    """Build the candidate regressors and their hyper-parameter grids.

    Returns
    -------
    tuple of (dict, dict)
        ``models`` maps a model name ('xgboost', 'gradient_boosting',
        'random_forest') to an unfitted estimator seeded with
        random_state=42; ``param_grids`` maps the same names to the grid of
        hyper-parameter values to search for that estimator.
    """
    seed = 42

    # Search values shared by more than one grid; every grid receives its
    # own list objects.
    depth_options = (3, 5, 7)
    rate_options = (0.01, 0.05, 0.1)
    size_options = (100, 200, 300)
    sample_options = (0.8, 0.9, 1.0)

    def boosting_grid():
        """Grid entries common to both boosting estimators."""
        return {
            'max_depth': list(depth_options),
            'learning_rate': list(rate_options),
            'n_estimators': list(size_options),
        }

    xgb_grid = boosting_grid()
    xgb_grid['min_child_weight'] = [1, 3, 5]
    xgb_grid['subsample'] = list(sample_options)

    gbr_grid = boosting_grid()
    gbr_grid['subsample'] = list(sample_options)

    forest_grid = {
        'max_depth': [None, 10, 20],
        'n_estimators': list(size_options),
        'min_samples_split': [2, 5, 10],
    }

    models = {
        'xgboost': XGBRegressor(objective='reg:squarederror', random_state=seed),
        'gradient_boosting': GradientBoostingRegressor(random_state=seed),
        'random_forest': RandomForestRegressor(random_state=seed),
    }
    param_grids = {
        'xgboost': xgb_grid,
        'gradient_boosting': gbr_grid,
        'random_forest': forest_grid,
    }
    return models, param_grids

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
import numpy as np

# 4. Model Training and Evaluation
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Grid-search every candidate model and report train/test metrics.

    Each estimator from ``create_models()`` is placed behind a
    ``RobustScaler`` in a pipeline and tuned with a 5-fold ``GridSearchCV``
    scored on R^2, using all available cores. The refitted best pipeline is
    then used to predict on both partitions.

    Parameters
    ----------
    X_train, X_test : feature matrices for the training and held-out rows.
    y_train, y_test : matching target values.

    Returns
    -------
    dict
        Maps each model name to a dict with the keys 'train_r2', 'test_r2',
        'train_mse', 'test_mse' and 'best_params'.
    """
    estimators, grids = create_models()

    def fit_and_score(estimator, grid):
        """Tune one estimator and collect its metrics."""
        search = GridSearchCV(
            Pipeline([
                ('scaler', RobustScaler()),
                ('model', estimator),
            ]),
            # Grid keys must address the 'model' step of the pipeline.
            param_grid={f'model__{param}': values for param, values in grid.items()},
            cv=5,
            scoring='r2',
            n_jobs=-1,
        )
        search.fit(X_train, y_train)

        fitted = search.predict(X_train)
        held_out = search.predict(X_test)

        return {
            'train_r2': r2_score(y_train, fitted),
            'test_r2': r2_score(y_test, held_out),
            'train_mse': mean_squared_error(y_train, fitted),
            'test_mse': mean_squared_error(y_test, held_out),
            'best_params': search.best_params_,
        }

    return {
        name: fit_and_score(estimator, grids[name])
        for name, estimator in estimators.items()
    }

In [None]:
# Make the VIN the row identifier. This keeps the text 'vin' column out of
# the feature matrix, which the estimators cannot fit on.
# NOTE(review): assumes 'vin' is the only non-numeric column left after
# preprocessing -- confirm with df.dtypes.
model_frame = df.set_index('vin')

# Split features and target.
X = model_frame.drop(columns='sellingprice')

# 'sellingprice' was already converted with np.log1p during preprocessing,
# so it is used as-is; a second log1p here would train and score the models
# on log1p(log1p(price)).
y = model_frame['sellingprice']

# Split data BEFORE selecting features, so the held-out rows play no part
# in deciding which columns are kept.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Select important features from the training rows only, then give the test
# rows (and X, for later cells) the same columns.
X_train = select_features(X_train, y_train)
X_test = X_test[X_train.columns]
X = X[X_train.columns]

# Train and evaluate models
results = train_and_evaluate(X_train, X_test, y_train, y_test)

# Print results
for name, metrics in results.items():
    print(f"\nResults for {name}:")
    print(f"Train R² Score: {metrics['train_r2']:.4f}")
    print(f"Test R² Score: {metrics['test_r2']:.4f}")
    print(f"Train MSE: {metrics['train_mse']:.4f}")
    print(f"Test MSE: {metrics['test_mse']:.4f}")
    print("Best Parameters:", metrics['best_params'])

In [19]:
# Report how many rows predictions.csv contains.
import pandas as pd

df = pd.read_csv("predictions.csv")
print(df.shape[0])

148821
