In [None]:
import seaborn as sns
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.imputation import mice

def _leading_number(column):
    """Return the leading numeric part of each string in ``column`` as float.

    Entries are expected to look like "<number> <unit>" (the units seen in
    this notebook are "kmpl", "CC" and "bhp").  Entries that do not start
    with a number (for example a "null" placeholder) and missing entries
    become NaN.

    ``str.rstrip(" kmpl")`` is deliberately avoided: it strips any trailing
    run of the *characters* ' ', 'k', 'm', 'p', 'l' rather than the suffix,
    which only works as long as the value happens to end in a digit.
    """
    number = column.str.extract(r"^\s*([-+]?(?:\d+\.?\d*|\.\d+))", expand=False)
    return number.astype("float")


# Load the raw cars data.
df1 = pd.read_csv("cars.csv")

# Drop the unit suffixes and convert the measurements to floats.
for measure in ("Mileage", "Engine", "Power"):
    df1[measure] = _leading_number(df1[measure])

# Low-cardinality descriptors are stored as categoricals.
for descriptor in ("Fuel_Type", "Transmission", "Owner_Type"):
    df1[descriptor] = df1[descriptor].astype("category")

# Tokenise the name once. The first token becomes Company; the second and
# third tokens are concatenated (no separator) into Model. Names with fewer
# than three tokens therefore get a missing Model.
name_parts = df1["Name"].str.split(" ")
df1["Company"] = name_parts.str[0]
df1["Model"] = name_parts.str[1] + name_parts.str[2]

# Normalize price using a (vectorized) natural-log transformation.
df1["Price_Norm"] = np.log(df1["Price"])

# Plot the price data before and after the log transformation.
fig, axes = plt.subplots(1, 2, figsize=(10, 8))
fig.suptitle('Price Before and After Log Transformation')

price_panels = (
    ("Price", "Original Price", "Price"),
    ("Price_Norm", "Norm Price", "Log Price"),  # label the transformed axis as such
)
for ax, (price_col, panel_title, x_label) in zip(axes, price_panels):
    sns.histplot(df1[price_col], kde=True, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
def missing_values(df):
    """Print, for every column of ``df``, how many values are present and missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Only this argument is read; no global frame is used.

    Returns
    -------
    None
        One line per column is written to standard output.
    """
    numrows = len(df)
    # DataFrame.count() yields the number of non-null entries per column,
    # in column order.
    for col, num_vals in df.count().items():
        num_missing = numrows - num_vals
        print(f"{col}: {num_vals} out of {numrows}. {num_missing} missing values.")

# Print the per-column present/missing counts for the cars frame.
missing_values(df1)
# NOTE(review): head() is not the last expression of this cell, so a notebook
# presumably renders only the describe() table below - confirm, and move
# head() to its own cell if its output is wanted.
df1.head()
# Summary statistics of df1 (the cell's displayed result).
df1.describe()

Power is missing 143 values.

Engine is missing 36 values.

The column with the most missing values is New_Price, with 5,195. We can drop this column.

In [None]:
# Complete-case analysis: keep only the rows in which every model variable
# (the five predictors and the log price) is present.
complete_case_predictors = ['Power', 'Engine', 'Mileage', 'Kilometers_Driven', 'Year']
df1_cleaned = df1.loc[:, complete_case_predictors + ['Price_Norm']].dropna()

y1 = df1_cleaned['Price_Norm']
# OLS with an intercept (the added constant column) and one slope per predictor.
X1 = sm.add_constant(df1_cleaned[complete_case_predictors])

price_model1 = sm.OLS(y1, X1).fit()
price_model1.summary()

After dropping the rows with missing values, the model was trained on 5,874 observations. The model has an R-squared of 0.833, meaning it explains 83.3% of the variance in the log-transformed price. The p-values for both Mileage and Kilometers_Driven are high, indicating that neither is statistically significant.

In [None]:
# Mean imputation: fill every missing value with the mean of its column.
# The numeric model columns are selected first because taking the mean of
# the whole of df1 raised an error on the string-valued name column.
# Every selected column, Price_Norm included, is filled with its own mean.
numeric_subset = df1[['Power', 'Engine', 'Mileage', 'Kilometers_Driven', 'Year', 'Price_Norm']]
column_means = numeric_subset.mean()
df_avgs = numeric_subset.fillna(column_means)
df_avgs.describe()

# Response (kept as a one-column frame) and predictors for the second model.
y2 = df_avgs[['Price_Norm']]
# OLS with an intercept (the added constant column) and one slope per predictor.
X2 = sm.add_constant(df_avgs.drop(columns='Price_Norm'))

price_model2 = sm.OLS(y2, X2).fit()
price_model2.summary()

6019 rows were used to train the model. The R-Squared is now 0.828, so 82.8% of the variance is now explained. The P-Values for Mileage and KM Driven are still high, so these coefficients are still not significant to the model.

In [None]:
# Multiple imputation by chained equations (MICE) on the model variables.
mice_df = df1[['Power', 'Engine', 'Mileage', 'Kilometers_Driven', 'Year', 'Price_Norm']]
imp = mice.MICEData(mice_df)
fml = 'Price_Norm ~ Power + Engine + Mileage + Kilometers_Driven + Year'

# Give the analysis object its own name. Binding it to `mice` would replace
# the imported statsmodels module, so re-running this cell (or any later
# `mice.MICEData(...)` call) would fail with an AttributeError.
mice_model = mice.MICE(fml, sm.OLS, imp)

# 10 burn-in update cycles, then 10 imputed data sets whose OLS fits are pooled.
results3 = mice_model.fit(n_burnin=10, n_imputations=10)
results3.summary()

Compared with the original model, the parameters have changed slightly. The intercept is slightly lower, Mileage and Kilometers_Driven have smaller effects and are still not statistically significant, and the year impacts the price by about 9.

In [None]:
from statsmodels.imputation import mice

mice_df = df1[['Power', 'Mileage', 'Kilometers_Driven', 'Year', 'Engine', 'Price_Norm']]
imp = mice.MICEData(mice_df)

# A freshly constructed MICEData only contains its initial fill-in values.
# Run the chained-equation update cycles so that imp.data actually holds
# MICE-imputed values before it is plotted.
imp.update_all(10)

# 2x2 grid: one row per feature, observed values on the left and the
# MICE-completed column on the right.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
fig.suptitle('Power and Engine With and Without Mice')

for row, feature in enumerate(('Power', 'Engine')):
    panels = (
        (df1[feature], f'Original {feature}'),
        (imp.data[feature], f'{feature} with Mice'),
    )
    for col, (values, panel_title) in enumerate(panels):
        ax = axes[row, col]
        sns.histplot(values, kde=True, ax=ax)
        ax.set_title(panel_title)
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
def _plot_imputation_overlay(original, imputed, feature):
    """Draw overlaid histograms of a feature before and after MICE imputation.

    Parameters
    ----------
    original : pandas.Series
        The feature as stored in the source frame (missing values included).
    imputed : pandas.Series
        The same feature taken from the MICE data object.
    feature : str
        Feature name, used for the axis label, legend entries and title.
    """
    # Every overlay gets its own explicitly sized figure so that all
    # features are drawn at the same size.
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.histplot(original, kde=True, color='blue', label=f'Original {feature}',
                 bins=20, alpha=0.5, ax=ax)
    sns.histplot(imputed, kde=True, color='orange', label=f'{feature} with MICE',
                 bins=20, alpha=0.5, ax=ax)
    ax.set_title(f'Distribution of {feature}: Original vs. MICE Imputed')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()
    plt.show()


# One overlay per feature that had missing values imputed.
for overlay_feature in ('Power', 'Engine'):
    _plot_imputation_overlay(df1[overlay_feature], imp.data[overlay_feature], overlay_feature)

**Section 2: Predicting Customer Spending**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

customers = pd.read_csv("customers.csv")

# One-hot encode the two categorical columns.
customers = pd.get_dummies(data=customers, columns=['sex', 'race'], prefix=['sex','race'])

# Race-by-sex interaction indicators: the element-wise product of the two
# dummies, named "<Race>_<Sex>". Sex varies in the outer loop so the columns
# are appended in the order female, male, other (each across all races).
sex_levels = ("female", "male", "other")
race_levels = ("asian", "black", "hispanic", "other", "white")
for sex_level in sex_levels:
    for race_level in race_levels:
        interaction = f"{race_level.capitalize()}_{sex_level.capitalize()}"
        customers[interaction] = np.multiply(
            customers[f"race_{race_level}"], customers[f"sex_{sex_level}"]
        )
customers.head()

# Predictors: everything except the response, the plain sex/race dummies and
# the Other_Other interaction; the response is spend.
plain_dummies = [f"sex_{s}" for s in sex_levels] + [f"race_{r}" for r in race_levels]
X = customers.drop(columns=['spend', *plain_dummies, 'Other_Other'])
y = customers['spend']

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ordinary least-squares fit on the training rows.
linreg = LinearRegression()
model = linreg.fit(X_train, y_train)

# Coefficient table, one row per feature, rounded to three decimals.
linreg_coef = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': np.round(model.coef_, 3),
})

print(linreg_coef)

# R^2 on the held-out rows.
linreg_r2 = linreg.score(X_test, y_test)
print("R^2:", linreg_r2)

In [None]:
from sklearn.linear_model import Ridge, RidgeCV

# Split first, then fit the scaler on the training rows only, so that the
# test rows do not influence the means/variances used for standardisation.
# (Same test_size / random_state as the unscaled split, hence the same rows.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Full matrix on the training scale, kept for any code that expects X_scaled.
X_scaled = scaler.transform(X)

# Candidate regularisation strengths: 50 log-spaced values from 1e-3 to 1e3.
alphas = np.logspace(-3, 3, 50)

# Coefficient path: one fitted coefficient vector per alpha.
ridge_path = []
for alpha in alphas:
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train_scaled, y_train)
    ridge_path.append(ridge_model.coef_)
ridge_path = np.array(ridge_path)  # shape: (number of alphas, number of features)

# Plot each feature's coefficient across alphas.
fig, ax = plt.subplots(figsize=(10, 6))
for feature_name, feature_path in zip(X.columns, ridge_path.T):
    ax.plot(alphas, feature_path, label=feature_name)
ax.set_xscale('log')
ax.set_xlabel('Alpha')
ax.set_ylabel('Coefficient Value')
ax.set_title('Ridge Coefficients as a Function of Alpha')
ax.legend()
ax.grid(True)
plt.show()

# Find the optimal alpha (cv=None selects RidgeCV's leave-one-out scheme).
ridge_cv = RidgeCV(alphas=alphas, cv=None)
ridge_cv.fit(X_train_scaled, y_train)
print("Optimal Ridge alpha:", ridge_cv.alpha_)

# Fit the final model with the alpha that cross-validation selected, rather
# than a hand-copied constant that goes stale when the data or grid change.
ridge_final = Ridge(alpha=ridge_cv.alpha_)
ridge_final.fit(X_train_scaled, y_train)

ridge_coef = dict(zip(X.columns, ridge_final.coef_))

for feature, coef in ridge_coef.items():
    print(f"{feature}: {coef:.3f}")

# R^2 on the test set.
ridge_r2 = ridge_final.score(X_test_scaled, y_test)
print("Ridge R^2 on test set:", ridge_r2)

In [None]:
# Candidate regularisation strengths: 50 log-spaced values from 1e-2 to 1e2.
alphas = np.logspace(-2, 2, 50)

# Coefficient path: one fitted coefficient vector per alpha.
lasso_path = []
for alpha in alphas:
    lasso_model = Lasso(alpha=alpha, max_iter=10000)
    lasso_model.fit(X_train_scaled, y_train)
    lasso_path.append(lasso_model.coef_)
lasso_path = np.array(lasso_path)  # shape: (number of alphas, number of features)

# Plot each feature's coefficient across alphas.
fig, ax = plt.subplots(figsize=(10, 6))
for feature_name, feature_path in zip(X.columns, lasso_path.T):
    ax.plot(alphas, feature_path, label=feature_name)
ax.set_xscale('log')
ax.set_xlabel('Alpha')
ax.set_ylabel('Coefficient Value')
ax.set_title('Lasso Coefficients as a Function of Alpha')
ax.legend()
ax.grid(True)
plt.show()

# Pick alpha by 5-fold cross-validation on the training rows.
lasso_cv = LassoCV(alphas=alphas, cv=5, max_iter=10000)
lasso_cv.fit(X_train_scaled, y_train)
print("Optimal Lasso alpha:", lasso_cv.alpha_)

# Fit the final model with the cross-validated alpha (not a rounded,
# hand-copied value) and the same iteration cap used for the path and the
# CV search above.
lasso_final = Lasso(alpha=lasso_cv.alpha_, max_iter=10000)
lasso_final.fit(X_train_scaled, y_train)

lasso_coef = dict(zip(X.columns, lasso_final.coef_))

for feature, coef in lasso_coef.items():
    print(f"{feature}: {coef:.3f}")

# R^2 on the test set.
lasso_r2 = lasso_final.score(X_test_scaled, y_test)
print("Lasso R^2 on test set:", lasso_r2)

Comparing the coefficients between the three models, the obvious similarity is that gender: female has a negative impact on spend for all races, gender: male has a positive impact. Most differences occur with gender: other. In the linear regression and ridge models, black and white other have a positive impact on spending, but hispanic and asian other have no impact. In the lasso model, white other has a slight negative impact on spending, and all other races of other are dropped from the model.

The linear regression and ridge models have almost the same R², 86.33%. This makes sense considering the optimal alpha for the ridge model was 0.001, which means the model benefits from minimal regularization. The lasso model has the highest R², but only slightly, at 86.36%.

I would choose the Lasso Model, as it has the highest R-Squared and the method removes any additional noise from the model and simplifies it by minimizing the impact of unnecessary coefficients.

Both the linear regression and the lasso model dropped Race: Asian and Hispanic of Gender: Other. The Lasso model also dropped Race: Black, Gender: Other. It did however include income, which was minimized in the linear regression model.