### Imports

In [147]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression

### Read data

In [148]:
# Load the training rows and the rows to predict for, in that order.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

### Check null values

In [149]:
# Per-column count of missing values in the training frame.
df_train.isna().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

### Fill in columns with null values

In [150]:
# Fill NAs: a missing secondary/tertiary category is marked with the sentinel -1.
# The result is assigned back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form emits a FutureWarning on pandas 2.x and
# leaves df_train unchanged when Copy-on-Write is enabled.
train_na_columns = ['Product_Category_2', 'Product_Category_3']
df_train[train_na_columns] = df_train[train_na_columns].fillna(-1)

In [151]:
# Apply the same -1 sentinel to the test frame. Assigning the result back avoids
# chained fillna(inplace=True) on a column selection, which warns on pandas 2.x
# and leaves df_test unchanged when Copy-on-Write is enabled.
test_na_columns = ['Product_Category_2', 'Product_Category_3']
df_test[test_na_columns] = df_test[test_na_columns].fillna(-1)

### Perform label encoding on columns with ordinal categories or only 2 unique values

In [152]:
# Label-encode the ordinal / binary columns of the training frame.
# NOTE: LabelEncoder assigns codes by the *sorted* order of the labels it is
# fitted on, not by the order of the list passed to fit(). The lists below are
# already in sorted order, so the resulting codes follow the intended ordering.
label_encoder = LabelEncoder()
custom_age_order = ['0-17','18-25','26-35','36-45', '46-50', '51-55', '55+']
custom_stay_order = ['0','1','2','3', '4+']

# Columns whose full label set is known up front: fit on that set, then transform.
for column, known_labels in (
    ('Age', custom_age_order),
    ('Stay_In_Current_City_Years', custom_stay_order),
):
    label_encoder.fit(known_labels)
    df_train[f'{column}_encoded'] = label_encoder.transform(df_train[column])

# Columns whose labels are learned from the training data itself.
for column in ('Gender', 'Marital_Status'):
    df_train[f'{column}_encoded'] = label_encoder.fit_transform(df_train[column])

In [153]:
# Label-encode the test frame with the same label-to-code mapping as training.
# NOTE: LabelEncoder assigns codes by the *sorted* order of the labels it is
# fitted on; the lists below are already sorted, so codes follow that order.
label_encoder = LabelEncoder()
custom_age_order = ['0-17','18-25','26-35','36-45', '46-50', '51-55', '55+']
label_encoder.fit(custom_age_order)
df_test['Age_encoded'] = label_encoder.transform(df_test['Age'])

custom_stay_order = ['0','1','2','3', '4+']
label_encoder.fit(custom_stay_order)
df_test['Stay_In_Current_City_Years_encoded'] = label_encoder.transform(df_test['Stay_In_Current_City_Years'])

# Fit on the *training* column and only transform the test column. Calling
# fit_transform on the test data would learn a separate mapping, which can
# silently assign different codes than training if the test labels differ.
# A label never seen in training now raises ValueError instead.
for column in ('Gender', 'Marital_Status'):
    label_encoder.fit(df_train[column])
    df_test[f'{column}_encoded'] = label_encoder.transform(df_test[column])

### Perform one hot encoding for nominal categories

In [154]:
# One-hot encode the nominal columns of the training frame; get_dummies swaps
# each listed column for its indicator columns.
train_nominal_columns = [
    'Occupation',
    'City_Category',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3',
]
df_train = pd.get_dummies(data=df_train, columns=train_nominal_columns)

In [155]:
# One-hot encode the same nominal columns in the test frame; get_dummies swaps
# each listed column for its indicator columns.
test_nominal_columns = [
    'Occupation',
    'City_Category',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3',
]
df_test = pd.get_dummies(data=df_test, columns=test_nominal_columns)

### Prepare data for training by dropping columns that have been encoded

In [156]:
# Columns excluded from the feature matrix, grouped by reason.
excluded_from_features = [
    # The prediction target.
    'Purchase',
    # Identifiers: not encoded, and they should never be used for predictions.
    'User_ID', 'Product_ID',
    # Raw columns that already have a label-encoded counterpart.
    'Gender', 'Age', 'Stay_In_Current_City_Years', 'Marital_Status',
    # Dummy columns that are not present in the test.csv file.
    'Product_Category_1_19', 'Product_Category_1_20',
]
y = df_train['Purchase']
X = df_train.drop(columns=excluded_from_features)

In [157]:
# Build the feature matrix for the final predictions.
# User_ID and Product_ID are dropped even though they haven't been encoded,
# because identifiers should never be used for predictions; the other dropped
# columns are the raw versions of the label-encoded features.
#
# Train and test were one-hot encoded independently, so their dummy columns are
# not guaranteed to match. Reindexing against X.columns gives the test matrix
# exactly the training columns in the training order: dummies missing from the
# test data are added as all-zero columns, and test-only columns are discarded.
X_final_prediction = (
    df_test
    .drop(columns=['User_ID', 'Product_ID', 'Gender', 'Age', 'Stay_In_Current_City_Years', 'Marital_Status'])
    .reindex(columns=X.columns, fill_value=0)
)

In [158]:
# Cross-validation setup: k shuffled folds with a fixed seed.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Estimator under evaluation.
model = LinearRegression()

# One R-squared value per held-out fold.
scores = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=kf)

# Report every fold, numbering folds from 1.
for i, score in enumerate(scores):
    print(f"Fold {i + 1} R-squared: {score:.4f}")

# Summarise with the average across folds.
mean_r2 = scores.mean()
print(f"Mean R-squared: {mean_r2:.4f}")

Fold 1 R-squared: 0.6419
Fold 2 R-squared: 0.6469
Fold 3 R-squared: 0.6498
Fold 4 R-squared: 0.6468
Fold 5 R-squared: 0.6529
Mean R-squared: 0.6476


In [159]:
# Multivariable linear regression, fitted on 80% of the rows.
model = LinearRegression()

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

model.fit(X_train, y_train)

In [160]:
# Predict the hold-out split and score those predictions.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as Purchase

# Emit the three metrics, one per line.
print(
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"R-squared (R^2) Value: {r2}",
    sep="\n",
)

Mean Squared Error (MSE): 8997621.561582703
Root Mean Squared Error (RMSE): 2999.603567403983
R-squared (R^2) Value: 0.6419025263169345


In [161]:
# Predict purchases for X_final_prediction, the feature matrix built from df_test (test.csv).
# NOTE: this rebinds y_pred, so the hold-out predictions from the evaluation cell are no longer available.
y_pred = model.predict(X_final_prediction)