### Import Data and Load Modules

In [2]:
# Import modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn import neighbors
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import numpy as np

In [3]:
# Read data
df = pd.read_excel('HW4.xlsx', sheet_name='All Data')

# Create a new DataFrame for feature engineering
df_new = df.copy()

In [4]:
# Whenever applicable use random state 42 (10 points).
np.random.seed(42) # ensure reproducability

### Feature Engineering

In [5]:
# Perform binning, and time series feature engineering steps:

# 1. Add a new column that bins "Freq" into the categories low, medium, and high
# bins = [0, 50, 200, float('inf')]  # Define the bin edges
# labels = ['Low', 'Medium', 'High']  # Define bin labels
# df_new['Freq_category'] = pd.cut(df_new['Freq'], bins=bins, labels=labels)

# 2. Extract the month and year from the last_update_days_ago and 1st_update_days_ago features
df_new['Last_Update_Month'] = pd.to_datetime(df_new['last_update_days_ago'], unit='D').dt.month
df_new['Last_Update_Year'] = pd.to_datetime(df_new['last_update_days_ago'], unit='D').dt.year
df_new['First_Update_Month'] = pd.to_datetime(df_new['1st_update_days_ago'], unit='D').dt.month
df_new['First_Update_Year'] = pd.to_datetime(df_new['1st_update_days_ago'], unit='D').dt.year

# 3. Create a new column that captures the time between the first and last update
df_new['Update_Time_Difference'] = df_new['last_update_days_ago'] - df_new['1st_update_days_ago']


In [6]:
# Create the train-test split:

# Split the data into features (X) and target variable (y)
X = df_new.drop(columns=['Spending'])
y = df_new['Spending']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# Scale features:
from sklearn.preprocessing import StandardScaler # standardize features by removing the mean and scaling to unit variance
                                                 # the standard score of a sample x is calculated as: z = (x - u) / s
                                                 # where u is the mean of the training sample , and s is the standard deviation of the training sample.
                                                 # centering and scaling happens independently on each feature.

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)         # computes the mean and std to be used for scaling and performs scaling
X_test = scaler.transform(X_test)               # performs standardization of test set X attributes by centering and scaling

## Linear Regression

In [8]:
# Fit a Linear Regression Model
slr2 = LinearRegression()
slr2.fit(X_train, y_train)
y_train_pred = slr2.predict(X_train)
y_test_pred = slr2.predict(X_test)
print('Slope: %.3f', slr2.coef_)                       # estimated coefficients for the linear regression model

Slope: %.3f [-1.32763995e+00 -6.43710208e-01  1.92641053e+00 -1.23465415e+01
 -3.74455153e+00 -1.02536636e+01 -8.69505959e+00 -4.90005966e+00
  1.04034321e+00 -1.74843183e+01  3.58505984e+00 -5.71075075e+00
 -4.10877163e+00 -2.42582651e+00 -5.97642497e+00 -2.94043316e+00
 -3.10741172e+00  1.12184758e+02 -1.43157179e+12  1.34610059e+12
 -1.16815959e+00 -5.61240414e-01 -2.72456286e+01  4.35282135e+01
  9.72950369e+00  3.51398546e+00  2.15189681e+01  2.80718320e+02
  8.76621276e+11]


In [9]:
# Evaluate Linear Regression model
from sklearn.metrics import mean_absolute_error # mean absolute error regression loss
from sklearn.metrics import mean_squared_error  # mean squared error regression loss

# See all regression metrics here http://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics
print('MSE train: %.3f, test: %.3f' % ( # mean_absolute_error
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))  # y_test: Ground truth (correct) target values
                                                   # y_test_pred: Estimated target values

print('RMSE train: %.3f, test: %.3f' % ( #RMSE
        sqrt(mean_squared_error(y_train, y_train_pred)),
        sqrt(mean_squared_error(y_test, y_test_pred))))

print('MAE train: %.3f, test: %.3f' % ( # mean_squared_error
        mean_absolute_error(y_train, y_train_pred),
        mean_absolute_error(y_test, y_test_pred))) # y_test: Ground truth (correct) target values
                                                   # y_test_pred: Estimated target values


MSE train: 14468.773, test: 14967.014
RMSE train: 120.286, test: 122.340
MAE train: 69.924, test: 68.197


In [10]:
# Use cross-validation with 10 folds to estimate the generalization performance
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=10,          # number of re-shuffling & splitting iterations
                  test_size=0.3
                  ,random_state=42)

scores = cross_val_score(estimator=slr2,              # 10-fold cross validation
                            X=X,
                            y=y,
                            cv=cv,
                            scoring = 'neg_mean_squared_error',
                            n_jobs=1)
print('Nested MSE score:', scores.mean(), " +/- ", scores.std())
scores = cross_val_score(estimator=slr2,              # 10-fold cross validation
                            X=X,
                            y=y,
                            cv=cv,
                            scoring = 'neg_root_mean_squared_error',
                            n_jobs=1)
print('Nested RMSE score:', scores.mean(), " +/- ", scores.std())


Nested MSE score: -14625.269434917287  +/-  3023.477255848369
Nested RMSE score: -120.20176680115381  +/-  13.296792575592217


In [21]:
# Find the best subset of X variables using lasso regression:
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_score

# Initialize Lasso regression model with cross-validation
lasso_model = LassoCV(cv=10, max_iter=10000)

# Fit the Lasso model to the training data
lasso_model.fit(X_train, y_train)

# Get selected feature indices (non-zero coefficients)
selected_feature_indices = np.where(lasso_model.coef_ != 0)[0]

# Get the names of the selected features
selected_feature_names = df_new.columns[selected_feature_indices]

# Extract the subset of features based on selected indices
X_subset_train = X_train[:, selected_feature_indices]

# Perform 10-fold cross-validation and calculate the mean R-squared score
scores = np.sqrt(-cross_val_score(lasso_model, X_subset_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1))
mean_score = scores.mean()

# Print the selected features and their names
print("Selected Features (Names):", selected_feature_names)
print("Number of Selected Features:", len(selected_feature_names))
print("Cross-Validation RMSE:", mean_score)

import pandas as pd

# Assuming df_new is your DataFrame
# To find the 1st quartile (25th percentile)
first_quartile = df_new['Freq'].quantile(0.25)

# To find the median (50th percentile)
median = df_new['Freq'].median()

# To find the 3rd quartile (75th percentile)
third_quartile = df_new['Freq'].quantile(0.75)

print("1st Quartile (25th percentile):", first_quartile)
print("Median (50th percentile):", median)
print("3rd Quartile (75th percentile):", third_quartile)



Selected Features (Names): Index(['source_a', 'source_c', 'source_d', 'source_e', 'source_m', 'source_o',
       'source_h', 'source_r', 'source_s', 'source_t', 'source_u', 'source_p',
       'source_w', 'Freq', 'Address_is_res', 'Purchase', 'Spending',
       'Last_Update_Month', 'First_Update_Year'],
      dtype='object')
Number of Selected Features: 19
Cross-Validation RMSE: 119.85637623211896
1st Quartile (25th percentile): 1.0
Median (50th percentile): 1.0
3rd Quartile (75th percentile): 2.0


## k-NN Regression

In [12]:
# Fit a k-NN Regression Model, evaluate w/ error
sc = MinMaxScaler(feature_range=(0, 1))
sc.fit(X_train)
x_train_scaled = sc.transform(X_train)
x_test_scaled = sc.transform(X_test)
x_sc = sc.transform(X)

# 3NN regressor
knn_regressor = neighbors.KNeighborsRegressor(n_neighbors=3)

# Fit model 
knn_regressor.fit(x_train_scaled, y_train)
pred = knn_regressor.predict(x_test_scaled)
error = sqrt(mean_squared_error(y_test, pred))
print('RMSE value is:', error)

RMSE value is: 155.73036129863598




In [13]:
# Use cross-validation with 10 folds to estimate the generalization performance
from sklearn.model_selection import cross_val_score
scores = cross_val_score(estimator=knn_regressor,              # 10-fold cross validation
                            X=x_sc,
                            y=y,
                            cv=cv,
                            scoring = 'neg_mean_squared_error',
                            n_jobs=1)
print('Nested MSE score:', scores.mean(), " +/- ", scores.std())
scores = cross_val_score(estimator=knn_regressor,              # 10-fold cross validation
                            X=x_sc,
                            y=y,
                            cv=cv,
                            scoring = 'neg_root_mean_squared_error',
                            n_jobs=1)
print('Nested RMSE score:', scores.mean(), " +/- ", scores.std())

Nested MSE score: -31159.730483492596  +/-  4037.365045001273
Nested RMSE score: -176.14055704667754  +/-  11.58596766691239


## Regressor Tree

In [14]:
# Fit a Regressor Tree model
tree = DecisionTreeRegressor(criterion='squared_error', max_depth=3, random_state=42)
tree.fit(X_train, y_train)

In [15]:
# Evaluate Regressor Tree model
scores = cross_val_score(tree,                                              # cross-validation scores
                         X,
                         y,
                         scoring = 'neg_root_mean_squared_error',
                         cv=cv)                                             # when an integer is specified StratifiedKFold is used
                                                                            # In StratifiedKFold, the folds are made by preserving the
                                                                            # percentage of samples for each class.

print("RMSE score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) # estimate mean and 2*std from cross validation metrics

RMSE score: -121.15 (+/- 34.65)
