In [1]:
# Importing libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import cross_val_score,train_test_split, KFold, cross_val_predict
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score, recall_score
from sklearn.utils import parallel_backend
from pyearth import Earth
from patsy import dmatrix
from sklearn.model_selection import GridSearchCV
import statsmodels.formula.api as smf
from sklearn.preprocessing import PolynomialFeatures
import warnings
from pyearth import Earth

In [2]:
# Reading the pre-split feature matrices and target frames from the data/ folder.
# Each CSV is parsed with pandas defaults, so the targets are DataFrames, not Series.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/X_train.csv',
        'data/X_test.csv',
        'data/y_train.csv',
        'data/y_test.csv',
    )
)

In [3]:
# Silence library warnings so the grid-search output stays readable.
# NOTE: these filters are process-wide and remain active for every later cell.
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category = FutureWarning)

# Search space and cross-validation setup for the MARS hyper-parameter search
degree_grid = range(1, 10)
max_terms_grid = [500, 750, 1000, 1250, 1500]
n_folds = 5

# Initializing optimal parameters & best score variables
# (scores are negated MSE, so larger is better and -inf is a safe starting point)
opt_degree = 1
opt_max_terms = 500
best_score = -float('inf')

# Exhaustive search: every (degree, max_terms) pair is scored by k-fold CV
for degree in degree_grid:
    for max_terms in max_terms_grid:
        # A fresh, unfitted estimator per combination, so that no attribute
        # mutation carries over from one grid point to the next
        candidate = Earth(max_terms = max_terms, max_degree = degree)

        # k-fold cross validation on the training data only
        scores = cross_val_score(candidate, X_train, y_train, cv = n_folds, scoring='neg_mean_squared_error')

        # Computing mean score across folds
        mean_score = scores.mean()

        # Strict '>' keeps the first-seen combination on ties
        if mean_score > best_score:
            best_score = mean_score
            opt_degree = degree
            opt_max_terms = max_terms

# Training a new MARS model on the entire training set using the optimal parameters
model = Earth(max_terms = opt_max_terms, max_degree = opt_degree)
model.fit(X_train, y_train)

# Evaluating the model on the test set
rmse_optimal = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
print("Optimal degree:", opt_degree)
print("Optimal max_terms:", opt_max_terms)
# Report the scores as well: the CV score is negated MSE, so flip the sign before the root
print("CV RMSE at optimum:", np.sqrt(-best_score))
print("Test RMSE at optimum:", rmse_optimal)


In [None]:
# Fit a standalone MARS model with fixed, hand-picked hyper-parameters.
# NOTE(review): max_terms = 22 / max_degree = 100 lie outside the grid searched in the
# tuning cell (degree 1-9, max_terms 500-1500) — confirm these values are intentional.
mars_settings = dict(max_terms = 22, max_degree = 100)
mars_model = Earth(**mars_settings)
mars_model.fit(X_train, y_train)

In [None]:
# Print whatever textual report the fitted model's summary() produces
mars_summary = mars_model.summary()
print(mars_summary)

In [None]:
# Root-mean-squared error of the fixed-parameter MARS model on the training split
train_predictions = mars_model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
print("Train RMSE:", train_rmse)

# Same metric on the held-out test split
test_predictions = mars_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
print("Test RMSE:", test_rmse)