# Multivariate Adaptive Regression Splines (MARS)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# yfinance is used to fetch data 
import yfinance as yf
yf.pdr_override()

In [2]:
# input
symbol = 'AMD'
start = '2014-01-01'
end = '2018-08-27'

# Read data 
dataset = yf.download(symbol,start,end)

# View Columns
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-02,3.85,3.98,3.84,3.95,3.95,20548400
2014-01-03,3.98,4.0,3.88,4.0,4.0,22887200
2014-01-06,4.01,4.18,3.99,4.13,4.13,42398300
2014-01-07,4.19,4.25,4.11,4.18,4.18,42932100
2014-01-08,4.23,4.26,4.14,4.18,4.18,30678700


In [3]:
dataset.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-08-20,19.790001,20.08,19.35,19.98,19.98,62983200
2018-08-21,19.98,20.42,19.860001,20.4,20.4,55629000
2018-08-22,20.280001,20.92,20.209999,20.9,20.9,62002700
2018-08-23,21.190001,22.32,21.139999,22.290001,22.290001,113444100
2018-08-24,22.91,24.0,22.67,23.98,23.98,164328200


In [4]:
X = dataset.iloc[ : , 1].values
y = dataset.iloc[ : , 4].values

In [5]:
print(X.shape)
print(y.shape)

(1171,)
(1171,)


In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.datasets import make_regression
from pyearth import Earth

In [7]:
X, y = make_regression(n_samples=5000, n_features=15, n_informative=10,
                       noise=0.5, random_state=5)

In [8]:
# Define the model
model = Earth()

In [9]:
# Specify cross-validation method to use to evaluate model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate model performance
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

# Print results
np.mean(scores)

-1.7453459182899151

In [10]:
model.fit(X,y)
model.score(X, y)

0.9998139174188514

In [11]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (3750, 15) (3750,)
Test set: (1250, 15) (1250,)


In [12]:
#evaluate model performance
scores = cross_val_score(model, X_test, y_test, scoring='neg_mean_absolute_error',
                         cv=cv, n_jobs=-1)

#print results
np.mean(scores)

-1.7459352924277576

In [13]:
model.score(X_test, y_test)

0.9998138350524507