In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 120

In [2]:
import time
import numpy as np
import pandas as pd
from math import ceil
from scipy import linalg
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error as MAE, mean_squared_error as MSE
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV,train_test_split as tts
from sklearn.datasets import make_spd_matrix
from scipy.interpolate import interp1d
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from matplotlib import pyplot

In [None]:
#!pip install -q pygam

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd /content/drive/MyDrive/Colab Notebooks/DATA 441

/content/drive/MyDrive/Colab Notebooks/DATA 441


In [5]:
from nadaraya_watson import NadarayaWatson, NadarayaWatsonCV

In [6]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/telecom_churn.csv')

In [7]:
X = data.loc[:, 'AccountWeeks':].values
y = data['Churn'].values

In [None]:
def get_natural_cubic_spline_model(x, y, minval=None, maxval=None, n_knots=None, knots=None):
    """
    Get a natural cubic spline model for the data.

    For the knots, give (a) `knots` (as an array) or (b) minval, maxval and n_knots.

    If the knots are not directly specified, the resulting knots are equally
    space within the *interior* of (max, min).  That is, the endpoints are
    *not* included as knots.

    Parameters
    ----------
    x: np.array of float
        The input data
    y: np.array of float
        The outpur data
    minval: float 
        Minimum of interval containing the knots.
    maxval: float 
        Maximum of the interval containing the knots.
    n_knots: positive integer 
        The number of knots to create.
    knots: array or list of floats 
        The knots.

    Returns
    --------
    model: a model object
        The returned model will have following method:
        - predict(x):
            x is a numpy array. This will return the predicted y-values.
    """

    if knots:
        spline = NaturalCubicSpline(knots=knots)
    else:
        spline = NaturalCubicSpline(max=maxval, min=minval, n_knots=n_knots)

    p = Pipeline([
        ('nat_cubic', spline),
        #('regression', LinearRegression(fit_intercept=True))
    ])

    p.fit(x, y)

    return p


class AbstractSpline(BaseEstimator, TransformerMixin):
    """Base class for all spline basis expansions."""

    def __init__(self, max=None, min=None, n_knots=None, n_params=None, knots=None):
        if knots is None:
            if not n_knots:
                n_knots = self._compute_n_knots(n_params)
            knots = np.linspace(min, max, num=(n_knots + 2))[1:-1]
            max, min = np.max(knots), np.min(knots)
        self.knots = np.asarray(knots)

    @property
    def n_knots(self):
        return len(self.knots)

    def fit(self, *args, **kwargs):
        return self


class NaturalCubicSpline(AbstractSpline):
    """Apply a natural cubic basis expansion to an array.
    The features created with this basis expansion can be used to fit a
    piecewise cubic function under the constraint that the fitted curve is
    linear *outside* the range of the knots..  The fitted curve is continuously
    differentiable to the second order at all of the knots.
    This transformer can be created in two ways:
      - By specifying the maximum, minimum, and number of knots.
      - By specifying the cutpoints directly.  

    If the knots are not directly specified, the resulting knots are equally
    space within the *interior* of (max, min).  That is, the endpoints are
    *not* included as knots.
    Parameters
    ----------
    min: float 
        Minimum of interval containing the knots.
    max: float 
        Maximum of the interval containing the knots.
    n_knots: positive integer 
        The number of knots to create.
    knots: array or list of floats 
        The knots.
    """

    def _compute_n_knots(self, n_params):
        return n_params

    @property
    def n_params(self):
        return self.n_knots - 1

    def transform(self, X, **transform_params):
        X_spl = self._transform_array(X)
        if isinstance(X, pd.Series):
            col_names = self._make_names(X)
            X_spl = pd.DataFrame(X_spl, columns=col_names, index=X.index)
        return X_spl

    def _make_names(self, X):
        first_name = "{}_spline_linear".format(X.name)
        rest_names = ["{}_spline_{}".format(X.name, idx)
                      for idx in range(self.n_knots - 2)]
        return [first_name] + rest_names

    def _transform_array(self, X, **transform_params):
        X = X.squeeze()
        try:
            X_spl = np.zeros((X.shape[0], self.n_knots - 1))
        except IndexError: # For arrays with only one element
            X_spl = np.zeros((1, self.n_knots - 1))
        X_spl[:, 0] = X.squeeze()

        def d(knot_idx, x):
            def ppart(t): return np.maximum(0, t)

            def cube(t): return t*t*t
            numerator = (cube(ppart(x - self.knots[knot_idx]))
                         - cube(ppart(x - self.knots[self.n_knots - 1])))
            denominator = self.knots[self.n_knots - 1] - self.knots[knot_idx]
            return numerator / denominator

        for i in range(0, self.n_knots - 2):
            X_spl[:, i+1] = (d(i, X) - d(self.n_knots - 2, X)).squeeze()
        return X_spl

In [None]:
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3, random_state=1234)

In [None]:
maes = []
for i in range(X.shape[1]):
    model = get_natural_cubic_spline_model(X[:,i], y, minval=np.min(X[:,i]), maxval=np.max(X[:,i]), n_knots=40)
    yhat_test = model.predict(X_test[:,i])
    curr_mae = MAE(y_test,yhat_test)
    maes.append(curr_mae)

In [None]:
1000*MAE(y,y_gam)

157.90562148774745

In [None]:
np.unique(y, return_counts=True)

(array([0, 1]), array([2850,  483]))

In [None]:
def DoKFold_GAM(X,y,k):
  PE = []
  by_feat = []
  acc = []
  pred_list = []
  kf = KFold(n_splits=k,shuffle=True) #,random_state=1234)
  for idxtrain, idxtest in kf.split(X):
    X_train = X[idxtrain,:]
    y_train = y[idxtrain]
    X_test  = X[idxtest,:]
    y_test  = y[idxtest]
    curr_feat = []
    yhats = []
    # here, we are getting a cubic spline for each feature
    for i in range(X.shape[1]):
      model = get_natural_cubic_spline_model(X[:,i], y, minval=np.min(X[:,i]), maxval=np.max(X[:,i]), n_knots=60)
      yhat_test = model.predict(X_test[:,i])
      curr_mae = MAE(y_test,yhat_test)
      curr_feat.append(curr_mae)
      PE.append(curr_mae)
      yhats.append(yhat_test)
    ## print(np.around(yhats))
    by_feat.append(np.mean(curr_feat))
    preds = np.around(np.mean(yhats, axis=0))
    pred_list.append(preds)
    print(np.unique(y_test, return_counts=True), np.unique(preds, return_counts=True))
    acc.append(100*(1 - (np.sum(np.abs(y_test - preds))/len(y_test))))
  # we then return the average error and the average by feature 
  return np.mean(PE), by_feat, acc, pred_list

In [None]:
mean_mae,mean_by_row,accs,pred_list = DoKFold_GAM(X,y,10)
print("Average:", str(mean_mae) + ',', str(np.mean(accs)) + '%')
for i in range(1,len(data.columns)-1):
  print(data.columns[i] + ": " + str(mean_by_row[i]) + ", " + str(accs[i]) + "%")

print("\nAverage Accuracy:", str(np.mean(accs)))

(array([0, 1]), array([292,  42])) (array([0.]), array([334]))
(array([0, 1]), array([290,  44])) (array([0.]), array([334]))
(array([0, 1]), array([284,  50])) (array([0.]), array([334]))
(array([0, 1]), array([286,  47])) (array([0.]), array([333]))
(array([0, 1]), array([286,  47])) (array([0.]), array([333]))
(array([0, 1]), array([283,  50])) (array([0.]), array([333]))
(array([0, 1]), array([274,  59])) (array([0.]), array([333]))
(array([0, 1]), array([283,  50])) (array([0.]), array([333]))
(array([0, 1]), array([289,  44])) (array([0.]), array([333]))
(array([0, 1]), array([283,  50])) (array([0.]), array([333]))
Average: 0.23405661508147407, 85.50772329215444%
AccountWeeks: 0.2262275329096474, 86.82634730538922%
ContractRenewal: 0.23821145535011606, 85.02994011976048%
DataPlan: 0.23041893105339156, 85.88588588588588%
DataUsage: 0.23006234604605558, 85.88588588588588%
CustServCalls: 0.23755585561463038, 84.98498498498499%
DayMins: 0.2561128598325427, 82.28228228228228%
DayCall

In [None]:
mean_mae,mean_by_row,accs,pred_list = DoKFold_GAM(X,y,10)
print("Average:", str(mean_mae) + ',', str(np.mean(accs)) + '%')
for i in range(1,len(data.columns)-1):
  print(data.columns[i] + ": " + str(mean_by_row[i]) + ", " + str(accs[i]) + "%")

print("\nAverage Accuracy:", str(np.mean(accs)))


(array([0, 1]), array([285,  49])) (array([0.]), array([334]))
(array([0, 1]), array([285,  49])) (array([0.]), array([334]))
(array([0, 1]), array([280,  54])) (array([0.]), array([334]))
(array([0, 1]), array([290,  43])) (array([0.]), array([333]))
(array([0, 1]), array([295,  38])) (array([0.]), array([333]))
(array([0, 1]), array([276,  57])) (array([0.]), array([333]))
(array([0, 1]), array([294,  39])) (array([0.]), array([333]))
(array([0, 1]), array([291,  42])) (array([0.]), array([333]))
(array([0, 1]), array([283,  50])) (array([0.]), array([333]))
(array([0, 1]), array([271,  62])) (array([0.]), array([333]))
Average: 0.23782657362314238, 85.50916185646726%
AccountWeeks: 0.24027763841341782, 85.32934131736528%
ContractRenewal: 0.24773735663294608, 83.8323353293413%
DataPlan: 0.2280105163612481, 87.08708708708708%
DataUsage: 0.2177041301375137, 88.58858858858859%
CustServCalls: 0.25481727329745, 82.88288288288288%
DayMins: 0.22019234696360276, 88.28828828828829%
DayCalls: 0

In [14]:
from sklearn.metrics.pairwise import haversine_distances
def DoKFold_NW(X,y,k):
  PE = []
  acc = []
  tests = []
  hats = []
  kf = KFold(n_splits=k,shuffle=True,random_state=1234)
  param_grid=dict(kernel=["polynomial"],gamma=np.logspace(-1, 1, 100))
  model = NadarayaWatsonCV(param_grid,scoring='neg_mean_absolute_error')
  for idxtrain, idxtest in kf.split(X):
    X_train = X[idxtrain,:]
    y_train = y[idxtrain]
    X_test  = X[idxtest,:]
    y_test  = y[idxtest]
    model.fit(X_train,y_train)
    yhat_test = model.predict(X_test)
    PE.append(MAE(y_test,yhat_test))
    tests.append(y_test)
    hats.append(yhat_test)
    acc.append(1 - (np.sum(np.abs(y_test - yhat_test))/len(y_test)))
  return acc, tests, hats

In [15]:
acc, tests, hats = DoKFold_NW(X,y,10)

In [16]:
all = []
for i in range(10):
  curr = 1-(np.sum(np.abs(tests[i] - np.around(hats[i])))/len(hats[i]))
  all.append(curr)
  print(str(curr))
print("Average:",str(np.mean(all)))

0.8532934131736527
0.8532934131736527
0.8383233532934131
0.8708708708708709
0.8858858858858859
0.8288288288288288
0.8828828828828829
0.8738738738738738
0.8498498498498499
0.8138138138138138
Average: 0.8550916185646725


In [None]:
np.mean(acc)

0.8277944067623991

In [None]:
np.unique(tests[0],return_counts=True),np.unique(np.around(hats[0]),return_counts=True)

((array([0, 1]), array([285,  49])), (array([0., 1.]), array([295,  39])))

In [None]:
np.abs(tests[0]-np.around(hats[0]))

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 1., 0., 0.

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.