In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.validation import check_is_fitted
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/MyDrive/Colab Notebooks/DATA 441

/content/drive/MyDrive/Colab Notebooks/DATA 441


In [4]:
from nadaraya_watson import NadarayaWatson, NadarayaWatsonCV

In [5]:
scaler = StandardScaler()

In [6]:
cd ..

/content/drive/MyDrive/Colab Notebooks


In [7]:
data = pd.read_csv('data/telecom_churn.csv')

In [8]:
X = data.loc[:, 'AccountWeeks':].values
y = data['Churn'].values

In [11]:
polynomial_features= PolynomialFeatures(degree=3)
polynomial_features.fit(X)
x_poly = polynomial_features.transform(X)

## Feature Selection
### Lasso

In [14]:
sel_ = SelectFromModel(Lasso(alpha=0.01))
sel_.fit(scaler.fit_transform(x_poly), y)

In [15]:
x_poly_new = sel_.transform(x_poly)

In [16]:
x_poly_new.shape

(3333, 18)

In [17]:
features = sel_.get_feature_names_out()

In [18]:
features = [int(feature[1:]) for feature in features]

In [19]:
polynomial_features.get_feature_names_out()[features]

array(['x1', 'x5', 'x9', 'x1^2', 'x1 x5', 'x4^2', 'x0 x9^2', 'x1^3',
       'x1^2 x5', 'x1 x9^2', 'x2 x5^2', 'x3^3', 'x4^2 x6', 'x4 x5^2',
       'x5^3', 'x5^2 x8', 'x6 x8 x9', 'x9^3'], dtype=object)

# GAM Definitions

In [38]:
def get_natural_cubic_spline_model(x, y, minval=None, maxval=None, n_knots=None, knots=None):
    """
    Get a natural cubic spline model for the data.

    For the knots, give (a) `knots` (as an array) or (b) minval, maxval and n_knots.

    If the knots are not directly specified, the resulting knots are equally
    space within the *interior* of (max, min).  That is, the endpoints are
    *not* included as knots.

    Parameters
    ----------
    x: np.array of float
        The input data
    y: np.array of float
        The outpur data
    minval: float 
        Minimum of interval containing the knots.
    maxval: float 
        Maximum of the interval containing the knots.
    n_knots: positive integer 
        The number of knots to create.
    knots: array or list of floats 
        The knots.

    Returns
    --------
    model: a model object
        The returned model will have following method:
        - predict(x):
            x is a numpy array. This will return the predicted y-values.
    """

    if knots:
        spline = NaturalCubicSpline(knots=knots)
    else:
        spline = NaturalCubicSpline(max=maxval, min=minval, n_knots=n_knots)

    p = Pipeline([
        ('nat_cubic', spline),
        #('regression', LinearRegression(fit_intercept=True))
    ])

    p.fit(x, y)

    return p


class AbstractSpline(BaseEstimator, TransformerMixin):
    """Base class for all spline basis expansions."""

    def __init__(self, max=None, min=None, n_knots=None, n_params=None, knots=None):
        if knots is None:
            if not n_knots:
                n_knots = self._compute_n_knots(n_params)
            knots = np.linspace(min, max, num=(n_knots + 2))[1:-1]
            max, min = np.max(knots), np.min(knots)
        self.knots = np.asarray(knots)

    @property
    def n_knots(self):
        return len(self.knots)

    def fit(self, *args, **kwargs):
        return self


class NaturalCubicSpline(AbstractSpline):
    """Apply a natural cubic basis expansion to an array.
    The features created with this basis expansion can be used to fit a
    piecewise cubic function under the constraint that the fitted curve is
    linear *outside* the range of the knots..  The fitted curve is continuously
    differentiable to the second order at all of the knots.
    This transformer can be created in two ways:
      - By specifying the maximum, minimum, and number of knots.
      - By specifying the cutpoints directly.  

    If the knots are not directly specified, the resulting knots are equally
    space within the *interior* of (max, min).  That is, the endpoints are
    *not* included as knots.
    Parameters
    ----------
    min: float 
        Minimum of interval containing the knots.
    max: float 
        Maximum of the interval containing the knots.
    n_knots: positive integer 
        The number of knots to create.
    knots: array or list of floats 
        The knots.
    """

    def _compute_n_knots(self, n_params):
        return n_params

    @property
    def n_params(self):
        return self.n_knots - 1

    def transform(self, X, **transform_params):
        X_spl = self._transform_array(X)
        if isinstance(X, pd.Series):
            col_names = self._make_names(X)
            X_spl = pd.DataFrame(X_spl, columns=col_names, index=X.index)
        return X_spl

    def _make_names(self, X):
        first_name = "{}_spline_linear".format(X.name)
        rest_names = ["{}_spline_{}".format(X.name, idx)
                      for idx in range(self.n_knots - 2)]
        return [first_name] + rest_names

    def _transform_array(self, X, **transform_params):
        X = X.squeeze()
        try:
            X_spl = np.zeros((X.shape[0], self.n_knots - 1))
        except IndexError: # For arrays with only one element
            X_spl = np.zeros((1, self.n_knots - 1))
        X_spl[:, 0] = X.squeeze()

        def d(knot_idx, x):
            def ppart(t): return np.maximum(0, t)

            def cube(t): return t*t*t
            numerator = (cube(ppart(x - self.knots[knot_idx]))
                         - cube(ppart(x - self.knots[self.n_knots - 1])))
            denominator = self.knots[self.n_knots - 1] - self.knots[knot_idx]
            return numerator / denominator

        for i in range(0, self.n_knots - 2):
            X_spl[:, i+1] = (d(i, X) - d(self.n_knots - 2, X)).squeeze()
        return X_spl

In [39]:
class GAM1:
    def __init__(self, n_knots=40):
        self.n_knots = 40
    
    def fit(self, x, y):
        self.xtrain_ = x
        self.yhat_ = y

    def predict(self, x_new, y_new):
        check_is_fitted(self)
        x = self.xtrain_
        y = self.yhat_
        n_knots = self.n_knots
        residuals = []
        for i in range(x.shape[1]):
          model = get_natural_cubic_spline_model(x[:,i], y, minval=np.min(x[:,i]), maxval=np.max(x[:,i]), n_knots=n_knots)
          yhat_test = model.predict(x_new[:,i])
          residuals.append(y_new-yhat_test)
        #return np.sum(residuals, axis=0)
        return np.mean(residuals, axis=0)

    def get_params(self, deep=True):
        return {"n_knots": self.n_knots}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [40]:
class GAM2:
    def __init__(self, n_knots=40):
        self.n_knots = 40
    
    def fit(self, x, y):
        self.xtrain_ = x
        self.yhat_ = y

    def predict(self, x_new, y_new):
        check_is_fitted(self)
        x = self.xtrain_
        y = self.yhat_
        n_knots = self.n_knots
        residuals = []
        for i in range(x.shape[1]):
          model = get_natural_cubic_spline_model(x[:,i], y, minval=np.min(x[:,i]), maxval=np.max(x[:,i]), n_knots=n_knots)
          yhat_test = model.predict(x_new[:,i])
          residuals.append(y_new-yhat_test)
        #return np.sum(residuals, axis=0)
        return np.mean(residuals, axis=0)

    def get_params(self, deep=True):
        return {"n_knots": self.n_knots}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

# Other Definitions

In [27]:
def boosted(x, y, xnew, model1, model2, gam_first = False, gam_second = False, ynew = None):
  model1.fit(x,y)
  if gam_first:
    residuals1 = y - model1.predict(x,y)
  else:
    residuals1 = y - model1.predict(x)
  model2.fit(x,np.around(residuals1))
  if gam_first and gam_second:
    output = model1.predict(xnew,ynew) + model2.predict(xnew, ynew)
  elif gam_second:
    output = model1.predict(xnew) + model2.predict(xnew, ynew)
  elif gam_first:
    output = model1.predict(xnew, ynew) + model2.predict(xnew)
  else:
    output = model1.predict(xnew) + model2.predict(xnew)
  return output 

In [28]:
forest1 = RandomForestClassifier(max_depth=7, min_samples_leaf=2, min_samples_split=2, n_estimators=95)
gam1 = GAM1(40)
gam2 = GAM2(40)
nw = NadarayaWatsonCV(dict(kernel=["polynomial"],gamma=np.logspace(-1, 1, 100)),scoring='neg_mean_absolute_error')

In [None]:
# x_train, x_test, y_train, y_test = tts(X, y, test_size = 0.25, stratify=y)

# Testing

In [41]:
mses = []
acc = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)

for idxtrain, idxtest in kf.split(x_poly_new):
  xtrain = x_poly_new[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = x_poly_new[idxtest]
  xtrain = scaler.fit_transform(xtrain)
  xtest = scaler.transform(xtest)
  
  boost = np.around(boosted(xtrain, ytrain, xtest, gam1, forest1, True, False, ytest))

  mses.append(mse(ytest,boost))
  acc.append(accuracy_score(ytest, np.around(boost)))

print("The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : "+str(np.mean(mses)))
print("Average accuracy:" + str(np.mean(acc)) + '%')
for i in acc:
  print(str(i) + '%')

AttributeError: ignored

In [30]:
mses = []
acc = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)

for idxtrain, idxtest in kf.split(X):
  xtrain = X[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = X[idxtest]
  xtrain = scaler.fit_transform(xtrain)
  xtest = scaler.transform(xtest)

  boost = np.around(boosted(xtrain, ytrain, xtest, gam1, nw, True, False, ytest))

  mses.append(mse(ytest,boost))
  acc.append(accuracy_score(ytest, np.around(boost)))

print("The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : "+str(np.mean(mses)))
print("Average accuracy:" + str(np.mean(acc)) + '%')
for i in acc:
  print(str(i) + '%')

AttributeError: ignored

In [31]:
mses = []
acc = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)

for idxtrain, idxtest in kf.split(X):
  xtrain = X[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = X[idxtest]
  xtrain = scaler.fit_transform(xtrain)
  xtest = scaler.transform(xtest)

  boost = np.around(boosted(xtrain, ytrain, xtest, nw, gam2, False, True, ytest))

  mses.append(mse(ytest,boost))
  acc.append(accuracy_score(ytest, np.around(boost)))

print("The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : "+str(np.mean(mses)))
print("Average accuracy:" + str(np.mean(acc)) + '%')
for i in acc:
  print(str(i) + '%')

KeyboardInterrupt: ignored