### Standard Imports

In [2]:
%matplotlib inline

# disable warnings
import warnings
warnings.filterwarnings("ignore")

# data wrangling
import pandas as pd
import numpy as np
import itertools
import math
from random import randint
from datetime import datetime
from scipy import stats

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

# default pandas decimal number display format
pd.options.display.float_format = '{:20,.2f}'.format

### Linear Regression

In [3]:
# data exploration
import scipy.stats as stats

# data modeling
import statsmodels.api as sm
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error

In [4]:
def train_test_validation(X_train, X_test, y_train, y_test):
    """Print sanity checks for a train/test split.

    Reports whether features and targets have matching row counts in each
    partition, whether the train and test features have the same number of
    columns, and the fraction of rows that landed in each partition.

    Only the four arguments are inspected; nothing is read from notebook
    globals, so the function works on a fresh kernel.

    Parameters
    ----------
    X_train, X_test : objects exposing ``.shape``
        Feature sets for the train and test partitions.
    y_train, y_test : objects exposing ``.shape``
        Targets for the train and test partitions.

    Returns
    -------
    None
        All results are printed.
    """
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]

    if n_train == y_train.shape[0]:
        print("X & y train rows ARE equal")
    else:
        print("X & y train rows ARE NOT equal")

    if n_test == y_test.shape[0]:
        print("X & y test rows ARE equal")
    else:
        print("X & y test rows ARE NOT equal")

    # Comparing everything after the row dimension also works for 1-D
    # feature inputs, where shape[1] would raise IndexError.
    if X_train.shape[1:] == X_test.shape[1:]:
        print("Number of columns in train & test ARE equal")
    else:
        print("Number of columns in train & test ARE NOT equal")

    total_rows = n_train + n_test
    if total_rows == 0:
        # Avoid ZeroDivisionError when both partitions are empty.
        print("Train/Test split is undefined: both sets are empty")
        return

    print("Train Split: %.2f" % (n_train / total_rows))
    print("Test Split: %.2f" % (n_test / total_rows))

In [5]:
# ridge regression
from sklearn import linear_model

### Classification

In [6]:
# encoding and scaling
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# train, test, split
from sklearn.model_selection import train_test_split

# data exploration
# scatter_matrix is exposed from pandas.plotting; the old pandas.tools.plotting
# module is no longer available in current pandas releases.
from pandas.plotting import scatter_matrix
from matplotlib import cm
from matplotlib.ticker import FormatStrFormatter

#### Logistic Regression

In [7]:
# logistic regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

#### Decision Tree

In [9]:
# decision tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import tree

# data modeling
import graphviz
from graphviz import Graph

# usage example:
# clf = tree.DecisionTreeClassifier()
# clf = clf.fit(DF.data, iris.target)

# dot_data = tree.export_graphviz(clf, out_file=None) 
# graph = graphviz.Source(dot_data) 
# graph.render('DF_tree', view=True)

#### Random Forest

In [10]:
# random forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

#### K-Nearest Neighbor

In [11]:
# k-nearest neighbor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# data modeling example:
# k_range = range(1, 20)
# scores = []
# for k in k_range:
#     knn = KNeighborsClassifier(n_neighbors = k)
#     knn.fit(X_train, y_train)
#     scores.append(knn.score(X_test, y_test))
# plt.figure()
# plt.xlabel('k')
# plt.ylabel('accuracy')
# plt.scatter(k_range, scores)
# plt.xticks([0,5,10,15,20])

#### Support Vector Machine

In [12]:
# svm
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

#### Voting Classifier

In [14]:
# VotingClassifier
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# usage example:
# Training classifiers
# clf1 = DecisionTreeClassifier(max_depth=4)
# clf2 = KNeighborsClassifier(n_neighbors=7)
# clf3 = SVC(gamma=.1, kernel='rbf', probability=True)
# eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),
#                                     ('svc', clf3)],
#                         voting='soft', weights=[2, 1, 2])

# clf1.fit(X, y)
# clf2.fit(X, y)
# clf3.fit(X, y)
# eclf.fit(X, y)

# print('Accuracy of Decision Tree classifier on training set: {:.2f}'
#      .format(clf1.score(X, y)))

# print('Accuracy of KNN classifier on training set: {:.2f}'
#      .format(clf2.score(X, y)))

# print('Accuracy of SVC classifier on training set: {:.2f}'
#      .format(clf3.score(X, y)))

# print('Accuracy of a Voting Classifier using clf1, clf2, and clf3 on training set: {:.2f}'
#      .format(eclf.score(X, y)))

# plot decision boundaries
# f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8))

# for idx, clf, tt in zip(itertools.product([0, 1], [0, 1]),
#                         [clf1, clf2, clf3, eclf],
#                         ['Decision Tree (depth=4)', 'KNN (k=7)',
#                          'Kernel SVM', 'Soft Voting']):

#     Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
#     Z = Z.reshape(xx.shape)

#     axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)
#     axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y,
#                                   s=20, edgecolor='k')
#     axarr[idx[0], idx[1]].set_title(tt)

# plt.show()

### Clustering

In [15]:
# clustering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [16]:
# filter columns and missing values
def remove_columns(df, cols_to_remove):
    """Return a copy of ``df`` without the columns listed in ``cols_to_remove``.

    The input frame is left untouched; ``DataFrame.drop`` produces a new frame.
    """
    trimmed = df.drop(labels=cols_to_remove, axis="columns")
    return trimmed

def handle_missing_values(df, prop_required_column = .5, prop_required_row = .75):
    """Drop sparse columns, then sparse rows, from ``df`` in place.

    A column is kept only if it holds at least
    ``round(prop_required_column * number_of_rows)`` non-null values. After
    that, a row is kept only if it holds at least
    ``round(prop_required_row * number_of_remaining_columns)`` non-null values.

    ``df`` itself is modified (``inplace=True``) and the same object is
    returned.
    """
    # Column pass: threshold is based on the row count before any rows go.
    min_values_per_column = int(round(prop_required_column * df.shape[0], 0))
    df.dropna(axis="columns", thresh=min_values_per_column, inplace=True)

    # Row pass: threshold is based on the columns that survived the first pass.
    min_values_per_row = int(round(prop_required_row * df.shape[1], 0))
    df.dropna(axis="index", thresh=min_values_per_row, inplace=True)

    return df

def data_prep(df, cols_to_remove=[], prop_required_column=.5, prop_required_row=.75):
    """Run the column-removal step followed by the missing-value filter.

    ``cols_to_remove`` is passed to ``remove_columns``; the two proportions are
    passed to ``handle_missing_values``. The default list is only read, never
    modified.
    """
    return handle_missing_values(
        remove_columns(df, cols_to_remove),
        prop_required_column,
        prop_required_row,
    )

# detect and remove outliers
def get_upper_outliers(s, k):
    '''
    Measure how far each value of a series lies above its upper IQR fence.

    The fence is ``Q3 + k * (Q3 - Q1)``. Each element of the result is 0 when
    the observation does not exceed the fence, otherwise the distance between
    the observation and the fence.
    '''
    first_quartile = s.quantile(.25)
    third_quartile = s.quantile(.75)
    fence = third_quartile + k * (third_quartile - first_quartile)

    def excess_over_fence(value):
        return max(value - fence, 0)

    return s.apply(excess_over_fence)

def add_upper_outlier_columns(df, k):
    '''
    Attach a ``<column>_outliers`` column for every numeric column of ``df``.

    Each new column holds the result of ``get_upper_outliers`` for the matching
    source column. The columns are written onto ``df`` itself, and that same
    frame is returned.
    '''
    # Snapshot the numeric labels first so the columns added below are not
    # themselves visited.
    numeric_labels = list(df.select_dtypes('number').columns)

    for label in numeric_labels:
        df[label + '_outliers'] = get_upper_outliers(df[label], k)

    return df

# view outliers
# outlier_cols = [col for col in df if col.endswith('_outliers')]
# for col in outlier_cols:
#     print('~~~\n' + col)
#     data = df[col][df[col] > 0]
#     print(data.describe())

#### K-Means

In [17]:
# Visualizing
from matplotlib import cm

# Modeling
from scipy.spatial.distance import cdist
from scipy.cluster.vq import kmeans2, whiten
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from mpl_toolkits.mplot3d import Axes3D

In [18]:
# example 1 of visualization of clusters
# for i, (title, kmeans) in enumerate(estimators):
#     # fit the model
#     kmeans.fit(X)

#     labels = kmeans.labels_

#     # setup the 3d plot
#     fignum = i + 1
#     fig = plt.figure(fignum, figsize=(4, 3))
#     ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)

#     # plot the points
#     ax.scatter(X.petal_width, X.sepal_length, X.petal_length,
#                c=labels.astype(np.float), edgecolor='k')

#     ax.w_xaxis.set_ticklabels([])
#     ax.w_yaxis.set_ticklabels([])
#     ax.w_zaxis.set_ticklabels([])

#     ax.set_xlabel('Petal Width')
#     ax.set_ylabel('Sepal Length')
#     ax.set_zlabel('Petal Length')

#     ax.set_title(title)
#     ax.dist = 12

# example 2 of visualization of clusters
# fig = plt.figure(fignum, figsize=(4, 3))
# ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)

# # add species names
# for species in iris.species_name.unique():
#     df = iris[iris.species_name == species]
#     x = df.petal_width.mean()
#     y = df.sepal_length.mean()
#     z = df.petal_length.mean() + 2

#     ax.text3D(x, y, z, species,
#               horizontalalignment='center',
#               bbox=dict(alpha=.2, edgecolor='w', facecolor='w'))

# # add the actual data points
# ax.scatter(X.petal_width, X.sepal_length, X.petal_length, c=iris.species_id, edgecolor='k')

# ax.w_xaxis.set_ticklabels([])
# ax.w_yaxis.set_ticklabels([])
# ax.w_zaxis.set_ticklabels([])

# ax.set_xlabel('Petal width')
# ax.set_ylabel('Sepal length')
# ax.set_zlabel('Petal length')
# ax.set_title('Actual Species Clusters')
# ax.dist = 12

# fig

#### DBSCAN

In [19]:
# dbscan
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn import metrics
# make_blobs is exported from sklearn.datasets; the private
# sklearn.datasets.samples_generator module was removed from scikit-learn.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from scipy.cluster.vq import kmeans2, whiten

#### Hierarchical

In [21]:
# import hierarchical clustering libraries
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import fcluster

In [22]:
# fancy dendrogram
# Annotating the distances inside the dendrogram
# def fancy_dendrogram(*args, **kwargs):
#     max_d = kwargs.pop('max_d', None)
#     if max_d and 'color_threshold' not in kwargs:
#         kwargs['color_threshold'] = max_d
#     annotate_above = kwargs.pop('annotate_above', 0)

#     ddata = dendrogram(*args, **kwargs)

#     if not kwargs.get('no_plot', False):
#         plt.title('Hierarchical Clustering Dendrogram (truncated)')
#         plt.xlabel('sample index or (cluster size)')
#         plt.ylabel('distance')
#         for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
#             x = 0.5 * sum(i[1:3])
#             y = d[1]
#             if y > annotate_above:
#                 plt.plot(x, y, 'o', c=c)
#                 plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
#                              textcoords='offset points',
#                              va='top', ha='center')
#         if max_d:
#             plt.axhline(y=max_d, c='k')
#     return ddata

# fancy_dendrogram(
#     Z,
#     truncate_mode='lastp',
#     p=12,
#     leaf_rotation=90.,
#     leaf_font_size=12.,
#     show_contracted=True,
#     annotate_above=10,  # useful in small plots so annotations don't overlap
# )
# plt.show()

### Time Series Analysis

#### Parametric Based

In [23]:
from datetime import timedelta
import statsmodels.api as sm
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
import statsmodels.api as sm
from statsmodels.tsa.ar_model import AR
from sklearn.model_selection import TimeSeriesSplit
from sklearn import metrics

In [24]:
# helpful plot and evaluate functions
# def plot_data_and_predictions(predictions, label):
#     plt.figure(figsize=(10, 8))

#     plt.plot(train,label='Train')
#     plt.plot(test, label='Test')
#     plt.plot(predictions, label=label, linewidth=5)

#     plt.legend(loc='best')
#     plt.show()

# def evaluate(actual, predictions, output=True):
#     mse = metrics.mean_squared_error(actual, predictions)
#     rmse = math.sqrt(mse)

#     if output:
#         print('MSE:  {}'.format(mse))
#         print('RMSE: {}'.format(rmse))
#     else:
#         return mse, rmse    

# def plot_and_eval(predictions, actual, metric_fmt='{:.2f}', linewidth=4):
#     if type(predictions) is not list:
#         predictions = [predictions]

#     plt.figure(figsize=(16, 8))
#     plt.plot(train,label='Train')
#     plt.plot(test, label='Test')

#     for yhat in predictions:
#         mse, rmse = evaluate(actual, yhat, output=False)        
#         label = f'{yhat.name}'
#         if len(predictions) > 1:
#             label = f'{label} -- MSE: {metric_fmt} RMSE: {metric_fmt}'.format(mse, rmse)
#         plt.plot(yhat, label=label, linewidth=linewidth)

#     if len(predictions) == 1:
#         label = f'{label} -- MSE: {metric_fmt} RMSE: {metric_fmt}'.format(mse, rmse)
#         plt.title(label)

#     plt.legend(loc='best')
#     plt.show()    

#### Prophet

In [25]:
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation, performance_metrics

#### SVR

In [26]:
import statsmodels.api as sm
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit

#### SARIMAX

In [27]:
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
import statsmodels.api as sm
from statsmodels.tsa.ar_model import AR
from sklearn import metrics

### Anomaly Detection

In [28]:
import matplotlib.dates as mdates
from sklearn import metrics
from random import randint
from matplotlib import style
from numpy import linspace, loadtxt, ones, convolve
from sklearn.ensemble import IsolationForest
import collections

In [29]:
# computing EMA
# Using Pandas to calculate a 2 hour span EMA. 
# adjust=False specifies that we are interested in the 
# recursive calculation mode.
# ema_short = train.ewm(span=12, adjust=False).mean()
# ema_short[0:3]

# ema_long = train.ewm(span=span, adjust=False).mean()
# ema_long[0:3]

# span = 24
# ema_long = train.ewm(span=span, adjust=False).mean()
# midband = ema_long[-1]
# ub = midband + ema_long[-24:-1].std()*3
# lb = midband - ema_long[-24:-1].std()*3

#### DBSCAN

In [30]:
import matplotlib.dates as mdates
from sklearn import metrics
from random import randint
from matplotlib import style
from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D

In [31]:
# helpful function to plot clf
def plot_clf(x, y, x_min, x_max, y_min, y_max, x_label, y_label):
    """Plot a fitted model's decision surface with three point sets on top.

    NOTE: besides its parameters, this function reads the names ``clf``,
    ``train``, ``test`` and ``outlier`` from the enclosing (notebook) scope;
    they must be defined before it is called. ``clf`` must provide
    ``decision_function``; the three frames are indexed with ``x`` and ``y``.

    Parameters
    ----------
    x, y : keys used to index ``train``, ``test`` and ``outlier`` for the
        horizontal and vertical coordinates.
    x_min, x_max, y_min, y_max : extent of the 25 x 25 evaluation grid and of
        the axes limits.
    x_label, y_label : axis labels.
    """
    def scatter_points(frame, colour):
        # All three point sets share marker size and edge colour.
        return plt.scatter(frame[x], frame[y], c=colour, s=20, edgecolor='k')

    # Evaluate the decision function on a regular grid covering the plot area.
    grid_x = np.linspace(x_min, x_max, 25)
    grid_y = np.linspace(y_min, y_max, 25)
    xx, yy = np.meshgrid(grid_x, grid_y)
    surface = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    plt.title("IsolationForest")
    plt.contourf(xx, yy, surface, cmap=plt.cm.Blues_r)

    train_pts = scatter_points(train, 'white')
    test_pts = scatter_points(test, 'green')
    outlier_pts = scatter_points(outlier, 'red')

    plt.axis('tight')
    plt.xlim((x_min, x_max))
    plt.ylim((y_min, y_max))

    handles = [train_pts, test_pts, outlier_pts]
    labels = ["half1", "half2", "post_cohort"]
    plt.legend(handles, labels, loc="best")

    plt.xlabel(x_label)
    plt.ylabel(y_label)

    plt.show()
    
# example usage:
# plot_clf('days_normalized', 'page_viewed', 0, 400,\
#          0, 275, 'Days', 'Pages Viewed')
