In [None]:
from typing import Any
import sys
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
import boto3
import json
import os
import traceback
import logging
import numpy as np
import warnings
from joblib import dump, load
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
#from xgboost import XGBRegressor
from sklearn.svm import SVR

# Name of the SSM parameter that stores the Google credentials JSON.
PARAM_NAME = "/google/admin/credentials"

# Google API scope URLs (Sheets feeds and Drive).
SCOPE = [
    "https://spreadsheets.google.com/feeds",
    "https://www.googleapis.com/auth/drive",
]

# boto3 client for AWS Systems Manager, created once when the module loads.
SSM_CLIENT = boto3.client("ssm")

# Module-level frame that concatenate_google_spreadsheets() fills and
# clean_data() reads.
concatenated_data_frame = pd.DataFrame()

def get_google_credential_json():
    """
    Read the Google credentials from SSM Parameter Store and parse them.

    Requests the parameter named by PARAM_NAME (with decryption enabled) via
    SSM_CLIENT and decodes the value of the first returned parameter as JSON.

    Returns:
        The decoded JSON document.
    Raises:
        IndexError: if the response contains no parameters.
    """
    ssm_reply = SSM_CLIENT.get_parameters(Names=[PARAM_NAME], WithDecryption=True)
    first_parameter = ssm_reply['Parameters'][0]
    raw_credentials = first_parameter['Value']
    return json.loads(raw_credentials)

def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Each output row holds the ``n_in`` lagged observations followed by the
    current observation and the next ``n_out - 1`` ones. Columns are named
    ``var<k>(t-i)``, ``var<k>(t)`` and ``var<k>(t+i)``.

    Arguments:
        data: Observations, either one-dimensional (flat list, 1-D array,
            Series: one variable) or two-dimensional (list of rows, 2-D
            array, DataFrame: one column per variable).
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    df = pd.DataFrame(data)
    # Take the variable count from the frame itself, so 1-D arrays and
    # lists of rows are handled the same way as 2-D arrays.
    n_vars = df.shape[1]
    shifted_frames, names = [], []

    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        shifted_frames.append(df.shift(lag))
        names.extend(f"var{j + 1}(t-{lag})" for j in range(n_vars))

    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        shifted_frames.append(df.shift(-step))
        if step == 0:
            names.extend(f"var{j + 1}(t)" for j in range(n_vars))
        else:
            names.extend(f"var{j + 1}(t+{step})" for j in range(n_vars))

    # put it all together
    agg = pd.concat(shifted_frames, axis=1)
    agg.columns = names

    # Shifting leaves NaN at the edges (and wherever the input had NaN).
    if dropnan:
        agg = agg.dropna()
    return agg

def concatenate_google_spreadsheets(google_cloud_spreadsheets):
    """
    Load every "IFTTT_Maker_Webhooks_Events" spreadsheet into the module frame.

    Opens "IFTTT_Maker_Webhooks_Events", then
    "IFTTT_Maker_Webhooks_Events (1)", "... (2)" and so on until one cannot
    be opened. The first worksheet of each is read with ``get_all_values()``
    and the rows are stacked, in that order, into the module-level
    ``concatenated_data_frame``.

    The module frame is rebuilt on every call, so calling this function again
    in the same process does not duplicate rows.

    Arguments:
        google_cloud_spreadsheets: Authorised gspread client (the object
            returned by ``gspread.authorize``).
    Returns:
        None. The result is stored in ``concatenated_data_frame``.
    """
    global concatenated_data_frame

    base_title = "IFTTT_Maker_Webhooks_Events"
    frames = []
    index = 0
    while True:
        title = f"{base_title} ({index})" if index else base_title
        try:
            worksheet = google_cloud_spreadsheets.open(title).sheet1
        except gspread.exceptions.SpreadsheetNotFound:
            # Expected end of the numbered sequence.
            break
        except Exception:
            # Keep the best-effort behaviour (stop, keep what was read), but
            # log it so an API/auth failure is not mistaken for "no more sheets".
            logging.exception(
                "Stopped reading spreadsheets at %r after an unexpected error", title
            )
            break
        frames.append(pd.DataFrame(worksheet.get_all_values()))
        index += 1

    # Concatenate once instead of once per sheet.
    concatenated_data_frame = pd.concat(frames) if frames else pd.DataFrame()


def clean_data(duration=300, peak=75):
    """
    Turn the raw concatenated spreadsheet rows into a 30-minute glucose series.

    Works on a copy of the module-level ``concatenated_data_frame``: keeps
    columns 0 and 2 as DATETIME and BLOOD_GLUCOSE, maps the "LOW" / "HIGH"
    markers to 2.2 / 20, converts readings to float and timestamps to
    datetimes, fills missing values with 0 and resamples via ``downsample``.

    Arguments:
        duration: Unused; kept for interface compatibility.
        peak: Unused; kept for interface compatibility.
    Returns:
        Series of mean BLOOD_GLUCOSE per 30-minute bin, indexed by DATETIME.
    """
    # drop() returns a new frame, so the module-level frame is left untouched.
    blood_glucose_dataset = concatenated_data_frame.drop(columns=[1, 3, 4, 5, 6])
    blood_glucose_dataset.columns = ["DATETIME", "BLOOD_GLUCOSE"]

    # Blank readings must become NaN *before* the float cast; a blank string
    # would otherwise make astype() raise. (np.nan, not np.NaN: the latter
    # alias was removed in NumPy 2.0.)
    glucose = blood_glucose_dataset["BLOOD_GLUCOSE"].replace(
        {"LOW": 2.2, "HIGH": 20, "": np.nan}
    )
    blood_glucose_dataset["BLOOD_GLUCOSE"] = glucose.astype("float64")

    # Strip the "at" between date and time so pandas can parse the timestamp.
    # NOTE(review): this removes every "at" substring, not just the separator;
    # confirm no timestamp text contains "at" elsewhere.
    timestamps = blood_glucose_dataset["DATETIME"].str.replace('at', '')
    blood_glucose_dataset["DATETIME"] = pd.to_datetime(timestamps)

    # NOTE(review): missing readings become 0 and therefore lower the
    # 30-minute means; confirm this is intended rather than dropping them.
    blood_glucose_dataset.fillna(0, inplace=True)

    return downsample(
        dataset=blood_glucose_dataset, column="BLOOD_GLUCOSE", index="DATETIME"
    )

def downsample(dataset, index, column,  period = "30min"):
    """
    Resample one column of ``dataset`` to a fixed period, averaging each bin.

    The input frame is not modified, so it can be passed in again.

    Arguments:
        dataset: DataFrame containing the ``index`` and ``column`` columns.
        index: Name of the datetime column to resample on.
        column: Name of the column whose values are averaged.
        period: Pandas offset alias for the bin width.
    Returns:
        Series of per-bin means of ``column``, indexed by bin start time.
    """
    # set_index without inplace returns a new frame and leaves the caller's
    # frame intact.
    indexed = dataset.set_index(index)
    return indexed[column].resample(period).mean()


def create_gridsearch_model(hyperparameters, x, y, regressor, scaler=None):
    """
    Grid-search a scaler + regressor pipeline with 3-fold cross-validation.

    Arguments:
        hyperparameters: Parameter grid for GridSearchCV. Keys address
            pipeline steps, which ``make_pipeline`` names after the
            lower-cased class name (e.g. ``"svr__C"``).
        x: Training features.
        y: Training targets.
        regressor: Unfitted estimator used as the final pipeline step.
        scaler: Transformer used as the first pipeline step. Defaults to a
            fresh ``RobustScaler()``.
    Returns:
        The fitted GridSearchCV object.
    """
    # Build the default per call rather than at definition time, and use the
    # scaler the caller passed (it was previously ignored).
    if scaler is None:
        scaler = RobustScaler()
    pipeline = make_pipeline(scaler, regressor)
    search = GridSearchCV(pipeline, hyperparameters, cv=3)
    search.fit(x, y)
    return search

def handler(event, context):
    """
    Train and persist blood-glucose forecasting models from the Google Sheets data.

    Steps: authorise against Google with the credentials from
    ``get_google_credential_json``, load and clean the spreadsheets, frame
    the resampled series as a supervised problem (inputs t-1, t, t+1; target
    t+2), then fit a random forest, an SVR and a linear regression, plus an
    XGBoost model when ``XGBRegressor`` is importable. Best parameters and
    scores on a held-out 20 % split are printed, and every model is dumped
    with joblib into the current working directory.

    Arguments:
        event: Unused.
        context: Unused.
    Returns:
        None.
    """
    # Pass the scopes explicitly; SCOPE was defined but never handed to the
    # credentials.
    credentials = ServiceAccountCredentials.from_json_keyfile_dict(
        get_google_credential_json(), SCOPE
    )
    google_cloud_spreadsheets = gspread.authorize(credentials)
    concatenate_google_spreadsheets(google_cloud_spreadsheets)

    glucose_readings = clean_data().reset_index().BLOOD_GLUCOSE.tolist()

    # n_in=1, n_out=3 yields var1(t-1), var1(t), var1(t+1), var1(t+2).
    cleaned_dataset = series_to_supervised(glucose_readings, n_out=3)
    y = cleaned_dataset[["var1(t+2)"]]
    x = cleaned_dataset.drop(["var1(t+2)"], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    #=======================================
    #
    # Random forest
    #
    #=======================================
    # max_depth uses the real None (unlimited depth); the string 'None' is
    # not a valid value.
    # NOTE(review): x has three columns, so max_features values above 3
    # exceed the feature count; confirm how the installed scikit-learn
    # treats them.
    random_forest_hyperparameters = {
        'randomforestregressor__max_depth': [1, 5, 10, None],
        'randomforestregressor__max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        'randomforestregressor__max_samples': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
        'randomforestregressor__n_estimators': [10, 50, 100, 1000],
    }
    random_forest_model = create_gridsearch_model(
        random_forest_hyperparameters, X_train, y_train, regressor=RandomForestRegressor()
    )
    random_forest_score = random_forest_model.score(X_test, y_test)
    print(random_forest_model.best_params_)
    dump(random_forest_model, 'rf.joblib')
    print("RF", random_forest_score)

    #=======================================
    #
    # SVR
    #
    #=======================================
    support_vector_regressor_hyperparameters = {
        "svr__C": [1e0, 1e1, 1e2, 1e3, 1e4, 1e6],
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    }
    support_vector_regressor_model = create_gridsearch_model(
        support_vector_regressor_hyperparameters, X_train, y_train, regressor=SVR()
    )
    print(support_vector_regressor_model.score(X_test, y_test))
    print("SVR", support_vector_regressor_model.best_params_)
    dump(support_vector_regressor_model, 'svr.joblib')

    #=======================================
    #
    # Linear regression
    #
    #=======================================
    linear_regression_model = LinearRegression().fit(X_train, y_train)
    linear_score = linear_regression_model.score(X_test, y_test)
    dump(linear_regression_model, 'linear.joblib')
    print("LR", linear_score)

    #=======================================
    #
    # XGBoost
    #
    #=======================================
    # The xgboost import is commented out at the top of the file. Skip this
    # section with a warning instead of raising NameError after the other
    # models have already been trained.
    try:
        xgb_regressor = XGBRegressor()
    except NameError:
        logging.warning("XGBRegressor is not imported; skipping the XGBoost model")
        return

    xgb_hyperparameters = {
        "xgbregressor__learning_rate": [0.0001, 0.001, 0.01, 0.1, 1.0],
        "xgbregressor__max_depth": [1, 5, 10, None],
        "xgbregressor__min_child_weight": [0, 1, 5, 10, 20, 50],
        "xgbregressor__gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
        "xgbregressor__subsample": [0.5, 0.7, 0.9, 1.0],
    }
    xgb_regressor_model = create_gridsearch_model(
        xgb_hyperparameters, X_train, y_train, regressor=xgb_regressor
    )
    print(xgb_regressor_model.best_params_)
    print("XGB", xgb_regressor_model.score(X_test, y_test))
    dump(xgb_regressor_model, 'xgb.joblib')
# Run the whole pipeline now; handler() does not read either argument, so
# placeholder values are passed.
handler(event=1, context=2)

KeyboardInterrupt: 