# Harmony Request Cost Estimation

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor
import xgboost

## Read Request Data

In [None]:
# Load the request data to analyse. The commented-out lines are alternative
# CSV files that can be swapped in instead.
# df = pd.read_csv("./2000-requests-7-concurrent-users-sandbox.csv")
df = pd.read_csv("./266Requests-1-100-random-service-example-granules.csv")
# df = pd.read_csv("./30-requests.csv")

In [None]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(5)

## See How Correlated the Features Are

In [None]:
def plot_corr(df, size=25, output_path='corr_mat.png', vmin=0, vmax=1, fontsize=20):
    """Draw the correlation matrix of ``df`` as a heat map and save it to disk.

    Args:
        df: DataFrame whose column-to-column correlations (``df.corr()``)
            are plotted.
        size: Width and height of the square figure, in inches.
        output_path: File the figure is written to. Defaults to
            ``'corr_mat.png'`` in the current working directory.
        vmin: Correlation value mapped to the bottom of the colour scale.
            With the default of 0, negative correlations fall below the
            normalised range; pass ``vmin=-1`` to spread the colour map over
            the full correlation range.
        vmax: Correlation value mapped to the top of the colour scale.
        fontsize: Font size applied to the title, axis labels and tick labels.

    Returns:
        None. The figure is left open on the current pyplot state and written
        to ``output_path``.
    """
    corr = df.corr()
    labels = corr.columns
    positions = range(len(labels))

    fig, ax = plt.subplots(figsize=(size, size))
    cax = ax.matshow(
        corr,
        cmap=plt.cm.RdYlGn,
        norm=colors.Normalize(vmin=vmin, vmax=vmax),
    )

    # One tick per column, labelled with the column name.
    plt.xticks(positions, labels, rotation=70)
    plt.yticks(positions, labels)
    fig.colorbar(cax)

    # Apply a single font size to every text element on the axes.
    text_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
    text_items += ax.get_xticklabels() + ax.get_yticklabels()
    for item in text_items:
        item.set_fontsize(fontsize)

    fig.savefig(output_path, bbox_inches='tight')

In [None]:
# Pairwise correlation table for the columns of the raw frame.
df.corr()

In [None]:
# Correlation Plot
# Column name used as the target by the feature-impact chart further down.
target = 'totalTime'
# Work on a de-duplicated frame with incomplete rows removed; both calls
# return new frames, so `df` itself is left untouched.
df_c = df.drop_duplicates().dropna()
plot_corr(df_c)

## Create Training Input/Output Sets

In [None]:
# Column 0 is taken as the regression output; every remaining column is an
# input feature.
# NOTE(review): this reads the raw `df`, not the de-duplicated/NaN-free
# `df_c` built for the correlation plot — confirm that is intended.
y_train = df.iloc[:, 0].values
X_train = df.iloc[:, 1:].values

In [None]:
# Shape of the feature array (rows, feature columns).
X_train.shape

In [None]:
# Shape of the target array.
y_train.shape

In [None]:
# Convert the NumPy arrays to float32 tensors for PyTorch.
# The feature matrix is already 2-D (rows x feature columns), so it is kept
# as-is instead of being forced through `.view(-1, 12)`: with any other
# number of feature columns that view would either fail or silently re-chunk
# the values into rows of 12 that no longer line up with the targets.
X_train = torch.from_numpy(X_train.astype(np.float32))
# Targets become a column vector so they match the model's (N, 1) output.
y_train = torch.from_numpy(y_train.astype(np.float32)).view(-1, 1)

In [None]:
# Mean of the target values; a handy reference scale for the loss and error
# figures printed later.
y_train.mean()

## 3 Layer Linear Learner Model

In [None]:
# Network dimensions: number of input features and number of predicted values.
input_size, output_size = 12, 1

In [None]:
# Fully connected regressor: input_size -> 64 -> 32 -> output_size, with a
# ReLU between the linear layers. The outer widths come from `input_size` and
# `output_size` (set in the cell above) rather than repeating the literals
# 12 and 1, so the two definitions cannot drift apart.
model = nn.Sequential(
    nn.Linear(input_size, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, output_size),
)


In [None]:
# Optimisation setup: plain SGD minimising the mean-squared-error loss.
learning_rate = 1e-4
l = nn.MSELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)

In [None]:
# Number of full passes over the training set.
num_epochs = 20_000

In [None]:
# Full-batch gradient descent: every epoch uses the whole training set.
for epoch in range(num_epochs):
    # Forward pass. The inputs are plain data, so they are fed as-is; calling
    # requires_grad_() on X_train would make autograd compute and accumulate
    # an input gradient on every epoch that nothing reads or resets.
    y_pred = model(X_train)

    # Mean-squared error between predictions and targets.
    loss = l(y_pred, y_train)

    # Backward propagation: populate the parameter gradients.
    loss.backward()

    # Apply the SGD update to the weights.
    optimizer.step()

    # Clear the gradients so the next backward() starts from zero.
    optimizer.zero_grad()

    # Report progress every 1000 epochs.
    if epoch % 1000 == 0:
        print('loss {}'.format(loss.item()))

### Test Our Model With Our Training Data (In-Sample Only, Not a Held-Out Evaluation)

In [None]:
# Inference only: run the forward pass with autograd disabled so no
# computation graph is built, then hand the result to NumPy.
with torch.no_grad():
    predicted = model(X_train).numpy()

### Plot Our Actual Values And Our Predicted Values To See How Well They Track

In [None]:
def _as_flat_array(values):
    """Return ``values`` as a 1-D NumPy array, detaching tensors first."""
    if isinstance(values, torch.Tensor):
        values = values.detach().cpu().numpy()
    return np.asarray(values).ravel()


def plot_predicted_vs_actual(actual, predicted):
    """Scatter predicted against actual values on log-log axes.

    A blue diagonal spanning the overall minimum and maximum is drawn as the
    "perfect prediction" reference line.

    Args:
        actual: Ground-truth values. A torch tensor, NumPy array or other
            array-like of any shape; it is flattened before plotting.
        predicted: Model outputs, in the same order as ``actual``. Accepts
            the same kinds of input and is flattened as well.

    Returns:
        None. The plot is drawn on a new pyplot figure.
    """
    actual_values = _as_flat_array(actual)
    predicted_values = _as_flat_array(predicted)

    plt.figure(figsize=(10, 10))
    plt.scatter(actual_values, predicted_values, c='crimson')
    plt.ylabel('Predicted')
    plt.xlabel('Actual')
    plt.axis('equal')

    # Set logarithmic scale on both variables
    plt.xscale("log")
    plt.yscale("log")

    # Reference diagonal from the largest to the smallest plotted value.
    # Skipped when there is nothing to plot, since max/min of an empty array
    # would raise.
    if actual_values.size and predicted_values.size:
        upper = max(predicted_values.max(), actual_values.max())
        lower = min(predicted_values.min(), actual_values.min())
        plt.plot([upper, lower], [upper, lower], 'b-')

In [None]:
# Compare the network's in-sample predictions with the actual target values.
plot_predicted_vs_actual(y_train, predicted)

### Compute the Maximum Error (Max Error is About 5 Seconds When The Actual Value is Around 100 Seconds)

In [None]:
# Largest absolute prediction error. Taking abs() before max() means a large
# under-prediction (negative residual) is reported too, not only the largest
# over-prediction.
np.abs(predicted - y_train.numpy()).max()

### Feature Impact Chart

In [None]:
# Correlation of every column with the target, sorted from highest to lowest.
df_new = df_c.corr().sort_values([target], ascending=False)
# Drop the first row after sorting — presumably the target's correlation
# with itself, which would otherwise dominate the chart.
df_new.drop(df_new.head(1).index, inplace=True)
# NOTE(review): the last row is dropped as well; the reason is not evident
# from this cell — confirm it is intended.
df_new.drop(df_new.tail(1).index, inplace=True)
# Expose the feature names as a regular column so they can be the x axis.
df_new['Index'] = df_new.index
# The closing parenthesis is balanced here; the previous format string ended
# in "))" and rendered a stray ")" in the chart title.
chart_title = "Feature Impact Chart - Correlation with Target (%s)" % target
df_new.plot(x='Index', y=target, kind='bar',
            title=chart_title, figsize=(17, 10))

## XGBoost

In [None]:
# Gradient-boosted tree regressor. Note that this rebinds `model`, replacing
# the PyTorch network defined earlier in the notebook.
model = XGBRegressor(
    n_estimators=1000,
    max_depth=10,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
)

### K-Fold cross validation

In [None]:
# Evaluation scheme: 10-fold cross-validation, repeated 3 times, fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Score each fold with negated mean absolute error, using all available cores.
scores = cross_val_score(
    estimator=model,
    X=X_train.detach(),
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)

In [None]:
# The scorer reports negated MAE values; flip them to positive before
# summarising as mean (standard deviation).
scores = np.abs(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()))

### Train

In [None]:
# Fit the regressor on the full training set.
model.fit(X_train.detach(), y_train)

In [None]:
# In-sample predictions: the same rows the model was just fitted on.
predicted = model.predict(X_train.detach())

### Plot Our Actual Values And Our Predicted Values To See How Well They Track

In [None]:
# Compare the fitted regressor's in-sample predictions with the actual values.
plot_predicted_vs_actual(y_train, predicted)