In [2]:
# Import the necessary packages
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder


In [3]:
# Load the recorded batch of web-requests that the model is trained on
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which looks like a row index
# that was saved into the CSV — confirm whether it is meant to be kept as a regular column
df = pd.read_csv(filepath_or_buffer="Data/batch_2.csv")
df.head(n=5)

Unnamed: 0,Server-Locale,Server-Response-Time,Server-Current-Load,Last-Request-Time-Delta,Overloaded
0,Africa (AF),434,0,134.319055,False
1,Africa (AF),510,0,134.330613,False
2,Africa (AF),323,0,134.341006,False
3,Africa (AF),486,0,134.353713,False
4,Africa (AF),434,0,0.363997,False


In [33]:
# Automatically selects categorical columns and replaces them with one-hot encoded matrices
def encode_data(input_data, scope, exclude="None"):
    """One-hot encode the most frequent values of every object-dtype column.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame to encode. The caller's frame is not modified.
    scope : int
        How many of the most frequent values of each categorical column get
        their own dummy column. Rows holding any other value receive 0 in
        all of that column's dummy columns.
    exclude : str or iterable of str, optional
        Column name(s) to leave untouched, e.g. the model target. A single
        string is treated as exactly one column name (not as a substring
        pattern).

    Returns
    -------
    pandas.DataFrame
        The input with each encoded categorical column replaced by its
        int64 dummy columns, which are appended on the right. The row index
        of the input is preserved.
    """
    # Normalise `exclude` to a set so the check below is an exact name match.
    # Testing `column in "Overloaded"` directly would be a substring test and
    # would also skip columns such as "Over" or "load".
    if exclude is None:
        excluded = set()
    elif isinstance(exclude, str):
        excluded = {exclude}
    else:
        excluded = set(exclude)

    # Use select_dtypes to find all of the columns that are categorical data and not numerical data
    categorical_columns = input_data.select_dtypes(
        include=["object"]).columns.tolist()

    for column in categorical_columns:
        # Ignore certain columns like the model target column without needing to delete it from the dataset
        if column in excluded:
            continue

        values = input_data[column]

        # Count the frequency of the values and keep the labels of the `scope` most frequent ones
        top_values = values.value_counts().nlargest(scope).index

        # Restricting the categories to the top values makes get_dummies emit
        # columns only for them; every other value becomes an all-zero row
        encoded_column = pd.get_dummies(pd.Categorical(
            values, categories=top_values), dtype=np.int64)

        # get_dummies on a bare Categorical returns a fresh 0..n-1 index.
        # Re-attach the original row labels so the index-based merge below
        # stays row-aligned even when the input index has gaps (e.g. after
        # dropna) or is not a RangeIndex at all.
        encoded_column.index = input_data.index
        # Use plain labels for the dummy column names instead of a CategoricalIndex
        encoded_column.columns = encoded_column.columns.astype(object)

        # Merge the new dummy columns into our dataset
        input_data = pd.merge(
            left=input_data,
            right=encoded_column,
            left_index=True,
            right_index=True,
        )

        # Drop the old categorical column
        input_data = input_data.drop(columns=column)

    return input_data

In [34]:
# Remove the rows where Server-Current-Load is missing; the column itself stays in the dataset
# NOTE(review): the original intent may have been to drop the column entirely because it is expensive
# to make available for each web-request directive — confirm which behaviour is wanted
# Reset the index afterwards: encode_data merges its dummy columns by index, so gaps left by dropna
# could otherwise drop or misalign rows
df = df.dropna(subset=["Server-Current-Load"]).reset_index(drop=True)

# Transform the dataset with one-hot (dummy column) encoding
# TODO: When finished with locale implementation for countries, increase the scope
df = encode_data(df, scope=7, exclude="Overloaded")

# Target column needs to be encoded in place and not in two separate columns
special_encode = LabelEncoder()
df["Overloaded"] = special_encode.fit_transform(df["Overloaded"])

df.head()


Unnamed: 0,Server-Response-Time,Server-Current-Load,Last-Request-Time-Delta,Overloaded,Antarctica (AN),Europe (EU),Asia (AS),North America (NA),Oceania (OC),South America (SA),Africa (AF)
0,434,0,134.319055,0,0,0,0,0,0,0,1
1,510,0,134.330613,0,0,0,0,0,0,0,1
2,323,0,134.341006,0,0,0,0,0,0,0,1
3,486,0,134.353713,0,0,0,0,0,0,0,1
4,434,0,0.363997,0,0,0,0,0,0,0,1


In [40]:
# Separate the label we want to predict from the columns used to predict it
target = "Overloaded"

# NOTE(review): X keeps every column other than the target, which includes the "Unnamed: 0" column
# shown in the frame preview — confirm that it is intended as a feature
X = df.drop(columns=[target])
y = df[target]

# Hold out 30% of the rows for testing; the fixed random_state keeps the shuffled split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=1234,
)

# Fit a Gaussian Naive-Bayes classifier on the training rows
model = GaussianNB().fit(X_train, y_train)

# Measure how often the model's predictions match the true labels on the held-out rows
y_pred = model.predict(X_test)
print(f"Gaussian Naive-Bayes Model Accuracy: {accuracy_score(y_test, y_pred)}")

# The Round Robin approach generated the following output with a batch of 10k web-requests:

# Processing incoming web requests with round robin algorithm...
# Writing the training data...
# Average Load: 3.9545
# RR Number of Servers Overloaded: 2184
# RR Cumulative Response Time (minutes): 235.22

# So out of 10k web-requests, the server-clusters reported that they were overloaded 2184 times
# This makes about 21.84% of our web-requests slower, resulting in a higher cumulative response time

# However, our Gaussian Naive-Bayes model correctly predicted whether a server was overloaded about 91.416% of the time on the held-out test data
# If routing follows those predictions, only about 858 out of 10k web-requests would experience slower response time as a result of server-overload
# Future versions of this model will include the Cumulative Response Time attribute

Gaussian Naive-Bayes Model Accuracy: 0.9141666666666667


In [77]:
# Find the best server to route to based on the model
# Rows [cluster_start, cluster_stop) of the dataset are inspected; every column except the target is a feature
cluster_start, cluster_stop = 90, 95
random_AF_cluster = df[cluster_start:cluster_stop].loc[:, df.columns != target]
probabilities = model.predict_proba(random_AF_cluster).tolist()

# The results of this are interesting
# It showcases the predicted health of all of the selected servers and how well-prepared they are to receive another web-request
# All of the servers are in Africa so a web-request coming from Africa would run through this logic to determine what server to go to next

# 0 means that the server is not overloaded and 1 means that it is overloaded
print(model.classes_)
healthy_servers = dict()

for index, item in enumerate(probabilities):
    # Offset by cluster_start so the label matches the row position in df
    server_name = f"Sever-{str(index + cluster_start)}"

    # Pad the last cell to a fixed width so the closing "|" lines up for any number of digits
    unhealthy_cell = f"{round(item[1]*100, 4)}%".ljust(9)
    print(f"{server_name} Predicted State: | Healthy {round(item[0]*100, 4)}% | Unhealthy {unhealthy_cell}|")

    # Keep track of the healthy probability (class 0) of each server for finding the best one
    healthy_servers[server_name] = item[0]

print("-"*88)
# Pick the server with the highest predicted healthy probability; a bare max() over the keys
# would compare the server names alphabetically instead
print(f"The Server To Route The Next Request To: {max(healthy_servers, key=healthy_servers.get)}")

[0 1]
Sever-90 Predicted State: | Healthy 61.0568% | Unhealthy 38.9432% |
Sever-91 Predicted State: | Healthy 30.7821% | Unhealthy 69.2179% |
Sever-92 Predicted State: | Healthy 72.2682% | Unhealthy 27.7318% |
Sever-93 Predicted State: | Healthy 84.5929% | Unhealthy 15.4071% |
Sever-94 Predicted State: | Healthy 99.6046% | Unhealthy 0.3954%  |
----------------------------------------------------------------------------------------
The Server To Route The Next Request To: Sever-94
