In [1]:
import glob
import random
import statistics

import joblib
import numpy as np
import pandas as pd

from sklearn import model_selection
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
df = pd.read_csv("../dataset/xAllChanges.csv").sample(2000)
# df = pd.read_csv("../dataset/xNoChange.csv")

df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,xSubscriptionId_fk,xCycleCode,xFamilyNum,xFaze,xAmper,xCounterBuldingNo,xRegionName_Roustaei,xRegionName_Shahri,xUsageGroupName_Keshavarzi,xUsageGroupName_Khanegi,xUsageGroupName_Omoomi,xUsageGroupName_Sanati,xUsageGroupName_Sayer,xBakhshCode_1,xBakhshCode_2,xBakhshCode_4,xTimeControlCode_1,xTimeControlCode_2,xTimeControlCode_3,xTariffOldCode_1010,xTariffOldCode_1011,xTariffOldCode_1110,xTariffOldCode_1111,xTariffOldCode_1990,xTariffOldCode_2110,xTariffOldCode_2210,xTariffOldCode_2310,xTariffOldCode_2410,xTariffOldCode_2510,xTariffOldCode_2610,xTariffOldCode_2710,xTariffOldCode_2990,xTariffOldCode_2992,xTariffOldCode_3110,xTariffOldCode_3210,xTariffOldCode_3310,xTariffOldCode_3410,xTariffOldCode_3520,xTariffOldCode_3540,xTariffOldCode_3740,xTariffOldCode_3991,xTariffOldCode_4410,xTariffOldCode_4610,xTariffOldCode_4990,xTariffOldCode_5110,xTariffOldCode_5990,days_difference,month,mediumDailyUsage,highDailyUsage,lowDailyUsage,xMeduimKw,xHighKw,xLowKw
390731,2400834,2400834,9090620,38,1,1,25,1219200273411,0,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,56,2,8.017857,3.892857,3.910714,449,218,219
366434,2214212,2214212,112616,9,1,1,25,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58,6,9.793103,0.0,0.0,568,0,0
779223,4887235,4887235,9355822,36,1,1,25,1374049,0,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,118,1,2.567797,1.025424,1.271186,303,121,150
394036,2436672,2436672,155424,17,1,1,15,688352333,0,1,0,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,49,9,0.0,0.0,0.0,0,0,0
796488,5019163,5019163,9545945,28,1,1,25,221919179049,0,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,61,5,3.213115,1.540984,2.95082,196,94,180


## Creating the Model

In [3]:
features = ['xCycleCode', 'xFamilyNum', 'xFaze', 'xAmper', 'xRegionName_Roustaei',
                'xRegionName_Shahri', 'xUsageGroupName_Keshavarzi', 'xUsageGroupName_Khanegi',
                'xUsageGroupName_Omoomi', 'xUsageGroupName_Sanati', 'xUsageGroupName_Sayer',
                'xBakhshCode_1', 'xBakhshCode_2', 'xBakhshCode_4',
                'xTimeControlCode_1', 'xTimeControlCode_2', 'xTimeControlCode_3',
                'xTariffOldCode_1010', 'xTariffOldCode_1011', 'xTariffOldCode_1110',
                'xTariffOldCode_1111', 'xTariffOldCode_1990', 'xTariffOldCode_2110',
                'xTariffOldCode_2210', 'xTariffOldCode_2310', 'xTariffOldCode_2410',
                'xTariffOldCode_2510', 'xTariffOldCode_2610', 'xTariffOldCode_2710',
                'xTariffOldCode_2990', 'xTariffOldCode_2992', 'xTariffOldCode_3110',
                'xTariffOldCode_3210', 'xTariffOldCode_3310', 'xTariffOldCode_3410', 
                'xTariffOldCode_3520', 'xTariffOldCode_3540', 'xTariffOldCode_3740', 
                'xTariffOldCode_3991', 'xTariffOldCode_4410', 'xTariffOldCode_4610', 
                'xTariffOldCode_4990', 'xTariffOldCode_5110', 'xTariffOldCode_5990',
                'days_difference', 'month']

X = df[features]
y_medium = df["mediumDailyUsage"]
y_high = df["highDailyUsage"]
y_low = df["lowDailyUsage"]

# X_train, X_test, y_train, y_test = train_test_split(X, y_medium)

## Selecting the algorithm considering 

In [4]:
rng = np.random.RandomState(1)
num_trees = 30

kfold = model_selection.KFold(n_splits=10, random_state=rng)

adaboost_model = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=num_trees, random_state=rng)

print("--------------Adaboost--------------")
results = model_selection.cross_val_score(adaboost_model, X, y_medium, cv=kfold)
print("Medium: " + str(results.mean()))

results = model_selection.cross_val_score(adaboost_model, X, y_high, cv=kfold)
print("High: " + str(results.mean()))

results = model_selection.cross_val_score(adaboost_model, X, y_low, cv=kfold)
print("Low: " + str(results.mean()))
print("------------------------------------")

--------------Adaboost--------------
Medium: -0.0011495527356686353
High: 0.1316509400658802
Low: -0.023577966349295387
------------------------------------


In [5]:
from sklearn import svm

clf = svm.SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, gamma='auto', verbose=False)

rng = np.random.RandomState(1)
kfold = model_selection.KFold(n_splits=10, random_state=rng)

print("--------------SVR--------------")
results = model_selection.cross_val_score(clf, X, y_medium, cv=kfold)
print("Medium: " + str(results.mean()))

results = model_selection.cross_val_score(clf, X, y_high, cv=kfold)
print("High: " + str(results.mean()))

results = model_selection.cross_val_score(clf, X, y_low, cv=kfold)
print("Low: " + str(results.mean()))
print("------------------------------------")

--------------SVR--------------
Medium: 0.009260377369548689
High: 0.12685526403904115
Low: 0.12361101331723576
------------------------------------


In [6]:
from sklearn import tree

est = tree.DecisionTreeRegressor()

rng = np.random.RandomState(1)
kfold = model_selection.KFold(n_splits=10, random_state=rng)

print("--------------Tree--------------")
results = model_selection.cross_val_score(est, X, y_medium, cv=kfold)
print("Medium: " + str(results.mean()))

results = model_selection.cross_val_score(est, X, y_high, cv=kfold)
print("High: " + str(results.mean()))

results = model_selection.cross_val_score(est, X, y_low, cv=kfold)
print("Low: " + str(results.mean()))
print("------------------------------------")

--------------Tree--------------
Medium: -0.3500713660118018
High: -0.6737906419781747
Low: -0.29870394047390736
------------------------------------


In [7]:
from sklearn.ensemble import GradientBoostingRegressor

est = GradientBoostingRegressor(n_estimators=600, learning_rate=0.1,
                                max_depth=3, random_state=0, loss='ls')

rng = np.random.RandomState(1)
kfold = model_selection.KFold(n_splits=10, random_state=rng)

print("--------------GradientBoostingRegressor--------------")
results = model_selection.cross_val_score(est, X, y_medium, cv=kfold)
print("Medium: " + str(results.mean()))

results = model_selection.cross_val_score(est, X, y_high, cv=kfold)
print("High: " + str(results.mean()))

results = model_selection.cross_val_score(est, X, y_low, cv=kfold)
print("Low: " + str(results.mean()))
print("-----------------------------------------------------")

--------------GradientBoostingRegressor--------------
Medium: -0.004907065428702095
High: 0.045472595283353234
Low: -0.15476365228009376
-----------------------------------------------------


## Feature Selection

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# model = LinearRegression()
model = np.random.RandomState(1)
model = AdaBoostRegressor(DecisionTreeRegressor(),
                          n_estimators=10 , random_state=rng)
#Initializing RFE model
rfe = RFE(model, 7)
#Transforming data using RFE
X_rfe = rfe.fit_transform(X,y_medium)  
#Fitting the data to model
model.fit(X_rfe,y_medium)
print(rfe.support_)
print(rfe.ranking_)

[ True False  True  True False False False False  True False False False
 False False  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False  True  True]
[ 1 19  1  1  7 16 17 11  1  8  6  4 12 13  1 30  5 15 27 22 25 38 24  9
 10 14 29 23 18 33 28 34 31 21 26 35 36 37 32  3 39 40  2 20  1  1]


In [9]:
features = ['xCycleCode', 'xFaze', 'xAmper', 
            'xUsageGroupName_Keshavarzi', 'xTimeControlCode_3',
            'days_difference', 'month']


X = df[features]

rng = np.random.RandomState(1)
num_trees = 30

kfold = model_selection.KFold(n_splits=10, random_state=rng)

adaboost_model = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=num_trees, random_state=rng)

print("--------------Adaboost--------------")
results = model_selection.cross_val_score(adaboost_model, X, y_medium, cv=kfold)
print("Medium: " + str(results.mean()))

results = model_selection.cross_val_score(adaboost_model, X, y_high, cv=kfold)
print("High: " + str(results.mean()))

results = model_selection.cross_val_score(adaboost_model, X, y_low, cv=kfold)
print("Low: " + str(results.mean()))
print("------------------------------------")

--------------Adaboost--------------
Medium: -0.3989262500115284
High: -0.21374297375862775
Low: -0.3385571612144118
------------------------------------


In [10]:
rng = np.random.RandomState(1)
regr = AdaBoostRegressor(DecisionTreeRegressor(),
                          n_estimators=10 , random_state=rng)

regr.fit(X, y_low)
joblib.dump(regr, '../models/ABR-NoChanges-Low.pkl')

print('Model has benn saved to disk successfully.')

Model has benn saved to disk successfully.


In [None]:
# df = pd.DataFrame(df['xSubscriptionId_fk'].unique(),columns= ['xSubscriptionId_fk'])
# df.to_csv("../dataset/xNoChange_xSubscriptionId_fk.csv")

## Serving the Model

In [4]:
all_medium_model = joblib.load('../models/ABR-AllChanges-Medium.pkl')
all_high_model = joblib.load('../models/ABR-AllChanges-High.pkl')
all_low_model = joblib.load('../models/ABR-AllChanges-Low.pkl')

no_medium_model = joblib.load('../models/ABR-NoChange-Medium.pkl')
no_high_model = joblib.load('../models/ABR-NoChange-High.pkl')
no_low_model = joblib.load('../models/ABR-NoChange-Low.pkl')

In [5]:
all_df = pd.read_csv('../dataset/xAllChanges.csv')
no_df = pd.read_csv('../dataset/xNoChange.csv')

In [6]:
all_change_ids = pd.read_csv('../dataset/xAllChanges_xSubscriptionId_fk.csv')
no_change_ids = pd.read_csv('../dataset/xNoChange_xSubscriptionId_fk.csv')

In [None]:
def predict(user_id, days, month):
    """
    Get predictions for a particular user.
    """
    try:
        medium_predicted = 0
        high_predicted = 0
        low_predicted = 0

        message = None
                                                
        if len(all_change_ids.loc[all_change_ids['xSubscriptionId_fk'] == user_id]) > 0:
            df = all_df.loc[all_df['xSubscriptionId_fk'] == user_id]
            df = df.iloc[-1:]
            df = df.drop(['xSubscriptionId_fk'], 1)
            df = df.drop(['xCounterBuldingNo'], 1)
            df = df.drop(df.columns[[0, 1]], 1).iloc[:, : 48]

            df['days_difference'] = days
            df['month'] = month
            
            if df['xTimeControlCode_1'].iloc[0] == 1:
                medium_predicted = all_medium_model.predict(df[features])[0] * days
            if df['xTimeControlCode_2'].iloc[0] == 1:
                medium_predicted = all_medium_model.predict(df[features])[0] * days
                high_predicted = all_high_model.predict(df[features])[0] * days
            if df['xTimeControlCode_3'].iloc[0] == 1:
                low_predicted = all_low_model.predict(df[features])[0] * days
                medium_predicted = all_medium_model.predict(df[features])[0] * days
                high_predicted = all_high_model.predict(df[features])[0] * days

        else:
            df = no_df.loc[no_df['xSubscriptionId_fk'] == user_id]
            df = df.iloc[-1:]
            df = df.drop(['xSubscriptionId_fk'], 1)
            df = df.drop(['xCounterBuldingNo'], 1)
            df = df.drop(df.columns[[0, 1]], 1).iloc[:, : 48]

            df['days_difference'] = days
            df['month'] = month
            
            if df['xTimeControlCode_1'].iloc[0] == 1:
                medium_predicted = no_medium_model.predict(df[features])[0] * days
            if df['xTimeControlCode_2'].iloc[0] == 1:
                medium_predicted = no_medium_model.predict(df[features])[0] * days
                high_predicted = no_high_model.predict(df[features])[0] * days
            if df['xTimeControlCode_3'].iloc[0] == 1:
                low_predicted = no_low_model.predict(df[features])[0] * days
                medium_predicted = no_medium_model.predict(df[features])[0] * days
                high_predicted = no_high_model.predict(df[features])[0] * days

        return {
            "success": True, 
            "message": message, 
            "prediction": {
                "medium_predicted": round(medium_predicted, 10), 
                "high_predicted": round(high_predicted, 10), 
                "low_predicted": round(low_predicted, 10) }
        }

    except Exception as ex:
        error_content = {
            "success": False,
            "message": "Hello! Error Happened.",
            "code": "PREDICTION_ERROR",
            "properties": {
                "code": 1000,
                "message": ex.__str__(),
            },
        }
        return error_content

In [8]:
predict(110402, 60, 6)

{'message': None,
 'prediction': {'high_predicted': 67.0,
  'low_predicted': 133.3333333333,
  'medium_predicted': 277.0},
 'success': True}

## Testing the Model

In [9]:
def random_User_selector(samples=5, mode='no-change'):
    random_user_ids = []
    if mode == 'all-changes':
        random_users = random.sample(list(all_change_ids['xSubscriptionId_fk']), samples)
    else:
        random_users = random.sample(list(no_change_ids['xSubscriptionId_fk']), samples)
        
    return random_users

In [10]:
def test(cases):
    for random_user_id in random_user_ids:
        try:
            df = pd.read_csv("../dataset/users_data/{0}.csv".format(random_user_id))
            
            prediction = predict(random_user_id, df.iloc[-1:]["days_difference"].iloc[-1], df.iloc[-1:]["month"].iloc[-1])
                
            if prediction["success"]:
                print('User: {0} Expected and Predicted values for month: {1} and days: {2} are:'.format(random_user_id, df.iloc[-1:]["month"].iloc[-1], df.iloc[-1:]["days_difference"].iloc[-1]))
                if prediction["message"]:
                    pass
                    print(prediction["message"])

                print('    Medium: {0}----->{1}'.format( df.iloc[-1:]["xMeduimKw"].iloc[-1], prediction["prediction"]["medium_predicted"]))
                print('    High: {0}------->{1}'.format( df.iloc[-1:]["xHighKw"].iloc[-1], prediction["prediction"]["high_predicted"]))
                print('    Low: {0}-------->{1}'.format( df.iloc[-1:]["xLowKw"].iloc[-1], prediction["prediction"]["low_predicted"]))
        except Exception as excp:
            print("CSV not found. \n {0}".format(excp.__str__()))

### Calculate error for random users who had changes

In [11]:
random_user_ids = random_User_selector(samples=5, mode='all-changes')
test(random_user_ids)

User: 8902291 Expected and Predicted values for month: 11 and days: 74 are:
    Medium: 147----->290.5
    High: 76------->133.0
    Low: 103-------->161.3333333333
User: 118390 Expected and Predicted values for month: 7 and days: 12 are:
    Medium: 90----->90.0
    High: 0------->0.0
    Low: 0-------->0.0
User: 9495876 Expected and Predicted values for month: 6 and days: 55 are:
    Medium: 288----->193.7608695652
    High: 146------->97.5348837209
    Low: 245-------->139.6071428571
User: 117920 Expected and Predicted values for month: 6 and days: 59 are:
    Medium: 591----->248.8666666667
    High: 124------->109.2307692308
    Low: 309-------->145.75
User: 129402 Expected and Predicted values for month: 7 and days: 65 are:
    Medium: 184----->184.0
    High: 76------->76.0
    Low: 106-------->106.0


### Calculate error for random users who did not have changes

In [None]:
random_user_ids = random_User_selector(samples=5, mode='no-change')
test(random_user_ids)

User: 203274 Expected and Predicted values for month: 3 and days: 170 are:
    Medium: 13----->44.5
    High: 3------->31.0
    Low: 8-------->48.0
User: 207718 Expected and Predicted values for month: 9 and days: 49 are:
    Medium: 155----->116.3333333333
    High: 56------->50.4074074074
    Low: 56-------->54.2380952381
User: 258628 Expected and Predicted values for month: 5 and days: 72 are:
    Medium: 126----->150.6263736264
    High: 59------->60.5050505051
    Low: 111-------->104.2769230769
User: 8768160 Expected and Predicted values for month: 4 and days: 58 are:
    Medium: 593----->589.3859649123
    High: 0------->0
    Low: 0-------->0
User: 251321 Expected and Predicted values for month: 2 and days: 60 are:
    Medium: 77----->119.6136363636
    High: 28------->49.8867924528
    Low: 52-------->77.5789473684


## The Subscriptions with multiple values in xFaze

In [None]:
random_user_ids = [110003, 110019, 110037, 110059, 110063]
test(random_user_ids)

User: 110003 Expected and Predicted values for month: 8 and days: 59 are:
    Medium: 298----->176.5904761905
    High: 139------->91.2380952381
    Low: 179-------->127.71875
User: 110019 Expected and Predicted values for month: 6 and days: 1180 are:
    Medium: 1880----->1880.0
    High: 0------->0
    Low: 0-------->0
User: 110037 Expected and Predicted values for month: 1 and days: 52 are:
    Medium: 178----->380.868852459
    High: 0------->0
    Low: 0-------->0
User: 110059 Expected and Predicted values for month: 12 and days: 59 are:
    Medium: 348----->533.2
    High: 0------->0
    Low: 0-------->0
User: 110063 Expected and Predicted values for month: 4 and days: 58 are:
    Medium: 171----->169.8571428571
    High: 69------->64.2142857143
    Low: 87-------->85.375


## The Subscriptions with multiple values in xAmper

In [None]:
random_user_ids = [110003, 110059, 110078, 110079, 110082]
test(random_user_ids)

User: 110003 Expected and Predicted values for month: 8 and days: 59 are:
    Medium: 298----->176.5904761905
    High: 139------->91.2380952381
    Low: 179-------->127.71875
User: 110059 Expected and Predicted values for month: 12 and days: 59 are:
    Medium: 348----->533.2
    High: 0------->0
    Low: 0-------->0
User: 110078 Expected and Predicted values for month: 2 and days: 57 are:
    Medium: 18----->185.0909090909
    High: 14------->61.0625
    Low: 15-------->97.5882352941
User: 110079 Expected and Predicted values for month: 9 and days: 24 are:
    Medium: 6----->6.0
    High: 0------->0.0
    Low: 0-------->0.0
User: 110082 Expected and Predicted values for month: 8 and days: 58 are:
    Medium: 294----->382.125
    High: 0------->0
    Low: 0-------->0


## The Subscriptions with multiple values in xCounterBuldingNo

In [None]:
random_user_ids = [110003, 110005, 110006, 110007, 110010]
test(random_user_ids)

User: 110003 Expected and Predicted values for month: 8 and days: 59 are:
    Medium: 298----->176.5904761905
    High: 139------->91.2380952381
    Low: 179-------->127.71875
User: 110005 Expected and Predicted values for month: 6 and days: 62 are:
    Medium: 511----->455.8709677419
    High: 0------->0
    Low: 0-------->0
User: 110006 Expected and Predicted values for month: 2 and days: 118 are:
    Medium: 0----->906.0
    High: 0------->0
    Low: 0-------->0
User: 110007 Expected and Predicted values for month: 6 and days: 60 are:
    Medium: 221----->188.09375
    High: 86------->91.768115942
    Low: 134-------->138.3103448276
User: 110010 Expected and Predicted values for month: 6 and days: 56 are:
    Medium: 90----->189.6724137931
    High: 49------->82.0526315789
    Low: 76-------->131.7868852459


## The overal error on different target variables

In [None]:
def error_calculator(expected, predicted):
    error = abs(expected - predicted) / (expected + 1)
    error_percentage = error * 100
    
    return error_percentage

In [None]:
def overal_test(cases):
    medium_error_less_or_equal_5 = list()
    medium_error_5_to_10 = list()
    medium_error_10_to_15 = list()
    medium_error_15_to_20 = list()
    medium_error_20_to_25 = list()
    medium_error_greater_than_25 = list()
    
    high_error_less_or_equal_5 = list()
    high_error_5_to_10 = list()
    high_error_10_to_15 = list()
    high_error_15_to_20 = list()
    high_error_20_to_25 = list()
    high_error_greater_than_25 = list()
    
    low_error_less_or_equal_5 = list()
    low_error_5_to_10 = list()
    low_error_10_to_15 = list()
    low_error_15_to_20 = list()
    low_error_20_to_25 = list()
    low_error_greater_than_25 = list()
    
    for random_user_id in cases:
        try:
            df = pd.read_csv("../dataset/users_data/{0}.csv".format(random_user_id))
            
            prediction = predict(random_user_id, df.iloc[-1:]["days_difference"].iloc[-1], df.iloc[-1:]["month"].iloc[-1])
            if prediction["success"]:
                medium_error = error_calculator(df.iloc[-1:]["xMeduimKw"].iloc[-1], prediction["prediction"]["medium_predicted"])
                if medium_error <= 5:
                    medium_error_less_or_equal_5.append(medium_error)
                elif 5 < medium_error <= 10:
                    medium_error_5_to_10.append(medium_error)
                elif 10 < medium_error <= 15:
                    medium_error_10_to_15.append(medium_error)
                elif 15 < medium_error <= 20:
                    medium_error_15_to_20.append(medium_error)
                elif 20 < medium_error <= 25:
                    medium_error_20_to_25.append(medium_error)
                else:
                    medium_error_greater_than_25.append(medium_error)
                    
                high_error = error_calculator(df.iloc[-1:]["xHighKw"].iloc[-1], prediction["prediction"]["high_predicted"])
                if high_error <= 5:
                    high_error_less_or_equal_5.append(high_error)
                elif 5 < high_error <= 10:
                    high_error_5_to_10.append(high_error)
                elif 10 < high_error <= 15:
                    high_error_10_to_15.append(high_error)
                elif 15 < medium_error <= 20:
                    high_error_15_to_20.append(high_error)
                elif 20 < medium_error <= 25:
                    high_error_20_to_25.append(high_error)
                else:
                    high_error_greater_than_25.append(high_error)
                    
                low_error = error_calculator(df.iloc[-1:]["xLowKw"].iloc[-1], prediction["prediction"]["low_predicted"])
                if low_error <= 5:
                    low_error_less_or_equal_5.append(low_error)
                elif 5 < low_error <= 10:
                    low_error_5_to_10.append(low_error)
                elif 10 < low_error <= 15:
                    low_error_10_to_15.append(low_error)
                elif 15 < medium_error <= 20:
                    low_error_15_to_20.append(low_error)
                elif 20 < medium_error <= 25:
                    low_error_20_to_25.append(low_error)
                else:
                    low_error_greater_than_25.append(low_error)
        except Exception as excp:
            print("CSV not found. \n {0}".format(excp.__str__()))
        
    print("We have {0} predictions with less than 5% error for medium".format(str(len(medium_error_less_or_equal_5))))
    print("We have {0} predictions with less than 5% to 10% error for medium".format(str(len(medium_error_5_to_10))))
    print("We have {0} predictions with less than 10% to 15% error for medium".format(str(len(medium_error_10_to_15))))
    print("We have {0} predictions with less than 15% to 20% error for medium".format(str(len(medium_error_15_to_20))))
    print("We have {0} predictions with less than 20% to 25% error for medium".format(str(len(medium_error_20_to_25))))
    print("We have {0} predictions with more than 25% error for medium".format(str(len(medium_error_greater_than_25))))
    print("-------------------------------------------------------")
    print("We have {0} predictions with less than 5% error for high".format(str(len(high_error_less_or_equal_5))))
    print("We have {0} predictions with less than 5% to 10% error for high".format(str(len(high_error_5_to_10))))
    print("We have {0} predictions with less than 10% to 15% error for high".format(str(len(high_error_10_to_15))))
    print("We have {0} predictions with less than 15% to 20% error for high".format(str(len(high_error_15_to_20))))
    print("We have {0} predictions with less than 20% to 25% error for high".format(str(len(high_error_20_to_25))))
    print("We have {0} predictions with more than 25% error for high".format(str(len(high_error_greater_than_25))))
    print("-------------------------------------------------------")
    print("We have {0} predictions with less than 5% error for low".format(str(len(low_error_less_or_equal_5))))
    print("We have {0} predictions with less than 5% to 10% error for low".format(str(len(low_error_5_to_10))))
    print("We have {0} predictions with less than 10% to 15% error for low".format(str(len(low_error_10_to_15))))
    print("We have {0} predictions with less than 15% to 20% error for low".format(str(len(low_error_15_to_20))))
    print("We have {0} predictions with less than 20% to 25% error for low".format(str(len(low_error_20_to_25))))
    print("We have {0} predictions with more than 25% error for low".format(str(len(low_error_greater_than_25))))
    print("=======================================================")
    medium_average = statistics.mean(medium_error_less_or_equal_5) + statistics.mean(medium_error_5_to_10) + statistics.mean(medium_error_10_to_15) + statistics.mean(medium_error_15_to_20) + statistics.mean(medium_error_20_to_25) + statistics.mean(medium_error_greater_than_25)
    print("Average error for medium is: {0}".format(str(medium_average/6)))
    high_average = statistics.mean(high_error_less_or_equal_5) + statistics.mean(high_error_5_to_10) + statistics.mean(high_error_10_to_15) + statistics.mean(high_error_15_to_20) + statistics.mean(high_error_20_to_25) + statistics.mean(high_error_greater_than_25)
    print("Average error for high is: {0}".format(str(high_average/6)))
    low_average = statistics.mean(low_error_less_or_equal_5) + statistics.mean(low_error_5_to_10) + statistics.mean(low_error_10_to_15) + statistics.mean(low_error_15_to_20) + statistics.mean(low_error_20_to_25) + statistics.mean(low_error_greater_than_25)
    print("Average error for low is: {0}".format(str(low_average/6)))
    print("=======================================================")
    print("Overall average error is: {0}".format(str((medium_average + high_average + low_average)/3)))

## AllChange Overal Errors

In [None]:
overal_test(list(all_change_ids['xSubscriptionId_fk']))

## NoChange Overal Errors

In [None]:
overal_test(list(no_change_ids['xSubscriptionId_fk']))