In [2]:
import glob
import random
import datetime
import os, fnmatch
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
# from sklearn.externals import joblib

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.python.keras.models import model_from_json
from tensorflow.python.keras import optimizers


%matplotlib inline

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [3]:
df = pd.read_csv("../dataset/dbBills_cleaned_1000.csv")

df = df.drop(df.columns[[0]], axis=1)
df = df[~df.isin([np.nan, np.inf, -np.inf]).any(1)]
df = df.fillna(0)

# df = df.sample(1000)
# df.to_csv("../dataset/dbBills_cleaned_1000.csv")

df.head()

Unnamed: 0,xCycleCode,xFamilyNum,xFaze,xAmper,xCounterBuldingNo,xRegionName_Roustaei,xRegionName_Shahri,xUsageGroupName_Keshavarzi,xUsageGroupName_Khanegi,xUsageGroupName_Omoomi,xUsageGroupName_Sanati,xUsageGroupName_Sayer,xBakhshCode_1,xBakhshCode_2,xBakhshCode_4,xTimeControlCode_1,xTimeControlCode_2,xTimeControlCode_3,xTariffOldCode_1010,xTariffOldCode_1011,xTariffOldCode_1110,xTariffOldCode_1111,xTariffOldCode_1990,xTariffOldCode_2110,xTariffOldCode_2210,xTariffOldCode_2310,xTariffOldCode_2410,xTariffOldCode_2510,xTariffOldCode_2610,xTariffOldCode_2710,xTariffOldCode_2990,xTariffOldCode_2992,xTariffOldCode_3110,xTariffOldCode_3210,xTariffOldCode_3310,xTariffOldCode_3410,xTariffOldCode_3520,xTariffOldCode_3540,xTariffOldCode_3740,xTariffOldCode_3991,xTariffOldCode_4410,xTariffOldCode_4610,xTariffOldCode_4990,xTariffOldCode_5110,xTariffOldCode_5990,days_difference,month,mediumDailyUsage,highDailyUsage,lowDailyUsage,xMeduimKw,xHighKw,xLowKw
0,6,1,1,25,76421105,0,1,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,21,9,16.428571,0.0,0.0,345,0,0
1,45,1,1,25,80202981,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,57,6,6.649123,0.0,0.0,379,0,0
2,7,1,1,25,890064710,0,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,63,2,6.619048,2.396825,3.444444,417,151,217
3,6,1,1,25,291050952,0,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,55,11,5.218182,2.436364,3.8,287,134,209
4,36,1,1,25,76420930,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,58,2,11.224138,0.0,0.0,651,0,0


In [56]:
def create_model(target):

    features = ['xCycleCode', 'xFamilyNum', 'xFaze', 'xAmper', 'xRegionName_Roustaei',
                'xRegionName_Shahri', 'xUsageGroupName_Keshavarzi', 'xUsageGroupName_Khanegi',
                'xUsageGroupName_Omoomi', 'xUsageGroupName_Sanati', 'xUsageGroupName_Sayer',
                'xBakhshCode_1', 'xBakhshCode_2', 'xBakhshCode_4',
                'xTimeControlCode_1', 'xTimeControlCode_2', 'xTimeControlCode_3',
                'xTariffOldCode_1010', 'xTariffOldCode_1011', 'xTariffOldCode_1110',
                'xTariffOldCode_1111', 'xTariffOldCode_1990', 'xTariffOldCode_2110',
                'xTariffOldCode_2210', 'xTariffOldCode_2310', 'xTariffOldCode_2410',
                'xTariffOldCode_2510', 'xTariffOldCode_2610', 'xTariffOldCode_2710',
                'xTariffOldCode_2990', 'xTariffOldCode_2992', 'xTariffOldCode_3110',
                'xTariffOldCode_3210', 'xTariffOldCode_3310', 'xTariffOldCode_3410', 
                'xTariffOldCode_3520', 'xTariffOldCode_3540', 'xTariffOldCode_3740', 
                'xTariffOldCode_3991', 'xTariffOldCode_4410', 'xTariffOldCode_4610', 
                'xTariffOldCode_4990', 'xTariffOldCode_5110', 'xTariffOldCode_5990',
                'days_difference', 'month']

    X = df[features]
    X = np.matrix(X.values.tolist())
    y = df[target]
    
    y=y.values.reshape(-1,1)
    scaler = MinMaxScaler()
    
    print(scaler.fit(y))
    xscale=scaler.transform(X)
    yscale=scaler.transform(y)
    
    X_train, X_test, y_train, y_test = train_test_split(xscale, yscale)
        
    model = Sequential()
    model.add(Dense(20, input_dim=46, kernel_initializer='normal', activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='relu'))
    model.summary()
        
    model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])
    model.fit(X_train, y_train, epochs=150, batch_size=50,  verbose=0, validation_split=0.2)
    
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

    model_json = model.to_json()
    with open("{0}.json".format(target), "w") as json_file:
        json_file.write(model_json)
    model.save_weights("{0}.h5".format(target))
    print("Saved model to disk")

In [57]:
create_model("mediumDailyUsage")
create_model("highDailyUsage")
create_model("lowDailyUsage")

MinMaxScaler(copy=True, feature_range=(0, 1))
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_25 (Dense)             (None, 20)                940       
_________________________________________________________________
dense_26 (Dense)             (None, 20)                420       
_________________________________________________________________
dense_27 (Dense)             (None, 20)                420       
_________________________________________________________________
dense_28 (Dense)             (None, 10)                210       
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 11        
Total params: 2,001
Trainable params: 2,001
Non-trainable params: 0
_________________________________________________________________
mean_squared_error: 5.81%
Saved model to disk
MinMaxScaler(copy=True, feature_range=(0, 1))
__

In [58]:
df = df.sample(20)
df

Unnamed: 0,xCycleCode,xFamilyNum,xFaze,xAmper,xCounterBuldingNo,xRegionName_Roustaei,xRegionName_Shahri,xUsageGroupName_Keshavarzi,xUsageGroupName_Khanegi,xUsageGroupName_Omoomi,xUsageGroupName_Sanati,xUsageGroupName_Sayer,xBakhshCode_1,xBakhshCode_2,xBakhshCode_4,xTimeControlCode_1,xTimeControlCode_2,xTimeControlCode_3,xTariffOldCode_1010,xTariffOldCode_1011,xTariffOldCode_1110,xTariffOldCode_1111,xTariffOldCode_1990,xTariffOldCode_2110,xTariffOldCode_2210,xTariffOldCode_2310,xTariffOldCode_2410,xTariffOldCode_2510,xTariffOldCode_2610,xTariffOldCode_2710,xTariffOldCode_2990,xTariffOldCode_2992,xTariffOldCode_3110,xTariffOldCode_3210,xTariffOldCode_3310,xTariffOldCode_3410,xTariffOldCode_3520,xTariffOldCode_3540,xTariffOldCode_3740,xTariffOldCode_3991,xTariffOldCode_4410,xTariffOldCode_4610,xTariffOldCode_4990,xTariffOldCode_5110,xTariffOldCode_5990,days_difference,month,mediumDailyUsage,highDailyUsage,lowDailyUsage,xMeduimKw,xHighKw,xLowKw
597,33,1,1,15,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,69,3,17.0,0.0,0.0,1173,0,0
518,7,1,1,15,488210803,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58,10,0.137931,0.241379,0.396552,8,14,23
33,37,1,1,15,0,1,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,174,6,3.350575,0.0,0.0,583,0,0
778,8,1,1,25,2219420095237,0,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,57,4,3.122807,1.666667,2.982456,178,95,170
318,43,1,1,25,76657069,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,62,8,2.532258,0.0,0.0,157,0,0
35,32,1,1,25,62694460,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,63,3,0.47619,0.174603,0.031746,30,11,2
85,11,1,1,25,78011404,0,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,56,7,4.339286,0.0,0.0,243,0,0
131,8,1,1,25,30088905,0,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,64,12,5.359375,2.3125,4.109375,343,148,263
577,15,1,1,25,2219220582116,0,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,61,9,1.704918,0.95082,1.229508,104,58,75
89,7,1,1,25,790098127,0,1,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,63,8,0.0,0.015873,0.0,0,1,0


In [4]:
features = ['xCycleCode', 'xFamilyNum', 'xFaze', 'xAmper', 'xRegionName_Roustaei',
            'xRegionName_Shahri', 'xUsageGroupName_Keshavarzi', 'xUsageGroupName_Khanegi',
            'xUsageGroupName_Omoomi', 'xUsageGroupName_Sanati', 'xUsageGroupName_Sayer',
            'xBakhshCode_1', 'xBakhshCode_2', 'xBakhshCode_4',
            'xTimeControlCode_1', 'xTimeControlCode_2', 'xTimeControlCode_3',
            'xTariffOldCode_1010', 'xTariffOldCode_1011', 'xTariffOldCode_1110',
            'xTariffOldCode_1111', 'xTariffOldCode_1990', 'xTariffOldCode_2110',
            'xTariffOldCode_2210', 'xTariffOldCode_2310', 'xTariffOldCode_2410',
            'xTariffOldCode_2510', 'xTariffOldCode_2610', 'xTariffOldCode_2710',
            'xTariffOldCode_2990', 'xTariffOldCode_2992', 'xTariffOldCode_3110',
            'xTariffOldCode_3210', 'xTariffOldCode_3310', 'xTariffOldCode_3410', 
            'xTariffOldCode_3520', 'xTariffOldCode_3540', 'xTariffOldCode_3740', 
            'xTariffOldCode_3991', 'xTariffOldCode_4410', 'xTariffOldCode_4610', 
            'xTariffOldCode_4990', 'xTariffOldCode_5110', 'xTariffOldCode_5990',
            'days_difference', 'month']
X = df[features]
X = np.matrix(X.values.tolist())
y_medium = df["mediumDailyUsage"]
y_high = df["highDailyUsage"]
y_low = df["lowDailyUsage"]

medium_json_file = open('mediumDailyUsage.json', 'r')
medium_loaded_model_json = medium_json_file.read()
medium_json_file.close()
medium_loaded_model = model_from_json(medium_loaded_model_json)

high_json_file = open('highDailyUsage.json', 'r')
high_loaded_model_json = high_json_file.read()
high_json_file.close()
high_loaded_model = model_from_json(high_loaded_model_json)

low_json_file = open('lowDailyUsage.json', 'r')
low_loaded_model_json = low_json_file.read()
low_json_file.close()
low_loaded_model = model_from_json(low_loaded_model_json)

medium_loaded_model.load_weights("mediumDailyUsage.h5")
high_loaded_model.load_weights("highDailyUsage.h5")
low_loaded_model.load_weights("lowDailyUsage.h5")

print("Loaded model from disk")


print(medium_loaded_model.predict(X))
medium_loaded_model.compile(loss='mse', optimizer='adam', metrics=['mse','mae'])

scores = medium_loaded_model.evaluate(X, y_medium, verbose=0)
print("%s: %.2f%%" % (medium_loaded_model.metrics_names[1], scores[1]*100))

Loaded model from disk
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]

In [5]:
from sklearn.ensemble import GradientBoostingRegressor

xTrain, xTest, yTrain, yTest = train_test_split(X, y_medium, test_size = 0.1, random_state = 0)

est = GradientBoostingRegressor(n_estimators=600, learning_rate=0.1,
                                max_depth=3, random_state=0, loss='ls').fit(xTrain, yTrain)

print(mean_squared_error(yTest, est.predict(xTest)))

print(yTest)
print(est.predict(xTest))

21.19469646177769
993     6.473684
859     3.211538
298    12.000000
553     7.000000
672     4.983871
971     3.116667
27      0.000000
231     6.708861
306     0.060606
706     4.961538
496     8.595041
558     2.245614
784     2.740741
239     4.584615
578     3.200000
55      1.879310
906    15.935484
175     3.000000
14      2.087719
77      2.760563
31      2.372881
481     6.709677
310     4.626016
311     0.159091
883     2.191176
788     2.734694
45      1.316667
103     0.461538
760    12.890411
1       6.649123
823     1.237288
710     1.407407
614     9.915254
790     7.042857
408     8.419355
736     6.086207
957     2.339623
366     5.203125
918     0.000000
267     8.652778
230     1.689655
996     7.741379
635     6.626506
698     0.040541
251     3.500000
783     9.030769
819     1.866667
141     4.016667
316     3.213115
587     2.328571
331    30.190476
295     0.470588
262     5.140845
432     3.485714
862     0.037736
582     2.078740
272     0.000000
270     3.596

In [24]:
from sklearn import tree
from sklearn.metrics import mean_squared_error

est = tree.DecisionTreeRegressor()
est.fit(xTrain, yTrain)

print(mean_squared_error(yTest, est.predict(xTest)))

print(yTest)
print(est.predict(xTest))

20.901576901429852
823     1.237288
174     0.947368
325     4.451220
464    10.451613
428     0.240741
890     8.176471
377     9.590164
139     5.226415
349     0.019231
266     2.031250
762     3.809524
291     2.939394
819     1.866667
167     4.857143
143     7.840000
11      5.054545
942     1.490566
18      3.114754
341    11.984375
634     6.383333
694     4.968254
491     2.205479
520     0.016129
470     4.066667
735     3.100000
98      3.117647
538     3.566667
645     5.019231
811     2.680000
613     0.036364
870     5.116667
255     1.962963
184     0.000000
982     6.982143
901    15.283333
866     9.267857
738    10.736842
381     7.111111
303     4.723684
183     2.975000
225     4.685185
471     0.000000
631     3.324468
287     0.153846
771    13.036364
577     1.704918
965     2.686275
554     0.250000
60      7.835821
767     2.492537
800     1.600000
479     0.329412
973     7.037037
593     7.754098
483     1.866667
201     8.321429
189     0.000000
636     4.83

In [6]:
from sklearn import svm

xTrain, xTest, yTrain, yTest = train_test_split(X, y_medium, test_size = 0.01, random_state = 0)

clf = svm.SVR(C=12.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)
clf.fit(xTrain, yTrain)

print(mean_squared_error(yTest, clf.predict(xTest)))

print(yTest)
print(clf.predict(xTest))



12.315747772482847
993     6.473684
859     3.211538
298    12.000000
553     7.000000
672     4.983871
971     3.116667
27      0.000000
231     6.708861
306     0.060606
706     4.961538
Name: mediumDailyUsage, dtype: float64
[6.19891051 2.03836584 3.89939241 3.08228591 8.27936121 2.29945004
 1.74326604 5.18590276 4.83778253 5.97614411]
