In [6]:
import glob
import random
import datetime
import os, fnmatch
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn import svm

import joblib

%matplotlib inline

# Lift pandas' display truncation limits so wide/long frames render in full.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

In [3]:
# Load the cleaned billing dataset (path is relative to the notebook's
# working directory).
df = pd.read_csv("../dataset/dbBills_cleaned.csv")

# Discard the first column of the file (selected by position, not by name).
df = df.drop(columns=df.columns[0])

# Keep only rows in which no column holds NaN or +/-inf. Infinities are
# mapped to NaN first so that a single dropna() removes both kinds of value.
# This replaces `df.isin([...]).any(1)`: passing the axis positionally to
# DataFrame.any() is rejected by pandas 2.x, where `axis` is keyword-only.
# The former `fillna(0)` that followed is dropped because no NaN can remain
# after this step, so it never changed anything.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# For quick experiments a subsample can be taken here, e.g. df.sample(1000),
# and optionally written next to the full dataset.

df.head()

Unnamed: 0,xSubscriptionId_fk,xCycleCode,xFamilyNum,xFaze,xAmper,xCounterBuldingNo,xRegionName_Roustaei,xRegionName_Shahri,xUsageGroupName_Keshavarzi,xUsageGroupName_Khanegi,xUsageGroupName_Omoomi,xUsageGroupName_Sanati,xUsageGroupName_Sayer,xBakhshCode_1,xBakhshCode_2,xBakhshCode_4,xTimeControlCode_1,xTimeControlCode_2,xTimeControlCode_3,xTariffOldCode_1010,xTariffOldCode_1011,xTariffOldCode_1110,xTariffOldCode_1111,xTariffOldCode_1990,xTariffOldCode_2110,xTariffOldCode_2210,xTariffOldCode_2310,xTariffOldCode_2410,xTariffOldCode_2510,xTariffOldCode_2610,xTariffOldCode_2710,xTariffOldCode_2990,xTariffOldCode_2992,xTariffOldCode_3110,xTariffOldCode_3210,xTariffOldCode_3310,xTariffOldCode_3410,xTariffOldCode_3520,xTariffOldCode_3540,xTariffOldCode_3740,xTariffOldCode_3991,xTariffOldCode_4410,xTariffOldCode_4610,xTariffOldCode_4990,xTariffOldCode_5110,xTariffOldCode_5990,days_difference,month,mediumDailyUsage,highDailyUsage,lowDailyUsage,xMeduimKw,xHighKw,xLowKw
0,9397665,25,1,3,25,12606909,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,8,38.428571,0.0,0.0,538,0,0
1,9396214,44,1,3,25,8336853341,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,21,8,95.190476,0.0,0.0,1999,0,0
2,9396214,44,1,3,25,8336853341,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,9,0.0,0.0,0.0,0,0,0
3,8952093,32,1,3,25,37337459,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,68,11,15.661765,0.0,0.0,1065,0,0
4,8952093,32,1,3,25,37337459,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,62,2,9.032258,0.0,0.0,560,0,0


In [4]:
# Columns used as model inputs: the numeric meter attributes, the one-hot
# encoded region / usage-group / bakhsh / time-control / tariff indicators,
# and the two billing-period columns. The usage columns are deliberately
# absent because they are the prediction targets below.
features = ['xCycleCode', 'xFamilyNum', 'xFaze', 'xAmper', 'xRegionName_Roustaei',
            'xRegionName_Shahri', 'xUsageGroupName_Keshavarzi', 'xUsageGroupName_Khanegi',
            'xUsageGroupName_Omoomi', 'xUsageGroupName_Sanati', 'xUsageGroupName_Sayer',
            'xBakhshCode_1', 'xBakhshCode_2', 'xBakhshCode_4',
            'xTimeControlCode_1', 'xTimeControlCode_2', 'xTimeControlCode_3',
            'xTariffOldCode_1010', 'xTariffOldCode_1011', 'xTariffOldCode_1110',
            'xTariffOldCode_1111', 'xTariffOldCode_1990', 'xTariffOldCode_2110',
            'xTariffOldCode_2210', 'xTariffOldCode_2310', 'xTariffOldCode_2410',
            'xTariffOldCode_2510', 'xTariffOldCode_2610', 'xTariffOldCode_2710',
            'xTariffOldCode_2990', 'xTariffOldCode_2992', 'xTariffOldCode_3110',
            'xTariffOldCode_3210', 'xTariffOldCode_3310', 'xTariffOldCode_3410',
            'xTariffOldCode_3520', 'xTariffOldCode_3540', 'xTariffOldCode_3740',
            'xTariffOldCode_3991', 'xTariffOldCode_4410', 'xTariffOldCode_4610',
            'xTariffOldCode_4990', 'xTariffOldCode_5110', 'xTariffOldCode_5990',
            'days_difference', 'month']

# Feature matrix as a plain 2-D ndarray (one row per record, one column per
# entry of `features`). A deprecated np.matrix is intentionally avoided:
# recent scikit-learn releases refuse np.matrix input, and building it via
# .values.tolist() copied the whole frame through Python lists for nothing.
X = df[features].to_numpy()

# One target series per usage band; each is modelled separately.
y_medium = df["mediumDailyUsage"]
y_high = df["highDailyUsage"]
y_low = df["lowDailyUsage"]

In [None]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
xTrain, xTest, yTrain, yTest = train_test_split(X, y_medium, test_size=0.1, random_state=0)

# Make sure the output directory exists BEFORE the (potentially very long)
# fit, so a missing folder cannot make the final dump fail after training.
model_path = '../models/' + 'SVR-Medium' + '.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# RBF-kernel support vector regressor for the medium-band daily usage.
# NOTE(review): the features are passed to the kernel unscaled although
# MinMaxScaler and Pipeline are imported at the top of the notebook --
# confirm whether a scaling step was intended here.
clf = svm.SVR(C=12.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, gamma='auto', verbose=False)
clf.fit(xTrain, yTrain)

# Mean squared error on the held-out rows.
print(mean_squared_error(yTest, clf.predict(xTest)))

# Persist the fitted estimator for later reuse.
joblib.dump(clf, model_path)