In [249]:
import pandas as pd
import datetime as dt

import plotly.express as px


df = pd.read_csv('PD_challange_data_set.csv')

def setUpTimeData(df):
    df['datetime'] = pd.to_datetime(df['time'])
    del df["time"]

    df['just_date'] = pd.to_datetime(df['datetime'].dt.date)
    df['just_time'] = df['datetime'].dt.time
    return df


time_data = setUpTimeData(df)
print(time_data.head())




   out_door_temp  electricity_usage            datetime  just_date just_time
0      33.630833             779.96 2016-12-30 01:30:00 2016-12-30  01:30:00
1      33.280833             804.26 2016-12-30 02:00:00 2016-12-30  02:00:00
2      33.003333             743.50 2016-12-30 02:30:00 2016-12-30  02:30:00
3      32.803333             703.86 2016-12-30 03:00:00 2016-12-30  03:00:00
4      32.555000             699.72 2016-12-30 03:30:00 2016-12-30  03:30:00


In [250]:
def sum_by_day(df):
    dfByDate = df.groupby(['just_date'], as_index=False).agg({'electricity_usage': ['sum'],'out_door_temp': ['mean']})
    dfByDate.reset_index()
    dfByDate.columns = dfByDate.columns.droplevel(1)   
    dfByDate['electricity_usage_sum_byDate'] = dfByDate['electricity_usage']
    del dfByDate['electricity_usage']
    dfByDate['out_door_temp_mean_byDate'] = dfByDate['out_door_temp']
    del dfByDate['out_door_temp']
    
    return dfByDate

def build_in_day_features(df):
#     datetime.strptime("25-01-1973", "%d-%m-%Y")
#     df['just_date'] = pd.to_datetime(df['just_date'])
    df['day_num'] = df['just_date'].dt.weekday
    df['week_num'] = df['just_date'].dt.strftime("%V")
    df['year'] = df['just_date'].dt.strftime("%Y")
    df['yr_and_week_num'] = df['year'].map(str) + '-' + df['week_num'].map(str)
#     df['yr_and_week_num'] = str(df['year']) + str(df['week_num'])
    

    return df

DF_byDay = sum_by_day(df)
DF_byDay = build_in_day_features(DF_byDay)

print(DF_byDay.head())

   just_date  electricity_usage_sum_byDate  out_door_temp_mean_byDate  \
0 2016-12-30                      51800.48                  32.649648   
1 2016-12-31                      26829.30                  32.186580   
2 2017-01-01                      39417.32                  36.468906   
3 2017-01-02                      41160.22                  36.076892   
4 2017-01-03                      60663.12                  40.284983   

   day_num week_num  year yr_and_week_num  
0        4       52  2016         2016-52  
1        5       52  2016         2016-52  
2        6       52  2017         2017-52  
3        0       01  2017         2017-01  
4        1       01  2017         2017-01  


In [251]:
def organize_data_byWeek(df): 
    # remove first incomplete week and group by week
    weekDF = df.groupby('yr_and_week_num',as_index=False).agg({'electricity_usage_sum_byDate': ['sum', 'mean','std'],'out_door_temp_mean_byDate': ['mean'] })
    #format subheaders
    weekDF['electricity_usage_sum_byWeek'] = weekDF[(
        'electricity_usage_sum_byDate', 'sum')]
    weekDF['electricity_usage_mean_byWeek'] = weekDF[(
        'electricity_usage_sum_byDate', 'mean')]
    weekDF['electricity_usage_std_byWeek'] = weekDF[(
        'electricity_usage_sum_byDate', 'std')]
    weekDF['out_door_temp_mean_byWeek'] = weekDF[(
        'out_door_temp_mean_byDate', 'mean')]

    del weekDF["electricity_usage_sum_byDate"]
    del weekDF["out_door_temp_mean_byDate"]

    weekDF.reset_index()
    weekDF.columns = weekDF.columns.droplevel(1)   
    
    return weekDF

DF_byWeek = organize_data_byWeek(DF_byDay)

DF_byWeek.head()

Unnamed: 0,yr_and_week_num,electricity_usage_sum_byWeek,electricity_usage_mean_byWeek,electricity_usage_std_byWeek,out_door_temp_mean_byWeek
0,2016-52,78629.78,39314.89,17657.290712,32.418114
1,2017-01,375624.18,53660.597143,9873.645809,29.387247
2,2017-02,398794.2,56970.6,9162.547693,33.535836
3,2017-03,384035.28,54862.182857,9224.738864,38.697274
4,2017-04,392219.68,56031.382857,9847.583099,35.600893


In [252]:
def integrate_week_dateRef(week_data,day_data):    
    day_data_byMondays = day_data[day_data['day_num'] == 0]
    day_data_byMondays_filtered = day_data_byMondays.filter(
        ['just_date', 'yr_and_week_num'])
    merged_week_df_withDateRef = pd.merge(day_data_byMondays_filtered, week_data,
                    on='yr_and_week_num', how='inner')
    
    merged_week_df_withDateRef['monday_of_that_week'] = merged_week_df_withDateRef['just_date']
    del merged_week_df_withDateRef['just_date']

    return merged_week_df_withDateRef

merged_week_df_withDateRef = integrate_week_dateRef(DF_byWeek,DF_byDay)

sorteddf = merged_week_df_withDateRef.sort_values(by=['monday_of_that_week'],ascending=False)
sorteddfDown = merged_week_df_withDateRef.sort_values(by=['monday_of_that_week'],ascending=True)

print(sorteddf.head())
print(sorteddfDown.head())


    yr_and_week_num  electricity_usage_sum_byWeek  \
53          2018-01                     456006.32   
104         2018-52                     320128.40   
103         2018-51                     377066.10   
102         2018-50                     384679.98   
101         2018-49                     387423.72   

     electricity_usage_mean_byWeek  electricity_usage_std_byWeek  \
53                    57000.790000                   4936.634144   
104                   45732.628571                   6616.239013   
103                   53866.585714                  10244.616410   
102                   54954.282857                  10338.425721   
101                   55346.245714                   9181.917409   

     out_door_temp_mean_byWeek monday_of_that_week  
53                   15.768867          2018-12-31  
104                  36.946181          2018-12-24  
103                  37.240585          2018-12-17  
102                  35.354789          2018-12-10  
101    

In [253]:
# print(weekday_DF_byDay['electricity_usage_sum_bydate'].head())

def merge_week_and_day_data(DF_byDay,DF_byWeek):
#     del DF_byWeek['monday_of_that_week']
    DF_byDay_with_weekdata = pd.merge(DF_byDay,DF_byWeek, on='yr_and_week_num')

    def if_wknd(df):
        if df.day_num > 4:
            return True
        else:
            return False

    DF_byDay_with_weekdata['wknd'] = DF_byDay_with_weekdata.apply(if_wknd,axis=1)
    
    weekend_DF_byTime = DF_byDay_with_weekdata[DF_byDay_with_weekdata['wknd']==True] 

    def if_wknd_or_holiday(df):
        local_wknd_df = weekend_DF_byTime[weekend_DF_byTime['yr_and_week_num']==df['yr_and_week_num']]
        localwkndmean = local_wknd_df['electricity_usage_sum_byDate'].mean()
        if df['wknd']:
            return True
        if float(df.electricity_usage_sum_byDate) < localwkndmean + float(df.electricity_usage_std_byWeek) * 1 :
            return True
        else:
            return False
    
           
    DF_byDay_with_weekdata['wknd_or_hldy'] = DF_byDay_with_weekdata.apply(if_wknd_or_holiday,axis=1)

         
    return DF_byDay_with_weekdata

DF_byDay_with_weekdata = merge_week_and_day_data(DF_byDay, merged_week_df_withDateRef)

# weekday_DF_byTime = DF_byDay_with_weekdata[DF_byDay_with_weekdata['wknd_or_hldy']==False] 
# weekend_DF_byTime = DF_byDay_with_weekdata[DF_byDay_with_weekdata['wknd_or_hldy']==True] 

# print(len(weekend_DF_byTime),weekend_DF_byTime[1:].head()) 


In [254]:
weekday_DF_byTime = DF_byDay_with_weekdata[DF_byDay_with_weekdata['wknd_or_hldy']==False] 
weekend_DF_byTime = DF_byDay_with_weekdata[DF_byDay_with_weekdata['wknd_or_hldy']==True] 



In [256]:
weekday_DF_byTime
testDate = weekday_DF_byTime.iloc[10]['just_date']
testDate2 = time_data.iloc[10]['just_date']

print(type(testDate),type(testDate2))
specific_day_data = time_data[time_data['just_date'] == testDate.to_pydatetime() ]
print(specific_day_data.head())




<class 'pandas._libs.tslib.Timestamp'> <class 'pandas._libs.tslib.Timestamp'>
     out_door_temp  electricity_usage            datetime  just_date just_time
573      38.370833             969.32 2017-01-11 00:00:00 2017-01-11  00:00:00
574      39.152500             969.24 2017-01-11 00:30:00 2017-01-11  00:30:00
575      40.502500             948.94 2017-01-11 01:00:00 2017-01-11  01:00:00
576      41.417500             929.80 2017-01-11 01:30:00 2017-01-11  01:30:00
577      41.867500             861.02 2017-01-11 02:00:00 2017-01-11  02:00:00


In [261]:
time_data.columns



Index(['out_door_temp', 'electricity_usage', 'datetime', 'just_date',
       'just_time'],
      dtype='object')

In [262]:

from scipy import stats
from datetime import datetime
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
%matplotlib inline


print(time_data.head())
time_data = time_data.dropna()
X = [time_data['out_door_temp'],time_data['just_time']]
y = time_data['electricity_usage']

x = X.reshape(len(X), 1)
y = y.reshape(len(y), 1)

model.fit(x, y)

# plot it as in the example at http://scikit-learn.org/
plt.scatter(x, y,  color='black')
plt.plot(x, regr.predict(x), color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()



model = LinearRegression()
scores = []
kfold = KFold(n_splits=3, shuffle=True, random_state=42)
for i, (train, test) in enumerate(kfold.split(X, y)):
 model.fit(X.iloc[train,:], y.iloc[train,:])
 score = model.score(X.iloc[test,:], y.iloc[test,:])
 scores.append(score)
print(scores)

print(model.predict(32.803333))

   out_door_temp  electricity_usage            datetime  just_date just_time
0      33.630833             779.96 2016-12-30 01:30:00 2016-12-30  01:30:00
1      33.280833             804.26 2016-12-30 02:00:00 2016-12-30  02:00:00
2      33.003333             743.50 2016-12-30 02:30:00 2016-12-30  02:30:00
3      32.803333             703.86 2016-12-30 03:00:00 2016-12-30  03:00:00
4      32.555000             699.72 2016-12-30 03:30:00 2016-12-30  03:30:00


AttributeError: 'list' object has no attribute 'reshape'

In [266]:
y = time_data.electricity_usage.values
X = time_data.drop(['electricity_usage','datetime','just_date'], axis=1).values

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.3, random_state=42)
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)
print(linreg.score(X_test, y_test))
# print(classification_report(y_test, y_pred))

NameError: name 'train_test_split' is not defined