<a href="https://colab.research.google.com/github/mnoorchenar/SmartMeterData/blob/main/madushan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

Info_dic = {
    'SENSORID' : 'Unique meter indemnifier',
    'CHANTYPE' : 'Channel Type (1 = Register Reading, 2 = Interval Reading)',
    'READTS' : 'Read Timestamp',
    'VAL' : 'Value/Reading',
    'STATE' : 'Reading Sate (3 = Actual, 5 = Estimate)',
    'INTV' : 'Interval length (Example: 60 min, 15 min, 30 min )',
    'UOM' : 'UOM (6 = KWH)',
    'DIR' : 'Direction (1 = Delivered/Consumed, 2 = Received/Generation)'
}


df = pd.read_csv('https://raw.githubusercontent.com/mnoorchenar/data/main/Smart_Meter_Data/utilismart_dataset2.csv')

df.columns = df.columns.str.strip() #the .str.strip() method is used to remove any whitespace from the beginning and end of the column names
mask = df['SENSORID'].apply(lambda x: '-' not in x)
df = df[mask]
df = df.dropna() #This will drop the rows where all values are NaN.

# READTS – Read Timestamp
df['READTS'] = df['READTS'].replace({"-21 " : "-2021 ", "-22 " : "-2022 "}, regex = True)
df['READTS'] = df['READTS'].replace({"JAN":'01', "FEB":'02', "MAR":'03', "APR":'04', "MAY":'05', "JUN":'06', "JUL":'07', "AUG":'08', "SEP":'09', "OCT":'10', "NOV":'11', "DEC":'12'}, regex = True)
df['READTS'] = pd.to_datetime(df['READTS'], format='%d-%m-%Y %I.%M.%S.%f %p', errors='coerce')

df['VAL'] = pd.to_numeric(df['VAL'], errors='coerce')

# 'SENSORID: Unique meter indemnifier',
df['SENSORID'] = pd.to_numeric(df['SENSORID'], errors='coerce')
print(Info_dic['SENSORID'], ': \n', df['SENSORID'].value_counts())
df = df.dropna() #This will drop the rows where all values are NaN.

# 'CHANTYPE: Channel Type (1 = Register Reading, 2 = Interval Reading)',
df['CHANTYPE'] = df['CHANTYPE'].str.strip().astype(int)
print(Info_dic['CHANTYPE'], ': \n', df['CHANTYPE'].value_counts())
df = df.dropna() #This will drop the rows where all values are NaN.

# 'STATE' : 'Reading Sate (3 = Actual, 5 = Estimate)',
df['STATE'] = df['STATE'].str.strip().astype(int)
print(Info_dic['STATE'],': \n', df['STATE'].value_counts())
# ---------------------------------------------------------------------------------
# replace values of 8 and 7 to 5 in STATE
df['STATE'] = df['STATE'].replace([8, 7], 5)
print(Info_dic['STATE'],': \n', df['STATE'].value_counts())
df = df.dropna() #This will drop the rows where all values are NaN.

# DIR – Direction (1 = Delivered/Consumed, 2 = Received/Generation)
df['DIR'] = df['DIR'].str.strip().astype(int)
print(Info_dic['DIR'], ': \n',df['DIR'].value_counts())
# select rows where DIR == 1
df = df[df['DIR'] == 1]
print(Info_dic['DIR'], ': \n',df['DIR'].value_counts())

# UOM – UOM (6 = KWH)
df['UOM'] = df['UOM'].str.strip().astype(int)
print(Info_dic['UOM'], ': \n', df['UOM'].value_counts())
# select rows where UOM == 6
df = df[df['UOM'] == 6]
print(Info_dic['UOM'], ': \n',df['UOM'].value_counts())

df = df.drop(['DIR', 'UOM'], axis=1)

# # 'INTV: Interval length (Example: 60 min, 15 min, 30 min )',
# df['INTV'] = pd.to_numeric(df['INTV'], errors='coerce')
print(Info_dic['INTV'], ': \n', df['INTV'].value_counts())

df = df.dropna() #This will drop the rows where all values are NaN.

df.reset_index(inplace=True, drop=True)

df.head()


Unique meter indemnifier : 
 313960.0     54832
1017024.0    53508
1135981.0    36481
1050725.0    25881
1049869.0    19306
Name: SENSORID, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CHANTYPE'] = df['CHANTYPE'].str.strip().astype(int)


Channel Type (1 = Register Reading, 2 = Interval Reading) : 
 2    184978
1      5030
Name: CHANTYPE, dtype: int64
Reading Sate (3 = Actual, 5 = Estimate) : 
 3    186924
5      2510
8       571
7         3
Name: STATE, dtype: int64
Reading Sate (3 = Actual, 5 = Estimate) : 
 3    186924
5      3084
Name: STATE, dtype: int64
Direction (1 = Delivered/Consumed, 2 = Received/Generation) : 
 1    190008
Name: DIR, dtype: int64
Direction (1 = Delivered/Consumed, 2 = Received/Generation) : 
 1    190008
Name: DIR, dtype: int64
UOM (6 = KWH) : 
 6    190008
Name: UOM, dtype: int64
UOM (6 = KWH) : 
 6    190008
Name: UOM, dtype: int64
Interval length (Example: 60 min, 15 min, 30 min ) : 
         15    173492
        60     11486
         0      2614
                2416
Name: INTV, dtype: int64


Unnamed: 0,SENSORID,CHANTYPE,READTS,VAL,STATE,INTV
0,313960.0,2,2021-01-01 05:00:00,0.0205,3,15
1,313960.0,2,2021-01-01 05:15:00,0.02,3,15
2,313960.0,2,2021-01-01 05:30:00,0.02,3,15
3,313960.0,2,2021-01-01 05:45:00,0.0205,3,15
4,313960.0,2,2021-01-01 06:00:00,0.02,3,15


In [36]:
ids = pd.unique(df['SENSORID'])

df_subset = df[df['SENSORID']==ids[0]]
df_subset.head()

df_register = df_subset[df_subset['CHANTYPE']==1][['READTS', 'VAL']]
df_register.reset_index(inplace=True, drop=True)
df_interval = df_subset[df_subset['CHANTYPE']==2][['READTS', 'VAL']]
df_interval.reset_index(inplace=True, drop=True)

integer_check = df_register['VAL'].apply(lambda x: float(x).is_integer()).all()
if integer_check:
    print('Register Reading is integer')
else:
    print('Register Reading is Float')

Treshold = 1

# Drop duplicate rows based on all columns
df = df.drop_duplicates()

df_register  = df_register.drop_duplicates()
df_interval  = df_interval.drop_duplicates()

df_register.reset_index(inplace=True, drop=True)
df_register['VAL'] = df_register['VAL'] - df_register['VAL'][0]

df_interval.reset_index(inplace=True, drop=True)

Register Reading is integer


In [37]:
int_reg = pd.merge(df_interval, df_register, how = 'outer', on='READTS')

# Sort the DataFrame by the 'READTS' column in ascending order
int_reg = int_reg.sort_values('READTS')

# Calculate the cumulative sum of the 'score' column
int_reg['cumulative_sum'] = int_reg['VAL_x'].cumsum()

# Fill NaN values with the previous row's values
int_reg['cumulative_sum'] = int_reg['cumulative_sum'].fillna(method='ffill')

anomaly_check = int_reg['VAL_x'] - int_reg['cumulative_sum']

int_reg = pd.merge(int_reg[['READTS', 'VAL_y', 'cumulative_sum']], df_register['READTS'], how = 'inner', on='READTS')

# Define a dictionary that maps the old column names to the new column names
new_columns = {'VAL_y': 'Register_diff', 'IntervalSum_diff': 'years_old'}

# Rename the columns of the DataFrame using the dictionary
int_reg = int_reg.rename(columns=new_columns)

In [38]:
# Compute the difference between consecutive values in the 'value' column with a lag of 1
int_reg['Register_diff'] = int_reg['Register_diff'].diff(periods=1)
int_reg['cumulative_sum'] = int_reg['cumulative_sum'].diff(periods=1)
	
# Define a threshold value
threshold = 0.5

# Create a new column called 'anomaly' with values of 1 or 0 depending on whether the difference between 'age' and 'height' is greater than the threshold
int_reg['anomaly'] = np.where(abs(int_reg['Register_diff'] - int_reg['cumulative_sum']) > threshold, 1, 0)


In [40]:
sum(int_reg['anomaly'])

561

In [17]:
# Select rows where the 'score' column has a value of 1
anomalytime = set(int_reg[int_reg['anomaly'] == 1]['READTS']).intersection(df_register['READTS'])

# Replace the rows where the 'name' column is equal to the list with 1, and all other rows with 0
df_register['anomaly'] = df_register['READTS'].isin(anomalytime).astype(int)

# int_reg.to_csv('int_reg.csv')

In [18]:
anomalytime

{Timestamp('2021-10-26 03:18:00'),
 Timestamp('2021-06-03 04:34:00'),
 Timestamp('2022-01-15 13:00:00'),
 Timestamp('2022-02-18 03:18:00'),
 Timestamp('2022-01-10 13:15:00'),
 Timestamp('2022-04-18 19:00:00'),
 Timestamp('2022-02-19 12:30:00'),
 Timestamp('2022-06-06 03:17:00'),
 Timestamp('2021-10-08 00:45:00'),
 Timestamp('2021-06-25 16:00:00'),
 Timestamp('2021-11-16 12:45:00'),
 Timestamp('2021-08-28 04:27:00'),
 Timestamp('2021-03-12 00:15:00'),
 Timestamp('2021-04-07 12:45:00'),
 Timestamp('2022-01-07 15:27:00'),
 Timestamp('2021-09-02 16:43:00'),
 Timestamp('2021-10-04 07:45:00'),
 Timestamp('2021-10-05 00:45:00'),
 Timestamp('2022-04-19 09:18:00'),
 Timestamp('2021-10-02 00:45:00'),
 Timestamp('2022-01-06 13:15:00'),
 Timestamp('2022-04-03 18:30:00'),
 Timestamp('2021-11-03 18:30:00'),
 Timestamp('2022-06-28 12:45:00'),
 Timestamp('2021-11-27 03:15:00'),
 Timestamp('2022-04-17 13:30:00'),
 Timestamp('2022-04-01 08:00:00'),
 Timestamp('2022-01-13 00:30:00'),
 Timestamp('2021-11-

In [19]:
df_register

Unnamed: 0,READTS,VAL,anomaly
0,2021-01-01 12:15:00,0.0,0
1,2021-01-01 18:15:00,0.0,0
2,2021-01-02 00:15:00,1.0,0
3,2021-01-02 04:14:00,1.0,0
4,2021-01-02 06:00:00,1.0,0
...,...,...,...
2411,2022-06-30 07:45:00,1680.0,1
2412,2022-06-30 12:45:00,1681.0,1
2413,2022-06-30 18:30:00,1683.0,1
2414,2022-07-01 00:30:00,1685.0,1
