In [26]:
## Importing Libraries ##
import pandas as pd
import matplotlib.pyplot as plt
import xlrd
import openpyxl
from datetime import datetime
from sklearn import *
import copy
import numpy as np
from numpy import nan
from scipy import stats

In [27]:
## Declaring variables ##
df = pd.read_csv('water_height_dataset.csv')
full_merged = pd.DataFrame()

In [28]:
for i in df.LOC_NAME.unique():

    ## Creating focused location ##
    test = df[df.LOC_NAME == str(i)]
    
    ## Creating the difference attribute ## 
    test['change'] = test.HT.diff()
    
    ## Dropping NA values ##
    test = test.dropna()
    
    ## Getting Z change score ##
    test['z_score_change'] = np.abs(stats.zscore(test['change']))
    
    ## Changing, and sorting values by datetime ##
    test.OBS_TIME_LOC = pd.to_datetime(test['OBS_TIME_LOC'])
    test.sort_values(by='OBS_TIME_LOC')
    
    ## Smoothing out the data ##
    cat = copy.copy(test)
    test.HT = test.HT.rolling(5).median()
    test.HT[0:4] = cat.HT[0:4]
    
    ## Merging all locations into one ##
    full_merged = pd.concat([full_merged,test],axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [29]:
## Dropping irrelvant attributes ##
full_merged = full_merged.drop(['VOID_FLAG','LOC','Unnamed: 0'],axis=1)

## Removing outliers ##
full_merged[full_merged.HT > 20] = np.nan
full_merged[full_merged.HT < 0] = np.nan

## Changing name to df ##
df = full_merged 

In [30]:
## Melting, and processing the data ##
df_concat = pd.DataFrame()

for i in df.LOC_NAME.unique():
    cf = df[df.LOC_NAME == str(i)]
    cf.OBS_TIME_LOC = pd.to_datetime(cf['OBS_TIME_LOC'])
    cf = cf.set_index('OBS_TIME_LOC')
    resampled = cf.resample('D').mean()
    df_concat = pd.concat([df_concat, resampled.HT.rename(str(i))], axis=1) 
    
df_concat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Unnamed: 0,The Pocket (Marshalls Creek),nan,Durrumbul (Brunswick River),Mullumbimby Creek (Mullumbimby Ck),Billinudgel (Marshalls Creek),Brunswick Heads (Tidal),Yelgun (Yelgun Creek),Orana Bridge (Marshalls Creek),Yelgun Creek (Helen St Bridge)
2005-11-26,1.668117,,0.653333,0.343636,0.320645,0.318966,,,
2005-11-27,1.594840,,0.651765,0.374583,0.290710,0.308333,,,
2005-11-28,1.513397,,0.752500,0.305735,0.305893,0.334118,,,
2005-11-29,1.547377,,0.665455,0.350526,0.343158,0.365370,,,
2005-11-30,1.614295,,0.646706,0.312857,0.375370,0.410000,,,
...,...,...,...,...,...,...,...,...,...
2022-07-28,0.875393,,0.797302,0.366667,0.371143,0.458660,2.261250,0.338870,0.429469
2022-07-29,0.865116,,0.777769,0.356667,0.374688,0.435212,2.258889,0.341039,0.427105
2022-07-30,0.860667,,0.761229,0.356000,0.360345,0.463385,2.250000,0.341725,0.417500
2022-07-31,0.855816,,0.749287,0.355000,0.377241,0.443635,2.250000,0.344037,0.420000


In [31]:
## Removing Na values ##
df_concat = df_concat.drop('nan',axis=1)
df_concat = df_concat.drop(['Yelgun (Yelgun Creek)','Orana Bridge (Marshalls Creek)','Yelgun Creek (Helen St Bridge)'], axis=1)
df = df_concat.dropna()

df

Unnamed: 0,The Pocket (Marshalls Creek),Durrumbul (Brunswick River),Mullumbimby Creek (Mullumbimby Ck),Billinudgel (Marshalls Creek),Brunswick Heads (Tidal)
2005-11-26,1.668117,0.653333,0.343636,0.320645,0.318966
2005-11-27,1.594840,0.651765,0.374583,0.290710,0.308333
2005-11-28,1.513397,0.752500,0.305735,0.305893,0.334118
2005-11-29,1.547377,0.665455,0.350526,0.343158,0.365370
2005-11-30,1.614295,0.646706,0.312857,0.375370,0.410000
...,...,...,...,...,...
2022-07-27,0.884857,0.822550,0.375455,0.377000,0.432415
2022-07-28,0.875393,0.797302,0.366667,0.371143,0.458660
2022-07-29,0.865116,0.777769,0.356667,0.374688,0.435212
2022-07-30,0.860667,0.761229,0.356000,0.360345,0.463385


In [32]:
## Exporting data to file ##
df.to_csv('cleaning_water_height_dataset.csv')