In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the pool data export; the path is relative, one directory above the working directory.
csv_path = '../pool_data_202211111622.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,pool_id,time_stamp,salinity,flow switch,flow meter,turbidity,ORP,PH,TDS,Pressure in tank,...,Shw msg,Drain,Human Counter,temperature,water level,cppm,phfeedtime,orpfeedtime,bather in from mens,bather in from ladies
0,10001,2022-11-01 11:01:16.220 -0400,287.175,,,9.5144,774.049,7.41573,287.175,,...,,0.0,0.0,74.2574,500.0,,,,,
1,10001,2022-11-01 11:01:26.323 -0400,287.175,,,9.35767,773.597,7.41573,287.175,,...,,0.0,0.0,74.2574,500.0,,,,,
2,10001,2022-11-01 11:01:36.386 -0400,287.121,,,9.35767,773.869,7.41889,287.121,,...,,0.0,0.0,74.2574,500.0,,,,,
3,10001,2022-11-01 11:01:46.498 -0400,287.248,,,8.57397,773.959,7.41193,287.248,,...,,0.0,0.0,74.2574,500.0,,,,,
4,10001,2022-11-01 11:01:56.554 -0400,287.175,,,8.88745,773.597,7.41699,287.157,,...,,0.0,0.0,74.2574,500.0,,,,,


In [5]:
# List the column labels currently present in df.
df.columns

Index(['pool_id', 'time_stamp', 'salinity', 'flow switch', 'flow meter',
       'turbidity', 'ORP', 'PH', 'TDS', 'Pressure in tank', 'Pressure in',
       'Pressure out', 'Pump RPM', 'pump current', 'dis_feeder', 'ph_feeder',
       'Fresh water', 'Shw msg', 'Drain', 'Human Counter', 'temperature',
       'water level', 'cppm', 'phfeedtime', 'orpfeedtime',
       'bather in from mens', 'bather in from ladies'],
      dtype='object')

We want to remove some of the columns we know aren't necessary.

The following columns have been deprecated by the client:
- `flow switch`
- `flow meter`
- `Pressure in tank`
- `Shw msg`
- `phfeedtime`
- `orpfeedtime`

In [6]:
# Columns the client has deprecated; they are removed before any analysis.
deprecated_col = [
    'flow switch',
    'flow meter',
    'Pressure in tank',
    'Shw msg',
    'phfeedtime',
    'orpfeedtime',
]

# `columns=` is the keyword form of `drop(labels, axis=1)`.
df = df.drop(columns=deprecated_col)
df.head()

Unnamed: 0,pool_id,time_stamp,salinity,turbidity,ORP,PH,TDS,Pressure in,Pressure out,Pump RPM,...,dis_feeder,ph_feeder,Fresh water,Drain,Human Counter,temperature,water level,cppm,bather in from mens,bather in from ladies
0,10001,2022-11-01 11:01:16.220 -0400,287.175,9.5144,774.049,7.41573,287.175,2.53581,2.53346,,...,,,0.0,0.0,0.0,74.2574,500.0,,,
1,10001,2022-11-01 11:01:26.323 -0400,287.175,9.35767,773.597,7.41573,287.175,2.53508,2.53382,,...,,,0.0,0.0,0.0,74.2574,500.0,,,
2,10001,2022-11-01 11:01:36.386 -0400,287.121,9.35767,773.869,7.41889,287.121,2.53526,2.53382,,...,,,0.0,0.0,0.0,74.2574,500.0,,,
3,10001,2022-11-01 11:01:46.498 -0400,287.248,8.57397,773.959,7.41193,287.248,2.53545,2.53382,,...,,,0.0,0.0,0.0,74.2574,500.0,,,
4,10001,2022-11-01 11:01:56.554 -0400,287.175,8.88745,773.597,7.41699,287.157,2.53508,2.53382,,...,,,0.0,0.0,0.0,74.2574,500.0,,,


Let's tidy up our data. `cppm` is calculated from ORP and thus is not necessary. We also do not have data for `bather in from mens` and `bather in from ladies`. We will remove these as well, then look at missing values.

In [7]:
# `cppm` is derived from ORP, and the two bather counters carry no data,
# so none of these columns are needed.
unnecessary_data = [
    'cppm',
    'bather in from mens',
    'bather in from ladies',
]

# `columns=` is the keyword form of `drop(labels, axis=1)`.
df = df.drop(columns=unnecessary_data)
df.columns

Index(['pool_id', 'time_stamp', 'salinity', 'turbidity', 'ORP', 'PH', 'TDS',
       'Pressure in', 'Pressure out', 'Pump RPM', 'pump current', 'dis_feeder',
       'ph_feeder', 'Fresh water', 'Drain', 'Human Counter', 'temperature',
       'water level'],
      dtype='object')

In [8]:
# Count the missing (NaN) entries in each remaining column.
missing_per_column = df.isna().sum()
missing_per_column

pool_id              0
time_stamp           0
salinity            31
turbidity           31
ORP                  0
PH                   0
TDS                 31
Pressure in         31
Pressure out        31
Pump RPM         43635
pump current        31
dis_feeder       43635
ph_feeder        43635
Fresh water         31
Drain               31
Human Counter       31
temperature          0
water level         31
dtype: int64

In [11]:
# Total number of rows, for comparison against the per-column missing counts.
df.shape[0]

43635

Comparing the missing-value counts with the number of rows in the dataframe (43,635), we see that the columns `Pump RPM`, `dis_feeder`, and `ph_feeder` are completely empty. We will drop those columns.

In [12]:
# Derive the all-NaN columns from the frame itself instead of hard-coding their
# names. For this dataset that yields 'Pump RPM', 'dis_feeder' and 'ph_feeder',
# whose missing-value counts equal the row count.
# Deriving the list also makes the cell safe to re-run: once the columns are
# gone the list is empty and the drop is a no-op rather than a KeyError.
# NOTE: `all()` is vacuously True on a zero-row frame, so every column would
# count as empty in that case.
empty_col = df.columns[df.isna().all()].tolist()

df = df.drop(columns=empty_col)
df.columns

Index(['pool_id', 'time_stamp', 'salinity', 'turbidity', 'ORP', 'PH', 'TDS',
       'Pressure in', 'Pressure out', 'pump current', 'Fresh water', 'Drain',
       'Human Counter', 'temperature', 'water level'],
      dtype='object')