In [1]:
import pandas as pd
import os
import functions
import psycopg2

In [None]:
# delete rows in wind_data_final where wind_speed is null
# The password is read from the environment so it never appears in the notebook.
postgres_password = os.environ['POSTGRES_PASS']
con = psycopg2.connect(
    database='lhl_capstone_project',
    user='postgres',
    password=postgres_password,
    host='localhost',
    port='5432'
)

sql = """
DELETE
FROM wind_data_final
WHERE wind_speed IS NULL;
"""

try:
    # The cursor context manager closes the cursor when the block exits.
    with con.cursor() as cursor:
        cursor.execute(sql)
    con.commit()
finally:
    # Always release the connection. If execute() raised before the commit,
    # closing discards the pending DELETE instead of leaving a session open.
    con.close()
print("Successfully deleted rows where wind_speed is null")

In [2]:
# delete rows in solar_data_final where solar_radiation is null
# The password is read from the environment so it never appears in the notebook.
postgres_password = os.environ['POSTGRES_PASS']
con = psycopg2.connect(
    database='lhl_capstone_project',
    user='postgres',
    password=postgres_password,
    host='localhost',
    port='5432'
)

sql = """
DELETE
FROM solar_data_final
WHERE solar_radiation IS NULL;
"""

try:
    # The cursor context manager closes the cursor when the block exits.
    with con.cursor() as cursor:
        cursor.execute(sql)
    con.commit()
finally:
    # Always release the connection. If execute() raised before the commit,
    # closing discards the pending DELETE instead of leaving a session open.
    con.close()
print("Successfully deleted rows where solar_radiation is null")

Successfully deleted rows where solar_radiation is null


In [None]:
# check for negative values in solar_radiation
# Read-only: this cell only counts the offending rows. The actual DELETE lives
# in its own cell so that "checking" can never destroy data.
postgres_password = os.environ['POSTGRES_PASS']
con = psycopg2.connect(
    database='lhl_capstone_project',
    user='postgres',
    password=postgres_password,
    host='localhost',
    port='5432'
)

sql = """
SELECT COUNT(*)
FROM solar_data_final
WHERE solar_radiation < 0;
"""

try:
    # The cursor context manager closes the cursor when the block exits.
    with con.cursor() as cursor:
        cursor.execute(sql)
        # COUNT(*) always yields exactly one row with one column.
        negative_count = cursor.fetchone()[0]
finally:
    # Nothing to commit for a SELECT; just release the connection.
    con.close()
print(f"Number of rows where solar_radiation is negative: {negative_count}")

Check for rows where solar_radiation is negative:
```
SELECT COUNT(*)
FROM solar_data_final
WHERE solar_radiation < 0;
```

Output: 58129

In [3]:
# drop rows where solar_radiation is negative
# The password is read from the environment so it never appears in the notebook.
postgres_password = os.environ['POSTGRES_PASS']
con = psycopg2.connect(
    database='lhl_capstone_project',
    user='postgres',
    password=postgres_password,
    host='localhost',
    port='5432'
)

sql = """
DELETE
FROM solar_data_final
WHERE solar_radiation < 0;
"""

try:
    # The cursor context manager closes the cursor when the block exits.
    with con.cursor() as cursor:
        cursor.execute(sql)
    con.commit()
finally:
    # Always release the connection. If execute() raised before the commit,
    # closing discards the pending DELETE instead of leaving a session open.
    con.close()
print("Successfully deleted rows where solar_radiation is negative")

Successfully deleted rows where solar_radiation is negative


Check for rows where wind_speed is negative:
```
SELECT COUNT(*)
FROM wind_data_final
WHERE wind_speed < 0;
```

Output: 39243

In [4]:
# drop rows where wind_speed is negative
# The password is read from the environment so it never appears in the notebook.
postgres_password = os.environ['POSTGRES_PASS']
con = psycopg2.connect(
    database='lhl_capstone_project',
    user='postgres',
    password=postgres_password,
    host='localhost',
    port='5432'
)

sql = """
DELETE
FROM wind_data_final
WHERE wind_speed < 0;
"""

try:
    # The cursor context manager closes the cursor when the block exits.
    with con.cursor() as cursor:
        cursor.execute(sql)
    con.commit()
finally:
    # Always release the connection. If execute() raised before the commit,
    # closing discards the pending DELETE instead of leaving a session open.
    con.close()
print("Successfully deleted rows where wind_speed is negative")

Successfully deleted rows where wind_speed is negative


### Examine column stats to inform null value handling for air_pressure, air_temperature, and relative_humidity

In [2]:
# Summarise each weather column of wind_data_final before deciding how to
# treat its nulls; the helper's result is printed under a "max, min, avg" label.
weather_columns = ('air_pressure', 'air_temperature', 'relative_humidity')
for column_name in weather_columns:
    stats = functions.max_min_avg('wind_data_final', column_name)
    print(f"{column_name} max, min, avg: {stats}")

Fetching results for air_pressure...
air_pressure max, min, avg: (1323.0, -10000.0, 879.7074028176639)
Fetching results for air_temperature...
air_temperature max, min, avg: (99.6, -7999.0, 7.627473827299756)
Fetching results for relative_humidity...
relative_humidity max, min, avg: (950.0, -8190.0, 72.48144837430874)


Count rows where values exceed reasonable ranges:
- relative_humidity is less than zero or greater than 100
- air_temperature is greater than 50 or less than -60
    - highest recorded temperature ever in Canada was 49.6 C (Lytton, BC, June 29 2021)
    - lowest recorded temperature in Canada was -63.0 C (Takhini, YK, Feb 3 1947)
- air_pressure is greater than 1100 or less than 900
    - 1071.9 mb highest recorded (Whitehorse, YK, Feb 1989)
    - 940.2 mb lowest recorded (St Anthony, NL, March 13 2022)

SQL queries:

```
SELECT COUNT(air_pressure)
FROM wind_data_final
WHERE air_pressure > 1100 OR air_pressure < 900
;

SELECT COUNT(air_temperature)
FROM wind_data_final
WHERE air_temperature > 50 OR air_temperature < -60
;

SELECT COUNT(relative_humidity)
FROM wind_data_final
WHERE relative_humidity > 100 OR relative_humidity < 0
;

SELECT COUNT(*)
FROM wind_data_final
;
```

Counts:
- air_temperature: 25035
- air_pressure: 1863499
- relative_humidity: 943685
- total rows: 39981579

In [3]:
# drop rows where values are out of reasonable range:
# The password is read from the environment so it never appears in the notebook.
postgres_password = os.environ['POSTGRES_PASS']
con = psycopg2.connect(
    database='lhl_capstone_project',
    user='postgres',
    password=postgres_password,
    host='localhost',
    port='5432'
)

# A NULL in any of these columns makes its comparison evaluate to NULL rather
# than true, so a row is only removed when at least one non-null value falls
# outside its range; rows whose only problem is missing data are kept.
sql = """
DELETE
FROM wind_data_final
WHERE (air_pressure > 1100 OR air_pressure < 900)
    OR (air_temperature > 50 OR air_temperature < -60)
    OR (relative_humidity > 100 OR relative_humidity < 0);
"""

try:
    # The cursor context manager closes the cursor when the block exits.
    with con.cursor() as cursor:
        cursor.execute(sql)
    con.commit()
finally:
    # Always release the connection. If execute() raised before the commit,
    # closing discards the pending DELETE instead of leaving a session open.
    con.close()
print("Successfully deleted rows where variables were out of reasonable range")

Successfully deleted rows where variables were out of reasonable range


Number of rows deleted: 2571092

In [4]:
# Re-run the per-column summary after the out-of-range rows were removed,
# to confirm the extremes now sit inside the accepted bounds.
table_name = 'wind_data_final'
for column_name in ('air_pressure', 'air_temperature', 'relative_humidity'):
    stats = functions.max_min_avg(table_name, column_name)
    print(f"{column_name} max, min, avg: {stats}")

Fetching results for air_pressure...
air_pressure max, min, avg: (1100.0, 900.0, 949.014924843145)
Fetching results for air_temperature...
air_temperature max, min, avg: (49.98, -59.64, 8.066938445934394)
Fetching results for relative_humidity...
relative_humidity max, min, avg: (100.0, 0.0, 72.00040252533972)


*Note: these values are still rather suspicious, especially those for air_pressure. A more thorough investigation is warranted, but in the interest of time, outlier values have simply been removed.*

### Fill null values in air_pressure, air_temperature, relative_humidity with averages

In [2]:
# Report how many nulls each weather column holds before any filling is done.
columns_to_check = ['air_pressure', 'air_temperature', 'relative_humidity']
for column_name in columns_to_check:
    null_count = functions.count_na('wind_data_final', column_name)
    print(f"Number of null values in {column_name}: {null_count}")

Counting null values for air_pressure...
Number of null values in air_pressure: 34482464
Counting null values for air_temperature...
Number of null values in air_temperature: 708658
Counting null values for relative_humidity...
Number of null values in relative_humidity: 1735228


In [2]:
# Apply the project's fill helper to each weather column in turn.
# NOTE(review): judging by its name and log output, the helper computes the
# column average and writes it into the null cells -- confirm in functions.py.
target_table = 'wind_data_final'
for column_name in ('air_pressure', 'air_temperature', 'relative_humidity'):
    functions.fill_na_with_avg(target_table, column_name)

Calculating average for air_pressure...
Filling null values for air_pressure...
Calculating average for air_temperature...
Filling null values for air_temperature...
Calculating average for relative_humidity...
Filling null values for relative_humidity...


In [3]:
# Verify the fill step: every column should now report zero nulls.
table_name = 'wind_data_final'
for column_name in ('air_pressure', 'air_temperature', 'relative_humidity'):
    null_count = functions.count_na(table_name, column_name)
    print(f"Number of null values in {column_name}: {null_count}")

Counting null values for air_pressure...
Number of null values in air_pressure: 0
Counting null values for air_temperature...
Number of null values in air_temperature: 0
Counting null values for relative_humidity...
Number of null values in relative_humidity: 0


Export database tables to .csv, then begin visualizations in Tableau.