# Intro to Data Management with Python
# Sketch an analysis for Solar Power Generation Dataset

Kritkorn Supyen (Data Engineering, Jacobs University Bremen)

# 1. Initialize Data

In [1]:
import mysql.connector
import datamanagement as dm
import re
import json

## 1.1 Connect to MySQL server

In [2]:
# Read the MySQL connection parameters from a JSON file kept next to the
# notebook, so the credentials are not written into the notebook itself.
# The `with` block closes the file even if json.load() raises.
with open('MySQLparameter.json') as parameters_file:
    parameters_value = json.load(parameters_file)

# The JSON entries are forwarded unchanged as keyword arguments; the helper
# hands back the connection and a cursor used by the cells below.
db_connection, mycursor = dm.connectServer(**parameters_value)

## 1.2 Create database

Create a database named PowerGeneration

In [3]:
# Name of the database that the rest of the notebook works in.
database_name = 'PowerGeneration'

# Ask the helper to create the database, then echo the status text it returns.
creation_status = dm.createDatabase(mycursor, database_name)
print(creation_status)

Command excuted - CREATE DATABASE PowerGeneration;


## 1.3 Use database

Use the database PowerGeneration

In [4]:
# Switch the cursor to the database created above and echo the helper's
# status text.
use_status = dm.useDatabase(mycursor, database_name)
print(use_status)

Command excuted - USE PowerGeneration


## 1.4 Create tables

Create 4 tables for 4 csv files.
- Plant_1_Generation_Data.csv
- Plant_1_Weather_Sensor_Data.csv
- Plant_2_Generation_Data.csv
- Plant_2_Weather_Sensor_Data.csv

In [5]:
# Create one table per CSV file; each table is named after its file
# (the part of the file name before the first dot).
# NOTE(review): file_path is assigned but not used in this cell - the files
# are opened by bare name. Confirm that this is intended.
file_path = '/data/PowerGeneration/'
file_names = [
    'Plant_1_Generation_Data.csv',
    'Plant_1_Weather_Sensor_Data.csv',
    'Plant_2_Generation_Data.csv',
    'Plant_2_Weather_Sensor_Data.csv',
]
for file_name in file_names:
    # Open the file with ',' as the delimiter; the helper returns the file
    # object together with an iterator over its rows.
    file, csv_data = dm.openFile(file_name, ',')
    try:
        # The first row holds the column names.
        column_names = next(csv_data)
    finally:
        # Only the header row is needed here, so release the handle at once
        # instead of leaving all four files open.
        # NOTE(review): assumes `file` is a regular file object - confirm
        # against dm.openFile.
        file.close()

    # Table name = file name without its extension.
    table_name = file_name.split('.')[0]

    # Create the table and echo the helper's status text.
    print(dm.createTable(mycursor, column_names, table_name))
    

Command excuted - CREATE TABLE Plant_1_Generation_Data (
        DATE_TIME TIMESTAMP,
        PLANT_ID VARCHAR(512),
        SOURCE_KEY VARCHAR(512),
        DC_POWER DOUBLE UNSIGNED,
        AC_POWER DOUBLE UNSIGNED,
        DAILY_YIELD DOUBLE UNSIGNED,
        TOTAL_YIELD DECIMAL(15,5) UNSIGNED   
        )
Command excuted - CREATE TABLE Plant_1_Weather_Sensor_Data (
        DATE_TIME TIMESTAMP,
        PLANT_ID VARCHAR(512),
        SOURCE_KEY VARCHAR(512),
        AMBIENT_TEMPERATURE DOUBLE,
        MODULE_TEMPERATURE DOUBLE,
        IRRADIATION DOUBLE  
        )
Command excuted - CREATE TABLE Plant_2_Generation_Data (
        DATE_TIME TIMESTAMP,
        PLANT_ID VARCHAR(512),
        SOURCE_KEY VARCHAR(512),
        DC_POWER DOUBLE UNSIGNED,
        AC_POWER DOUBLE UNSIGNED,
        DAILY_YIELD DOUBLE UNSIGNED,
        TOTAL_YIELD DECIMAL(15,5) UNSIGNED   
        )
Command excuted - CREATE TABLE Plant_2_Weather_Sensor_Data (
        DATE_TIME TIMESTAMP,
        PLANT_ID VARCHAR

## 1.5 Load data from Plant_1_Generation_Data.csv to the table

In [6]:
# Load Plant_1_Generation_Data.csv into the table of the same name.
table_name = 'Plant_1_Generation_Data'
file_path = '/data/PowerGeneration/Plant_1_Generation_Data.csv'

# Columns that follow DATE_TIME in the file. The leading space and the closing
# parenthesis are part of the value: in the echoed command this text appears
# directly after "(@DATE_TIME,".
column_names = ' PLANT_ID, SOURCE_KEY, DC_POWER, AC_POWER, DAILY_YIELD, TOTAL_YIELD)'

# STR_TO_DATE pattern for this file's timestamps: day-month-year hour:minute.
datetime_format = '%d-%m-%Y %H:%i'

load_status = dm.importData(mycursor, file_path, table_name, column_names, datetime_format)
print(load_status)

Command excuted - LOAD DATA INFILE '/data/PowerGeneration/Plant_1_Generation_Data.csv' 
    INTO TABLE Plant_1_Generation_Data
    FIELDS TERMINATED BY ',' 
    ENCLOSED BY '"'
    LINES TERMINATED BY '
'
    IGNORE 1 ROWS
    (@DATE_TIME,  PLANT_ID, SOURCE_KEY, DC_POWER, AC_POWER, DAILY_YIELD, TOTAL_YIELD) 
    SET DATE_TIME = STR_TO_DATE(@DATE_TIME,'%d-%m-%Y %H:%i');
    


## 1.6 Load data from Plant_2_Generation_Data.csv to the table

In [7]:
# Load Plant_2_Generation_Data.csv into the table of the same name.
table_name = 'Plant_2_Generation_Data'
file_path = '/data/PowerGeneration/Plant_2_Generation_Data.csv'

# Columns that follow DATE_TIME in the file. The leading space and the closing
# parenthesis are part of the value: in the echoed command this text appears
# directly after "(@DATE_TIME,".
column_names = ' PLANT_ID, SOURCE_KEY, DC_POWER, AC_POWER, DAILY_YIELD, TOTAL_YIELD)'

# STR_TO_DATE pattern for this file's timestamps: year-month-day hour:minute:second.
datetime_format = '%Y-%m-%d %H:%i:%s'

load_status = dm.importData(mycursor, file_path, table_name, column_names, datetime_format)
print(load_status)


Command excuted - LOAD DATA INFILE '/data/PowerGeneration/Plant_2_Generation_Data.csv' 
    INTO TABLE Plant_2_Generation_Data
    FIELDS TERMINATED BY ',' 
    ENCLOSED BY '"'
    LINES TERMINATED BY '
'
    IGNORE 1 ROWS
    (@DATE_TIME,  PLANT_ID, SOURCE_KEY, DC_POWER, AC_POWER, DAILY_YIELD, TOTAL_YIELD) 
    SET DATE_TIME = STR_TO_DATE(@DATE_TIME,'%Y-%m-%d %H:%i:%s');
    


## 1.7 Load data from Plant_1_Weather_Sensor_Data.csv to the table

In [8]:
# Load Plant_1_Weather_Sensor_Data.csv into the table of the same name.
table_name = 'Plant_1_Weather_Sensor_Data'
file_path = '/data/PowerGeneration/Plant_1_Weather_Sensor_Data.csv'

# Columns that follow DATE_TIME in the file. The leading space and the closing
# parenthesis are part of the value: in the echoed command this text appears
# directly after "(@DATE_TIME,".
column_names = ' PLANT_ID, SOURCE_KEY, AMBIENT_TEMPERATURE, MODULE_TEMPERATURE, IRRADIATION)'

# STR_TO_DATE pattern for this file's timestamps: year-month-day hour:minute:second.
datetime_format = '%Y-%m-%d %H:%i:%s'

load_status = dm.importData(mycursor, file_path, table_name, column_names, datetime_format)
print(load_status)

Command excuted - LOAD DATA INFILE '/data/PowerGeneration/Plant_1_Weather_Sensor_Data.csv' 
    INTO TABLE Plant_1_Weather_Sensor_Data
    FIELDS TERMINATED BY ',' 
    ENCLOSED BY '"'
    LINES TERMINATED BY '
'
    IGNORE 1 ROWS
    (@DATE_TIME,  PLANT_ID, SOURCE_KEY, AMBIENT_TEMPERATURE, MODULE_TEMPERATURE, IRRADIATION) 
    SET DATE_TIME = STR_TO_DATE(@DATE_TIME,'%Y-%m-%d %H:%i:%s');
    


## 1.8 Load data from Plant_2_Weather_Sensor_Data.csv to the table

In [9]:
# Load Plant_2_Weather_Sensor_Data.csv into the table of the same name.
table_name = 'Plant_2_Weather_Sensor_Data'
file_path = '/data/PowerGeneration/Plant_2_Weather_Sensor_Data.csv'

# Columns that follow DATE_TIME in the file. The leading space and the closing
# parenthesis are part of the value: in the echoed command this text appears
# directly after "(@DATE_TIME,".
column_names = ' PLANT_ID, SOURCE_KEY, AMBIENT_TEMPERATURE, MODULE_TEMPERATURE, IRRADIATION)'

# STR_TO_DATE pattern for this file's timestamps: year-month-day hour:minute:second.
datetime_format = '%Y-%m-%d %H:%i:%s'

load_status = dm.importData(mycursor, file_path, table_name, column_names, datetime_format)
print(load_status)

Command excuted - LOAD DATA INFILE '/data/PowerGeneration/Plant_2_Weather_Sensor_Data.csv' 
    INTO TABLE Plant_2_Weather_Sensor_Data
    FIELDS TERMINATED BY ',' 
    ENCLOSED BY '"'
    LINES TERMINATED BY '
'
    IGNORE 1 ROWS
    (@DATE_TIME,  PLANT_ID, SOURCE_KEY, AMBIENT_TEMPERATURE, MODULE_TEMPERATURE, IRRADIATION) 
    SET DATE_TIME = STR_TO_DATE(@DATE_TIME,'%Y-%m-%d %H:%i:%s');
    


## 1.9 Commit and close the MySQL cursor and connection.

In [10]:
# Commit the current transaction on the MySQL connection before the cursor
# and the connection are closed in the next cell.
db_connection.commit()

In [11]:
# Close the cursor first ...
mycursor.close()

# ... then the connection itself. Section 2 opens its own connection.
db_connection.close()

# 2. Data Cleaning

In [None]:
import pandas as pd
from sqlalchemy import create_engine
import datamanagement as dm
import json

In [None]:
# Lift pandas' cap on the number of rows shown, so displayed frames are never
# truncated.
pd.options.display.max_rows = None

## 2.1 Connect to MySQL server

In [None]:
# Read the MySQL connection parameters from a JSON file kept next to the
# notebook, so the credentials are not written into the notebook itself.
# The `with` block closes the file even if json.load() raises.
with open('MySQLparameter.json') as parameters_file:
    parameters_value = json.load(parameters_file)

# Build the SQLAlchemy URL from the user, password, host and port entries.
# NOTE(review): the values are inserted verbatim; a password containing
# URL-special characters such as '@' or '/' would presumably need escaping.
db_str = ('mysql://{user}:{password}@{host}:{port}').format(**parameters_value)

# Create the engine that pandas and the helper functions below connect through.
db_connection = create_engine(db_str)

In [None]:
## 2.2 Use database

Use database named PowerGeneration

In [None]:
# Use database
# Build the USE statement and run it through the SQLAlchemy engine. The object
# returned by execute() is kept as `mycursor` so it can be closed at the end
# of the notebook.
# NOTE(review): USE applies to the connection that ran it; presumably the
# later pd.read_sql calls are served by that same pooled connection. Confirm,
# or put the database name into the engine URL instead.
database_name = 'PowerGeneration'
command = "USE {}".format(database_name)
mycursor = db_connection.execute(command)

## 2.3 Query data from the 4 tables.

### 2.3.1 Query data from Plant_1_Generation_Data table

In [None]:
# Read the whole Plant_1_Generation_Data table into a DataFrame and preview
# its first five rows.
table_name = 'Plant_1_Generation_Data'
query = 'SELECT * FROM {}'.format(table_name)
df_g1 = pd.read_sql(query, con=db_connection)
display(df_g1.head(5))

### 2.3.2 Query data from Plant_2_Generation_Data table

In [None]:
# Read the whole Plant_2_Generation_Data table into a DataFrame and preview
# its first five rows.
table_name = 'Plant_2_Generation_Data'
query = 'SELECT * FROM {}'.format(table_name)
df_g2 = pd.read_sql(query, con=db_connection)
display(df_g2.head(5))

### 2.3.3 Query data from Plant_1_Weather_Sensor_Data table

In [None]:
# Read the whole Plant_1_Weather_Sensor_Data table into a DataFrame and
# preview its first five rows.
table_name = 'Plant_1_Weather_Sensor_Data'
query = 'SELECT * FROM {}'.format(table_name)
df_w1 = pd.read_sql(query, con=db_connection)
display(df_w1.head(5))

### 2.3.4 Query data from Plant_2_Weather_Sensor_Data table

In [None]:
# Read the whole Plant_2_Weather_Sensor_Data table into a DataFrame and
# preview its first five rows.
table_name = 'Plant_2_Weather_Sensor_Data'
query = 'SELECT * FROM {}'.format(table_name)
df_w2 = pd.read_sql(query, con=db_connection)
display(df_w2.head(5))

## 2.3 Visualize the table

### 2.3.1 Check for null values and the datatype of every column.

In [None]:
# Print the non-null counts and the dtype of every column for each of the
# four frames.
df_list = [df_g1, df_g2, df_w1, df_w2]
for df in df_list:
    # info() prints its report itself and returns None, so it is called
    # directly; wrapping it in display() only added a stray "None" after
    # every report.
    df.info()
    

No null values and the datatypes are correct.

### 2.3.2 Check for frequency, top, mean, max, min

In [None]:
# Show summary statistics for every column (numeric and non-numeric alike) of
# each frame.
for frame in df_list:
    summary = frame.describe(include='all')
    display(summary)
    

The data looks fine. Although the maximum values of DC_POWER, AC_POWER, DAILY_YIELD and TOTAL_YIELD are far from the median (50%) values, I consider this normal, because time-varying quantities such as power usually have very sharp peaks.

## 2.4 Merging and concatenating the tables.

SOURCE_KEY in the generation files is the inverter ID, whereas SOURCE_KEY in the weather files is the weather sensor ID, so the two columns need distinct names after merging.

### 2.4.1 Merging generation data and weather data from plant1.

In [None]:
# Join plant 1's generation rows with the weather readings taken at the same
# time at the same plant. Both frames have a SOURCE_KEY column, so the
# suffixes tell the inverter key apart from the sensor key in the result.
suffixes = ('_INVERTER', '_SENSOR')
join_keys = ['DATE_TIME', 'PLANT_ID']
df_1 = df_g1.merge(
    df_w1,
    how='inner',
    left_on=join_keys,
    right_on=join_keys,
    suffixes=suffixes,
)
display(df_1.head())

In [None]:
# Summary statistics of the merged plant 1 frame, covering every column
# (include='all'), as a quick sanity check of the merge result.
display(df_1.describe(include='all'))

### 2.4.2 Merging generation data and weather data from plant2.

In [None]:
# Join plant 2's generation rows with the weather readings taken at the same
# time at the same plant. Both frames have a SOURCE_KEY column, so the
# suffixes tell the inverter key apart from the sensor key in the result.
suffixes = ('_INVERTER', '_SENSOR')
join_keys = ['DATE_TIME', 'PLANT_ID']
df_2 = df_g2.merge(
    df_w2,
    how='inner',
    left_on=join_keys,
    right_on=join_keys,
    suffixes=suffixes,
)
display(df_2.head())

In [None]:
# Summary statistics of the merged plant 2 frame, covering every column
# (include='all'), as a quick sanity check of the merge result.
display(df_2.describe(include='all'))

### 2.4.3 Concatenating data from both plants.

In [None]:
# Stack the merged plant 1 and plant 2 frames into one frame.
# NOTE(review): concat is called without ignore_index, so the row labels of
# df_1 and df_2 are kept as they are and may repeat in df_all - confirm that
# nothing downstream relies on a unique index.
df_all = pd.concat([df_1, df_2])
display(df_all.head())

## 2.5 Separate the DATE_TIME column into DATE and TIME columns.

In [None]:
# Derive the calendar date and the time of day from DATE_TIME with the
# vectorised .dt accessor instead of calling .date()/.time() row by row in a
# Python loop. Both new columns hold plain datetime.date / datetime.time
# objects (object dtype), exactly as before.
timestamps = df_all['DATE_TIME']
df_all['DATE'] = timestamps.dt.date
df_all['TIME'] = timestamps.dt.time
display(df_all.head())

## 2.6 Check datatype of every column.

In [None]:
# Print the non-null count and dtype of every column of the combined frame.
df_all.info()

## 2.7 Change the datatype of the DATE column from object to datetime.

In [None]:
# DATE was filled with date objects in section 2.5; convert the column to
# pandas datetimes.
df_all['DATE'] = pd.to_datetime(df_all['DATE'])

In [None]:
# Recheck: print the column dtypes again to confirm that DATE has been
# converted.
df_all.info()

## 2.8 Export data to the MySQL server.

In [None]:
# Hand the combined frame to the helper together with the engine and the
# target table name.
# NOTE(review): presumably this writes df_all into a MySQL table called
# Plant_All - confirm how the helper treats an already existing table.
table_name = 'Plant_All'
dm.fromPDtoMySQL(db_connection, df_all, table_name)

Check the table once the export has finished.

In [None]:
# Read the exported table back from MySQL and preview its first rows to check
# the export.
query = 'SELECT * FROM {}'.format(table_name)
df_temp = pd.read_sql(query, con=db_connection)
display(df_temp.head())

## 2.9 Close the MySQL cursor and connection.

In [None]:
# Close the result object returned by the USE statement in section 2.2.
mycursor.close()

# Also release the engine's pooled connections, so that the connection itself
# is closed as the section heading promises. Previously only the result
# object was closed and the engine's connections stayed open.
db_connection.dispose()