@author Lelia Deville
@date 3/9/24

In [1]:
import pandas as pd
import pypyodbc as odbc
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta, date


"""
Defines a set of functions used to read data in LNDB format at LaSEL.
"""


def connect_server(driver_name, server_name, database_name, uid, password):
    """
    Open an ODBC connection and hand back a cursor for running queries.

    Parameters
    ----------
    driver_name : ODBC driver to use; should be SQL Server, unless this changes
    server_name : name of host computer, typically 192.168.1.109 or PARTLABHMI
    database_name : database to open; for LNDB format it should be EI
    uid : login user id; may be unique or shared across student groups, see Robert if unknown
    password : login password; may be unique or shared across student groups, see Robert if unknown

    Returns
    -------
    cursor : cursor created from the new connection, used to query the database.
    the connection object itself is not returned to the caller.
    """
    # NOTE(review): `Trust_Connection` does not look like the usual ODBC
    # `Trusted_Connection` keyword. It is kept exactly as written because
    # changing it could alter how the uid/pwd login is handled -- confirm
    # against the driver documentation before touching it.
    conn_str = f"""
    DRIVER={{{driver_name}}};
    SERVER={server_name};
    DATABASE={database_name};
    Trust_Connection =yes;
    uid={uid};
    pwd={password};
    """
    return odbc.connect(conn_str).cursor()


def read_one_table(cursor, table_name, start, end, columns=None):
    """
    Read one table between two timestamps into a time-indexed DataFrame.

    Parameters
    ----------
    cursor : cursor object used to query in databases, obtained by running 'connect_server' function above
    table_name : name of table in database to read data from
    start : date to start query. note: if the query is started before data is recorded or available,
    the first datapoint will be when data is actually available, assuming it is before the end date
    end : date to end query. note: if the data ends before the query date, the last datapoint will be the final
    recorded datapoint, not the end date.
    columns : optional. columns to select, given either as one comma-separated string
    example - 'tmstamp,ws_ms_avg,rh,diffused_avg,dni_avg,global_avg,poa_avg,cmp10poa_avg,cmp10_2_poa_avg'
    or as a list/tuple of column names. the selection must include tmstamp, since it becomes
    the index. if omitted (or empty), every column is selected.

    Returns
    -------
    df : df of data with all (or specified) columns within the start and end window,
    indexed by the tmstamp column (which is removed from the regular columns).
    the returned df is localized to LaSEL timezone (America/Chicago)

    Raises
    ------
    KeyError : if the query result has no column named 'tmstamp'
    """
    names = []
    if columns:
        if isinstance(columns, str):
            # Joining a plain string would interleave ', ' between its
            # characters and produce invalid SQL, so split it into names first.
            names = [name.strip() for name in columns.split(',') if name.strip()]
        else:
            names = list(columns)
    selection = ', '.join(names) if names else '*'

    # NOTE(review): table name, column names and dates are interpolated
    # directly into the SQL text, so only pass trusted values here.
    cursor.execute(f"select {selection} from {table_name} where TmStamp between '{start}' and '{end}';")
    rows = list(cursor)
    # The first entry of each description item is the column name.
    field_names = [field[0] for field in cursor.description]

    df = pd.DataFrame(rows, columns=field_names)
    df.index = pd.to_datetime(df['tmstamp'])
    # Timestamps come back naive; attach the local timezone, moving
    # nonexistent (spring-forward) wall times forward.
    df.index = df.index.tz_localize('America/Chicago', ambiguous=True, nonexistent='shift_forward')
    df.drop(columns='tmstamp', inplace=True)
    return df


def read_two_tables(cursor, table1, table2, start, end, cols1=None, cols2=None):
    """
    Read two tables over the same time window and join them on their timestamps.

    Parameters
    ----------
    cursor : cursor object used to query in databases, obtained by running 'connect_server' function above
    table1 : name of first table in database to read data from
    table2 : name of second table in database to read data from
    start : date to start query. note: if the query is started before data is recorded or available,
    the first datapoint will be when data is actually available, assuming it is before the end date
    end : date to end query. note: if the data ends before the query date, the last datapoint will be the final
    recorded datapoint, not the end date.
    cols1 : optional. columns to use from the first table, passed straight through as the
    'columns' argument of read_one_table, e.g. a list of names such as
    ['tmstamp', 'ws_ms_avg', 'rh']. the selection must include tmstamp.
    cols2 : optional. columns to use from the second table, handled the same way as cols1

    Returns
    -------
    df : df of data with all (or specified) columns from both tables merged within the start and
    end window. only timestamps present in both tables are kept (inner join on the index).
    note the returned df is localized to LaSEL timezone
    """
    # Each table is loaded exactly the way read_one_table loads a single
    # table, so delegate instead of repeating the query/DataFrame code.
    df1 = read_one_table(cursor, table1, start, end, columns=cols1)
    df2 = read_one_table(cursor, table2, start, end, columns=cols2)

    # Inner join on the timestamp index; column names shared by both tables
    # are disambiguated by pandas' default merge suffixes.
    return df1.merge(right=df2, how='inner', left_index=True, right_index=True)