Github Issue: 
This notebook defines the schema and the usage/manipulation of each column of the data source to be stored in the database. The aim is a minimal schema that can answer the following questions:
- CPU efficiency / wall time / failure rate of jobs, broken down by input data tier / site / MC production / offsite vs. onsite

## schema

In [None]:
#Possible (minimum) schema for the first datasource:
def _get_schema():
    """Build the proposed minimum Spark schema for the first data source.

    The schema has a single top-level struct column named ``data``; every
    field listed below lives inside that struct. ``RecordTime`` is the only
    field declared non-nullable.

    Returns:
        A ``StructType`` describing the nested record layout.
    """
    record_fields = [
        # Could store as YYYY:MM:DD or HH YYYY:MM:DD
        StructField("RecordTime", LongType(), nullable=False),
        StructField("CMSSite", StringType(), nullable=True),
        # ['Onsite', 'Offsite']
        StructField("InputData", StringType(), nullable=True),
        # Not manipulated yet
        StructField("Chirp_CRAB3_Job_ExitCode", LongType(), nullable=True),
        StructField("CMSPrimaryDataTier", StringType(), nullable=True),
        # Only includes ['Removed', 'Completed', 'Held', 'Error']
        StructField("Status", StringType(), nullable=True),
        StructField("OverflowType", StringType(), nullable=True),
        StructField("WallClockHr", DoubleType(), nullable=True),
        StructField("CpuTimeHr", DoubleType(), nullable=True),
        StructField("RequestCpus", LongType(), nullable=True),
        StructField("Type", StringType(), nullable=True),
        # For identifying MC Prod
        StructField("CRAB_DataBlock", StringType(), nullable=True),
    ]
    return StructType([StructField("data", StructType(record_fields))])

## definitions of the content

In [None]:
def _cal_avg_cpu_eff(CpuTimeHr,WallClockHr,RequestCpus):
    return (100*CpuTimeHr)/(WallClockHr*RequestCpus)

#note - WallClockHr*RequestCpus = CoreHr
#referenced - https://github.com/dmwm/cms-htcondor-es/blob/7fdcb7667b39081ddff98da26ad4e3ed33f9e244/src/htcondor_es/convert_to_json.py#L838

In [None]:
def _manipulate_chirp_crab3_exitcode(Chirp_CRAB3_Job_ExitCode):
    if(Chirp_CRAB3_Job_ExitCode==0):
        return ('Success')
    else if (Chirp_CRAB3_Job_ExitCode!=0 && Chirp_CRAB3_Job_ExitCode.isNotNull()):
        return ('Fail')
    else:
        #when Chirp_CRAB3_Job_ExitCode.isNull()
        return ('Not Analysis Job')
#note - Chirp_CRAB3_Job_ExitCode might not cover the case where jobs get cancelled because of a user's mistake
#note - Should discuss whether to use Chirp_CRAB3_Job_ExitCode or ExitCode

In [None]:
def _is_mc_prod_job(CRAB_DataBlock, Type):
    if(CRAB_DataBlock=='MCFakeBlock' && Type=='analysis'):
        return True
    else:
        return False