In [None]:
import pandas as pd
import numpy as np
import math
import warnings
from sqlalchemy import *
warnings.filterwarnings('ignore')
from tqdm.notebook import tqdm

In [None]:
# Open the SQLite database file and keep one connection around for the
# upload step further down.
engine = create_engine('sqlite:///data/jobs.db')
connection = engine.connect()

# The MetaData registry owns the table definition below; every cleaned
# chunk of the CSV is appended to this single table.
metadata = MetaData()

jobs_corrected = Table(
    'jobs_corrected',
    metadata,
    *(
        Column(column_name, column_type)
        for column_name, column_type in [
            ('JOBID', Integer()),
            ('STATE', String(255)),
            ('BEGIN', String(255)),
            ('END', String(255)),
            ('REQTIME', Float()),
            ('USEDTIME', Float()),
            ('EXITCODE', String(255)),
            ('USEDMEMPERCORE', Float()),
            ('REQMEMPERCORE', Float()),
            ('PARTITION', String(255)),
        ]
    ),
)

# Emit CREATE TABLE for anything registered on the metadata.
metadata.create_all(engine)

In [None]:
# Timestamp layout used when parsing the BEGIN and END columns.
TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S'

# Columns written to the jobs_corrected table, in table order.
OUTPUT_COLUMNS = ['JOBID',
                  'STATE',
                  'BEGIN',
                  'END',
                  'REQTIME',
                  'USEDTIME',
                  'EXITCODE',
                  'USEDMEMPERCORE',
                  'REQMEMPERCORE',
                  'PARTITION']


def _memory_per_core(mem, nodes, cpus):
    """Turn a memory column such as '2048Mn' into a float amount per core.

    The leading number (integer or decimal) and the unit suffix made of the
    letters M/n/c are parsed from each string. Rows whose suffix is 'Mn' or
    'M' are scaled by nodes / cpus; all other rows keep the parsed number.
    NOTE(review): 'Mn'/'M' are presumably per-node figures and 'Mc' per-core
    figures -- confirm against the data source.

    Parameters
    ----------
    mem : pandas.Series of str
        Raw memory strings.
    nodes, cpus : pandas.Series
        Node and CPU counts aligned with ``mem``.

    Returns
    -------
    numpy.ndarray
        One float per row; NaN where no number could be parsed.
    """
    # A single pattern captures amount and unit together. The optional
    # fractional part keeps values like '12.5M' from being cut at the dot,
    # which would also lose the unit suffix.
    parsed = mem.str.extract(r'(\d+(?:\.\d+)?)([Mnc]*)', expand = True)
    amount = parsed[0].astype('float64')
    unit = parsed[1]
    needs_scaling = (unit == 'Mn') | (unit == 'M')
    return np.where(needs_scaling, (amount * nodes) / cpus, amount)


def _clean_jobs_chunk(chunk):
    """Clean one raw CSV chunk and return the columns stored in jobs_corrected.

    Rows whose BEGIN or END is the literal 'Unknown' are dropped. BEGIN/END
    are parsed as datetimes (unparseable values become NaT), REQTIME/USEDTIME
    are converted to absolute seconds, and the memory columns are reduced to
    per-core floats.
    """
    known_times = (chunk['BEGIN'] != 'Unknown') & (chunk['END'] != 'Unknown')
    # .copy() gives an independent frame, so the assignments below never
    # write into a slice of the original chunk.
    jobs = chunk.loc[known_times].copy()

    # Vectorised parsing; same result as applying pd.to_datetime per element.
    for column in ('BEGIN', 'END'):
        jobs[column] = pd.to_datetime(jobs[column],
                                      format = TIMESTAMP_FORMAT,
                                      errors = 'coerce')

    # Durations are stored as non-negative seconds.
    for column in ('REQTIME', 'USEDTIME'):
        jobs[column] = pd.to_timedelta(jobs[column]).dt.total_seconds().abs()

    jobs['USEDMEMPERCORE'] = _memory_per_core(jobs['USEDMEM'], jobs['NODES'], jobs['CPUS'])
    jobs['REQMEMPERCORE'] = _memory_per_core(jobs['REQMEM'], jobs['NODES'], jobs['CPUS'])
    jobs['JOBID'] = jobs['JOBID'].astype('int64')
    return jobs[OUTPUT_COLUMNS]


# Create iterator for chunks from csv file.
# Only one chunk of 100000 rows is held in memory at a time.
reader = pd.read_csv('data/fullsample-corrected.csv', chunksize = 100000)

# Iterate through each chunk and clean it then upload it to the SQL Table.
# NOTE: if_exists = 'append' means re-running this cell inserts the same rows
# again; clear the table first when reloading.
for chunk in tqdm(reader):
    # `df` is kept as the name of the cleaned chunk so it still holds the
    # last cleaned chunk once the loop finishes.
    df = _clean_jobs_chunk(chunk)
    df.to_sql(name = 'jobs_corrected',
              con = connection,
              if_exists = 'append',
              index = False)