# ACCRE Project - Data Import and Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
import re

In [5]:
# Load the corrected full-sample job records (fullsample-corrected.csv) into a dataframe
jobs = pd.read_csv(filepath_or_buffer="../data/fullsample-corrected.csv")

In [3]:
# Parse the BEGIN and END timestamps; values that cannot be parsed become NaT
for timestamp_col in ('BEGIN', 'END'):
    jobs[timestamp_col] = pd.to_datetime(jobs[timestamp_col], errors='coerce')

# Discard every row (job) that is missing either of the two timestamps
jobs = jobs.dropna(subset=['BEGIN', 'END'])

# Store each job's duration (END minus BEGIN) in a new JOBLENGTH column
jobs['JOBLENGTH'] = jobs['END'].sub(jobs['BEGIN'])

In [4]:
# Calculate total required memory, required memory per core, and total used
# memory for each job. Whole-column operations replace the earlier per-row
# loop: they are far faster on large frames and always create the three
# derived columns, even when the frame has no rows.
#
# The source columns are addressed by position (5th, 6th, 9th and 10th data
# columns), matching the file layout this notebook was written against --
# presumably REQMEM, USEDMEM, NODES and CPUS; confirm against the CSV header.
reqmem = jobs.iloc[:, 4]
usedmem = jobs.iloc[:, 5]
nodes = jobs.iloc[:, 8]
cpus = jobs.iloc[:, 9]

# The first run of digits in each string is the numeric amount; whatever
# follows (e.g. 'Mn' / 'Mc') is treated as the unit suffix.
reqmem_amount = reqmem.str.extract(r'(\d+)', expand=False).astype(int)
usedmem_amount = usedmem.str.extract(r'(\d+)', expand=False).astype(int)

# 'Mn' amounts are scaled by the node count and 'Mc' amounts by the CPU count.
# If a value somehow contains both suffixes, 'Mn' takes precedence.
per_node = reqmem.str.contains('Mn', regex=False)
per_core = reqmem.str.contains('Mc', regex=False) & ~per_node

node_total = reqmem_amount * nodes
core_total = reqmem_amount * cpus

# Guard against division by zero: a per-node job reporting 0 CPUs keeps its
# full total as the per-core figure.
node_per_core = node_total.where(cpus == 0, node_total / cpus)

# Rows whose suffix is neither 'Mn' nor 'Mc' are left as NaN, which is why
# the derived columns are stored as floats.
jobs['REQMEMTOT'] = (
    node_total.where(per_node).mask(per_core, core_total).astype(float)
)
jobs['REQMEMPERCORE'] = (
    node_per_core.where(per_node).mask(per_core, reqmem_amount).astype(float)
)

# A used-memory amount of 0 is always reported as 0, whatever the node count.
jobs['USEDMEMTOT'] = (
    (usedmem_amount * nodes).where(usedmem_amount != 0, 0).astype(float)
)

In [5]:
# Column names applied to both wrapper logs; they are supplied explicitly
# rather than being read from the files
wrapper_log_columns = ['DATE', 'USER', 'RETRY', 'TIMELAPS', 'RETURNCODE', 'COMMAND']

# Load the ce5 log (fields are separated by " - ") and tag every row with the
# server it came from
ce5 = pd.read_table("../data/slurm_wrapper_ce5.log",
                    sep=" - ",
                    names=wrapper_log_columns,
                    engine='python')
ce5 = ce5.assign(SERVERID='ce5')

# Load the ce6 log the same way and tag its rows as well
ce6 = pd.read_table("../data/slurm_wrapper_ce6.log",
                    sep=" - ",
                    names=wrapper_log_columns,
                    engine='python')
ce6 = ce6.assign(SERVERID='ce6')

# Stack the ce5 rows on top of the ce6 rows in a single dataframe
logs = pd.concat([ce5, ce6])

In [6]:
# Write the cleaned job records to disk as a CSV, leaving out the index column
jobs.to_csv(path_or_buf='../data/fullsample_cleaned.csv', index=False)

# Write the combined wrapper logs to disk as a CSV, also without the index
logs.to_csv(path_or_buf='../data/logs.csv', index=False)