In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
%matplotlib inline
pd.options.display.max_rows = 999
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
from ast import literal_eval
def clean(x):
    """Turn the text of a Python literal (e.g. a stringified list) into the object it denotes.

    x: string holding a Python literal.
    Returns whatever ast.literal_eval produces for that string.
    """
    parsed = literal_eval(x)
    return parsed

In [3]:
#load the cleaned job records; a 'node_numbers' column, if present, is parsed from text by clean()
#NOTE(review): the frame displayed below has a 'nodelist' column and no 'node_numbers' column,
#so this converter may not be applied to anything -- confirm the intended column name
accre_jobs_piped = pd.read_csv(
    '../data/accre_jobs_cleaned.csv',
    converters={'node_numbers': clean},
)

# Maggi -- Questions 3 & 4
Calculation of the % of requested memory actually used, and the mean % for each group

## Notes on conversion of Mn to Mc
For Mn, convert to Mc as follows: digits / (cpus/nodes)<br>
For Mc, use the digits as is<br>
usedmem is stated in Mn, but designated as M or possibly with no designation<br>
recmem can be stated in either Mc, Mn, or none<br>
Open question: the documentation says that cpus is the total cpus for the job (i.e., not the cpus per node);
    however, some jobs have 1 cpu and 2 nodes, or 1 cpu and 51 nodes, which does not fit that definition.<br>
Make two new columns in memory_use: reqmem_mc and usedmem_mc

In [4]:
#preview the first two rows of the loaded data set
accre_jobs_piped.head(n=2)

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,req_total_seconds,used_total_seconds,reqmem_mc,usedmem_mc
0,15925210,treviso,arabella,122880Mn,65973.49M,13-18:00:00,13-18:00:28,1,24,production,0:0,COMPLETED,['cn1531'],1188000,1188028,5120.0,2748.895417
1,15861126,treviso,arabella,122880Mn,67181.12M,13-18:00:00,12-14:50:56,1,24,production,0:0,COMPLETED,['cn1441'],1188000,1090256,5120.0,2799.213333


In [5]:
#count how many jobs fall under each 'state' value
accre_jobs_piped.loc[:, 'state'].value_counts()
#only the 'COMPLETED' jobs will be kept for the analysis

COMPLETED              3718637
CANCELLED                 6064
RUNNING                   3105
FAILED                    1379
CANCELLED by 686562        681
OUT_OF_MEMORY               94
TIMEOUT                     80
CANCELLED by 505355         63
CANCELLED by 855431         17
CANCELLED by 782611         13
CANCELLED by 397600          7
CANCELLED by 0               5
CANCELLED by 454080          4
CANCELLED by 486541          4
CANCELLED by 199766          4
CANCELLED by 200557          4
CANCELLED by 9202            3
CANCELLED by 649319          3
CANCELLED by 483348          3
CANCELLED by 9201            3
CANCELLED by 515423          2
CANCELLED by 199066          2
CANCELLED by 666860          2
CANCELLED by 791651          2
CANCELLED by 90423           2
CANCELLED by 546080          2
CANCELLED by 895426          2
CANCELLED by 124006          2
CANCELLED by 693461          2
CANCELLED by 782535          1
CANCELLED by 651701          1
CANCELLED by 763605          1
CANCELLE

In [6]:
#count how many jobs ran in each 'partition'
accre_jobs_piped.loc[:, 'partition'].value_counts()
#only the 'production' partition will be kept for the analysis

production    3311788
nogpfs         327652
pascal          48004
turing          39406
maxwell          3348
Name: partition, dtype: int64

In [7]:
#'state' breakdown of the production jobs that exited with code 0:0
accre_jobs_piped.loc[
    (accre_jobs_piped['exitcode'] == '0:0') & (accre_jobs_piped['partition'] == 'production'),
    'state',
].value_counts()
#'CANCELLED*' and 'RUNNING' jobs also report 0:0, so the slice must additionally require 'COMPLETED'

COMPLETED              3302522
CANCELLED                 4449
RUNNING                   2859
CANCELLED by 686562        680
TIMEOUT                     79
CANCELLED by 9201            2
Name: state, dtype: int64

In [8]:
#initial slice for the memory usage analysis:
#completed production jobs that exited with code 0:0
memory_use = accre_jobs_piped.loc[
    (accre_jobs_piped['partition'] == 'production')
    & (accre_jobs_piped['exitcode'] == '0:0')
    & (accre_jobs_piped['state'] == 'COMPLETED')
]

In [9]:
#first two rows of the filtered slice
memory_use.head(n=2)

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,req_total_seconds,used_total_seconds,reqmem_mc,usedmem_mc
0,15925210,treviso,arabella,122880Mn,65973.49M,13-18:00:00,13-18:00:28,1,24,production,0:0,COMPLETED,['cn1531'],1188000,1188028,5120.0,2748.895417
1,15861126,treviso,arabella,122880Mn,67181.12M,13-18:00:00,12-14:50:56,1,24,production,0:0,COMPLETED,['cn1441'],1188000,1090256,5120.0,2799.213333


In [10]:
#(rows, columns) of the filtered slice
memory_use.shape

(3302522, 17)

In [11]:
#first ten jobs whose reqmem is stated per core ('Mc') and that ran on more than one node
memory_use.loc[
    (memory_use['reqmem'].str.contains('Mc')) & (memory_use['nodes'] > 1)
].head(10)

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,req_total_seconds,used_total_seconds,reqmem_mc,usedmem_mc
13240,17105760,summer,buddy,20480Mc,6453.04M,04:40:00,00:00:56,2,1,production,0:0,COMPLETED,"['cn1434', 'cn1435']",16800,56,20480.0,12906.08
13257,17105819,summer,buddy,20480Mc,7587.56M,04:40:00,00:11:43,2,1,production,0:0,COMPLETED,"['cn1434', 'cn1435']",16800,703,20480.0,15175.12
13342,17105966,summer,buddy,20480Mc,7587.15M,04:40:00,00:14:03,2,1,production,0:0,COMPLETED,"['cn1434', 'cn1435']",16800,843,20480.0,15174.3
26744,17125144,summer,buddy,20480Mc,20479.96M,04:40:00,00:17:50,3,1,production,0:0,COMPLETED,"['cn1448', 'cn1449', 'cn1450']",16800,1070,20480.0,61439.88
27044,17125676,summer,buddy,20480Mc,20463.48M,04:40:00,00:34:25,3,1,production,0:0,COMPLETED,"['cn1465', 'cn1466', 'cn1467']",16800,2065,20480.0,61390.44
28241,17126611,summer,buddy,20480Mc,6712.42M,04:40:00,00:03:37,3,1,production,0:0,COMPLETED,"['cn1430', 'cn1431', 'cn1432']",16800,217,20480.0,20137.26
61606,17173216,arra,needham,8192Mc,65535.89M,06:00:00,00:06:44,19,8,production,0:0,COMPLETED,"['cn1281', 'cn1283', 'cn1297', 'cn1298', 'cn13...",21600,404,8192.0,155647.73875
118478,17255274,belle,osbaldo,8192Mc,0,1-00:00:00,00:01:15,4,12,production,0:0,COMPLETED,"['cn1293', 'cn1302', 'cn1355', 'cn1356']",86400,75,8192.0,0.0
160741,17324222,crowns,tena,5120Mc,9651.66M,2-00:00:00,16:47:47,37,1,production,0:0,COMPLETED,"['cn1094', 'cn1125', 'cn1266', 'cn1277', 'cn12...",172800,60467,5120.0,357111.42
164606,17329000,round,shenna,8000Mc,0,10:00:00,00:00:12,2,1,production,0:0,COMPLETED,"['cn1291', 'cn1292']",36000,12,8000.0,0.0


In [12]:
#some jobs report 0 usedmem -- how common is that, and does it make sense?
memory_use.loc[:, 'usedmem_mc'].value_counts()
#it does not; zero-usage rows are not valid for evaluating efficiency

0.0000       537283
1.5600        31241
0.1975        30085
1.5575        29904
6.2300        16918
              ...  
3493.0900         1
1416.2400         1
1393.7020         1
2559.6200         1
4122.6600         1
Name: usedmem_mc, Length: 884478, dtype: int64

In [13]:
#run-time distribution (seconds) of the jobs that report 0 usedmem, shown in fixed-point notation
memory_use.loc[memory_use['usedmem_mc'] == 0, 'used_total_seconds'].describe().apply(
    lambda stat: format(stat, 'f')
)
#most are not long running jobs, but some in the top 25% are

count     537283.000000
mean          37.988003
std         2058.355733
min            0.000000
25%            7.000000
50%           12.000000
75%           20.000000
max      1013546.000000
Name: used_total_seconds, dtype: object

In [14]:
#efficiency is a ratio, so keep only the rows where reqmem and usedmem are both above 0;
#the same test also drops a few stray rows where cpus = 0
memory_use = memory_use.loc[
    memory_use[['reqmem_mc', 'usedmem_mc', 'cpus']].gt(0).all(axis=1)
]

In [15]:
#jobs reporting fewer cpus than nodes, which doesn't make sense; keep just the columns needed to inspect them
check_cpus = memory_use.loc[
    memory_use['cpus'].lt(memory_use['nodes']),
    ['account', 'cpus', 'nodes', 'reqmem', 'usedmem', 'reqmem_mc', 'usedmem_mc'],
]

In [16]:
#display the jobs that report fewer cpus than nodes
check_cpus
#these rows are probably not valid for efficiency ratios (they are removed in a later cell)

Unnamed: 0,account,cpus,nodes,reqmem,usedmem,reqmem_mc,usedmem_mc
3453,winged,1,3,4096Mn,570.32M,12288.000000,1710.960000
3454,winged,1,4,4096Mn,669.61M,16384.000000,2678.440000
3457,winged,1,3,4096Mn,620.31M,12288.000000,1860.930000
3460,winged,1,4,4096Mn,629.38M,16384.000000,2517.520000
3464,winged,1,3,4096Mn,641.92M,12288.000000,1925.760000
...,...,...,...,...,...,...,...
3626272,burro,3,20,51200Mn,53.06M,341333.333333,353.733333
3629669,burro,3,20,51200Mn,235.39M,341333.333333,1569.266667
3669106,burro,3,20,51200Mn,102.50M,341333.333333,683.333333
3669107,burro,3,20,51200Mn,99.59M,341333.333333,663.933333


In [17]:
#are the cpus < nodes jobs concentrated in particular accounts?
check_cpus.loc[:, 'account'].value_counts()
#not particularly

winged          294
chickpeas       293
angelys         196
sharlin         186
wood             62
retailer         52
burro            35
mignonette       15
crowns           13
arra             11
casaba            6
summer            6
peanuts           6
carrot            4
crab              3
grisette          3
blackberries      3
clara             1
horseradish       1
delblush          1
cms_lowprio       1
Name: account, dtype: int64

In [18]:
#number of remaining jobs per account, for comparison with the cpus < nodes counts
memory_use.loc[:, 'account'].value_counts()

cep              464990
cms              452991
summer           372681
carrot           179941
galia            172315
cms_lowprio      116401
casaba           104134
portabella       103329
bunch             88829
orange            76909
bulk              49270
horseradish       44361
sharlin           43898
sweet             38711
key               34944
mixed             34099
retailer          32049
boysenberries     27957
wood              26226
round             23557
cmsadmin          23421
atemoyas          22084
garlic            19329
wax               17287
alkmene           15532
grisette          15176
celery            14510
winged            14108
sunburst          12876
fresh             11025
chickpeas         10486
regal              9641
large              9019
texas              8606
minnewashta        6555
clara              6324
belle              5687
lychees            5454
st                 5335
muscat             4774
bertanne           4405
seeded          

In [19]:
#remove the rows where cpus < nodes
#take an explicit copy: later cells add columns to memory_use, and assigning into a filtered
#slice of another frame raises pandas' SettingWithCopyWarning
memory_use = memory_use[memory_use['cpus']>=memory_use['nodes']].copy()

In [20]:
#job run time converted from seconds to hours
memory_use['used_hours'] = memory_use['used_total_seconds'].div(3600)

In [21]:
#memory weighting formula
#full_df['MEM_WEIGHT'] = full_df['MEM_PERCENT_USED']*full_df['TOTAL_SEC_USED']
#full_df.groupby('ACCOUNT')['MEM_WEIGHT'].sum()/full_df.groupby('ACCOUNT')['TOTAL_SEC_USED'].sum()
#alkmene 38.351490
#almonds 46.785636
#amaranth 4.135943

In [22]:
#per-job memory used as a percent of memory requested
memory_use['avg_mem_use'] = memory_use['usedmem_mc'].div(memory_use['reqmem_mc']).mul(100)

In [23]:
#percent used multiplied by run time in seconds: the numerator of the time-weighted average
memory_use['mem_weight'] = memory_use['avg_mem_use'].mul(memory_use['used_total_seconds'])

In [24]:
#time-weighted average percent of requested memory used, per account:
#sum(mem_weight) / sum(used_total_seconds)
(
    memory_use.groupby('account')[['mem_weight', 'used_total_seconds']]
    .sum()
    .pipe(lambda totals: totals['mem_weight'] / totals['used_total_seconds'])
)

account
alkmene          38.398614
almonds          21.713587
amaranth          4.136165
angelys           2.701473
anise            50.589604
antares          67.351935
arra             61.596102
atemoyas         14.127690
baby             14.528333
baking           42.547142
bay               5.710036
bell              6.252304
belle            24.432158
bertanne         25.736394
bibb             68.479948
blackberries     30.977459
bon              19.198852
boysenberries    12.872584
brazilnuts       33.754403
broccoli         49.645722
bulk             46.143998
bunch            15.861109
burro            43.340971
cactus           33.614457
cantared         19.378149
carambola        21.421409
carrot           45.486137
casaba           29.247738
celery           15.567764
cep              33.662324
chasselas        66.380879
chayote           1.515647
chickpeas        34.243716
chipilin         46.867197
cinnabar         37.181179
clara            12.245560
cms              40.

In [25]:
#jobs ordered from longest to shortest run time
memory_use.sort_values(by='used_total_seconds', ascending=False)

Unnamed: 0,jobid,account,user,reqmem,usedmem,reqtime,usedtime,nodes,cpus,partition,exitcode,state,nodelist,req_total_seconds,used_total_seconds,reqmem_mc,usedmem_mc,used_hours,avg_mem_use,mem_weight
1173511,19005552,sugranineteen,daron,4096Mn,544.95M,12-12:00:00,19-20:04:25,1,1,production,0:0,COMPLETED,['cn1135'],1080000,1713865,4096.000000,544.950000,476.073611,13.304443,2.280202e+07
1173522,19005634,sugranineteen,daron,4096Mn,556.69M,12-12:00:00,19-19:53:24,1,1,production,0:0,COMPLETED,['cn1126'],1080000,1713204,4096.000000,556.690000,475.890000,13.591064,2.328427e+07
1173500,19005517,sugranineteen,daron,4096Mn,591.11M,12-12:00:00,19-15:04:26,1,1,production,0:0,COMPLETED,['cn1088'],1080000,1695866,4096.000000,591.110000,471.073889,14.431396,2.447371e+07
1173501,19005518,sugranineteen,daron,4096Mn,589.16M,12-12:00:00,19-15:04:18,1,1,production,0:0,COMPLETED,['cn1124'],1080000,1695858,4096.000000,589.160000,471.071667,14.383789,2.439286e+07
1173544,19005936,sugranineteen,daron,4096Mn,544.98M,12-12:00:00,19-14:48:21,1,1,production,0:0,COMPLETED,['cn1131'],1080000,1694901,4096.000000,544.980000,470.805833,13.305176,2.255096e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3537368,23797646,alkmene,abigail,47104Mn,104.74M,1-00:00:00,00:00:00,1,6,production,0:0,COMPLETED,['cn1266'],86400,0,7850.666667,17.456667,0.000000,0.222359,0.000000e+00
291869,17500485,baking,christa,184320Mn,475.46M,1-00:09:00,00:00:00,1,12,production,0:0,COMPLETED,['cn1326'],86940,0,15360.000000,39.621667,0.000000,0.257954,0.000000e+00
232476,17412039,baking,christa,184320Mn,184319.70M,1-00:09:00,00:00:00,1,12,production,0:0,COMPLETED,['cn1509'],86940,0,15360.000000,15359.975000,0.000000,99.999837,0.000000e+00
3423542,23564776,alkmene,abigail,47104Mn,1569.34M,1-00:00:00,00:00:00,1,6,production,0:0,COMPLETED,['cn1386'],86400,0,7850.666667,261.556667,0.000000,3.331649,0.000000e+00


# Plot avg mem usage

In [26]:
#per-account totals of requested memory, used memory and run-time hours
avg_memuse_byacct = (
    memory_use
    .groupby(['account'])[['reqmem_mc', 'usedmem_mc', 'used_hours']]
    .sum()
    .reset_index()
)

In [27]:
#unweighted (by time) average: total used memory as a percent of total requested memory
avg_memuse_byacct['avg_usage'] = (
    avg_memuse_byacct['usedmem_mc'].div(avg_memuse_byacct['reqmem_mc']).mul(100)
)

In [28]:
#label each account with the quartile (Q1 lowest .. Q4 highest) of its avg_usage
avg_memuse_byacct['mem_quartile'] = pd.qcut(
    avg_memuse_byacct['avg_usage'],
    4,
    labels=[f'Q{rank}' for rank in range(1, 5)],
)

In [29]:
#label each account with the quartile (1 lowest .. 4 highest) of its total used_hours
avg_memuse_byacct['hours_quartile'] = pd.qcut(
    avg_memuse_byacct['used_hours'],
    4,
    labels=list(range(1, 5)),
)

In [43]:
#same used_hours quartiles as hours_quartile, but with readable labels
avg_memuse_byacct['total_time_ctgry'] = pd.qcut(
    avg_memuse_byacct['used_hours'],
    4,
    labels=[
        'Ttl Time: Low',
        'Ttl Time: MedLo',
        'Ttl Time: MedHi',
        'Ttl Time: High Usage',
    ],
)

In [44]:
#accounts ordered from highest to lowest avg_usage
avg_memuse_byacct.sort_values(by='avg_usage', ascending=False)

Unnamed: 0,account,reqmem_mc,usedmem_mc,used_hours,avg_usage,mem_quartile,hours_quartile,total_time_ctgry
75,medium,15176700.0,10460970.0,1397.343,68.927785,Q4,2,Ttl Time: MedLo
76,mignonette,225280.0,154259.2,30.34556,68.474414,Q4,1,Ttl Time: Low
113,strawberries,7549013.0,5079764.0,565.8872,67.290431,Q4,2,Ttl Time: MedLo
5,antares,365568.0,242608.1,117.1669,66.364693,Q4,1,Ttl Time: Low
109,shell,45056.0,27821.08,720.6506,61.747769,Q4,2,Ttl Time: MedLo
110,shiny,1686923.0,1005946.0,539.0319,59.632036,Q4,2,Ttl Time: MedLo
60,jicama,11587240.0,6767345.0,6157.806,58.403407,Q4,3,Ttl Time: MedHi
120,texas,36681870.0,21125020.0,20852.84,57.589806,Q4,3,Ttl Time: MedHi
6,arra,555827.2,303167.0,155.9681,54.543394,Q4,1,Ttl Time: Low
107,shallots,3246080.0,1748024.0,29.355,53.850308,Q4,1,Ttl Time: Low


In [45]:
#summary statistics of avg_usage across accounts, shown in fixed-point notation
avg_memuse_byacct.loc[:, 'avg_usage'].describe().apply(lambda stat: format(stat, 'f'))

count    131.000000
mean      23.542496
std       18.179408
min        0.178467
25%        8.016400
50%       19.242148
75%       34.331756
max       68.927785
Name: avg_usage, dtype: object

In [46]:
#quartile boundaries of avg_usage, used for the subplot x-axis ranges in the plotting cell
q1cut, q2cut, q3cut = (
    avg_memuse_byacct['avg_usage'].quantile(cut) for cut in (.25, .5, .75)
)
#NOTE(review): 'max' shadows the Python builtin for the rest of the session; the name is kept
#because the plotting cell reads it
max = avg_memuse_byacct['avg_usage'].max()
print(max)


68.92778486027015


In [51]:
#horizontal bar charts of avg_usage per account, one subplot per memory-usage quartile
mem_plot_avg = avg_memuse_byacct[['account','avg_usage','used_hours','mem_quartile', 'hours_quartile','total_time_ctgry']]

df = mem_plot_avg.sort_values(['avg_usage'], ascending=False)
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Low Efficiency', 'MedLo Efficiency', 'MedHi Efficiency', 'High Efficiency'),
    vertical_spacing=0.1,
    horizontal_spacing=0.2,
)

#(mem_quartile label, trace name, subplot row, subplot col)
quartile_panels = [
    ('Q1', 'Low', 1, 1),
    ('Q2', 'MedLo', 1, 2),
    ('Q3', 'MedHi', 2, 1),
    ('Q4', 'High', 2, 2),
]
for quartile_label, trace_name, panel_row, panel_col in quartile_panels:
    in_quartile = df[df['mem_quartile'] == quartile_label]
    fig.add_trace(
        go.Bar(
            y=in_quartile['account'],
            x=in_quartile['avg_usage'],
            orientation='h',
            #hover text is taken from the same filtered rows as the bars; passing the whole
            #df column would attach other accounts' categories to this subplot's bars
            hovertext=in_quartile['total_time_ctgry'],
            name=trace_name,
        ),
        row=panel_row, col=panel_col,
    )
#(colouring the bars by hours_quartile through a shared coloraxis was tried and left disabled)

fig.update_yaxes(type='category')
#zoom each of the upper three quartiles onto its own value range
fig.update_xaxes(range=[q1cut-.5, q2cut+2], row=1, col=2)
fig.update_xaxes(range=[q2cut-.5, q3cut+2], row=2, col=1)
#read the maximum from the data rather than from the global named 'max', which shadows the builtin
fig.update_xaxes(range=[q3cut-.5, df['avg_usage'].max()+2], row=2, col=2)

fig.update_layout(
    title_text='ACCRE Accounts: Memory Usage as Percent of Memory Requested',
    showlegend=False,
    height=1000,
)

In [52]:
#horizontal bar charts of avg_usage per account, one subplot per total-run-time quartile
mem_plot_avgT = avg_memuse_byacct[['account','avg_usage','used_hours','mem_quartile', 'hours_quartile']]

#sort this cell's own frame; the original sorted mem_plot_avg from the previous cell and
#left mem_plot_avgT unused
df = mem_plot_avgT.sort_values(['avg_usage'], ascending=False)
figT = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Low Time Usage', 'MedLo Time Usage', 'MedHi Time Usage', 'High Time Usage'),
    vertical_spacing=0.1,
    horizontal_spacing=0.2,
)

#(hours_quartile label, trace name, subplot row, subplot col)
hours_panels = [
    (1, 'Low', 1, 1),
    (2, 'MedLo', 1, 2),
    (3, 'MedHi', 2, 1),
    (4, 'High', 2, 2),
]
for hours_label, trace_name, panel_row, panel_col in hours_panels:
    in_quartile = df[df['hours_quartile'] == hours_label]
    figT.add_trace(
        go.Bar(
            y=in_quartile['account'],
            x=in_quartile['avg_usage'],
            orientation='h',
            name=trace_name,
        ),
        row=panel_row, col=panel_col,
    )

figT.update_yaxes(type='category')

figT.update_layout(
    title_text='ACCRE Accounts: Memory Usage as Percent of Memory Requested',
    showlegend=False,
    height=1000,
)

In [55]:
#render the figure split by memory-usage quartile
fig.show()

In [54]:
#render the figure split by total-run-time quartile
figT.show()

In [None]:
#time-weighted average memory usage per account, as a two-column frame (account, wavg_usage)
wavg_memuse_byacct = (
    memory_use.groupby('account')[['mem_weight', 'used_total_seconds']]
    .sum()
    .pipe(lambda totals: totals['mem_weight'] / totals['used_total_seconds'])
    .rename('wavg_usage')
    .reset_index()
)

In [None]:
#label each account with the quartile ('1' lowest .. '4' highest) of its weighted average usage
wavg_memuse_byacct['quartile'] = pd.qcut(
    wavg_memuse_byacct['wavg_usage'],
    4,
    labels=[str(rank) for rank in range(1, 5)],
)

In [None]:
#first five rows of the weighted-average table
wavg_memuse_byacct.head(n=5)

In [None]:
#accounts ordered from highest to lowest weighted average usage
wavg_memuse_byacct.sort_values(by='wavg_usage', ascending=False)

In [None]:
def _wavg_for_quartile(label):
    """Return the 'wavg_usage' column, indexed by account, for rows whose 'quartile' equals label."""
    in_quartile = wavg_memuse_byacct['quartile'] == label
    return wavg_memuse_byacct.loc[in_quartile, ['account', 'wavg_usage']].set_index('account')


#one frame per weighted-usage quartile, plus a copy sorted by ascending usage
mem_plot_q1wavg = _wavg_for_quartile('1')
px_dfq1w = mem_plot_q1wavg.sort_values(['wavg_usage'])
mem_plot_q2wavg = _wavg_for_quartile('2')
px_dfq2w = mem_plot_q2wavg.sort_values(['wavg_usage'])
mem_plot_q3wavg = _wavg_for_quartile('3')
px_dfq3w = mem_plot_q3wavg.sort_values(['wavg_usage'])
mem_plot_q4wavg = _wavg_for_quartile('4')
px_dfq4w = mem_plot_q4wavg.sort_values(['wavg_usage'])


#all accounts in one horizontal bar chart
#the original referenced 'weightedavg_memuse_byacct' / 'weightedavg_usage', which are never
#defined in this notebook; the frame built above is wavg_memuse_byacct with column 'wavg_usage'
mem_plot_wtdavg = wavg_memuse_byacct[['account','wavg_usage']].set_index('account')
px_dfw = mem_plot_wtdavg.sort_values(['wavg_usage'])
#show explicitly: a bare px.bar(...) that is not the last expression of a cell is never displayed
px.bar(px_dfw, orientation='h', height = 2000 ).show()

#sample rows are written under '../data/', the same directory the input CSV was read from
#('../Data/' only resolves on case-insensitive file systems)

#slice out some short jobs
job_duration_short = memory_use[memory_use['used_hours'] <= 2]
job_duration_short.head().to_csv('../data/short.csv')

#slice out some longer jobs
job_duration_long = memory_use[(memory_use['used_hours'] > 4) & (memory_use['used_hours'] < 8)]
job_duration_long.head().to_csv('../data/longer.csv')

#slice out some really long jobs
job_duration_xlong = memory_use[memory_use['used_hours'] > 72]
job_duration_xlong.head().to_csv('../data/xlong.csv')

In [None]:
#total hours used, for weighting
#ttl_used_hours = mem_use_by_acct['used_hours'].sum()

In [None]:
#add a column for group weighting by used_hours
#mem_use_by_acct['pct_ttl_used_hrs'] = mem_use_by_acct['used_hours'] / ttl_used_hours

In [None]:
#check that the total is 1
#mem_use_by_acct.pct_ttl_used_hrs.sum()

In [None]:
#add a column for weighted avg_usage
#mem_use_by_acct['weighted_avg_usage'] = mem_use_by_acct['avg_usage'] * mem_use_by_acct['pct_ttl_used_hrs']