In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
%matplotlib inline


Read in data and also match PBA code to it's actual name

In [4]:
# ----------------------------------------------------------------------------------- #
## 2018
df = pd.read_csv('Datasets/CBECS2018/cbecs2018_final_public.csv',header=0,index_col=0)
df['RACKN'] = df['SERVERN'].copy()
df.loc[df['SRVUNIT'] == 2, 'SERVERN'] = float('nan')
df.loc[df['SRVUNIT'] == 1, 'RACKN'] = float('nan')
# ----------------------------------------------------------------------------------- #

print("number of sample that reported with rack: %d" % df['RACKN'].count())
print("number of sample that reported with server: %d" % df['SERVERN'].count())

pba_id = pd.read_csv('pba_id.csv',header=0,index_col=0)
pbaplus_id = pd.read_csv('pbaplus_id.csv', header = 0, index_col = 0)
df = df.join(pba_id,on='PBA')
df = df.join(pbaplus_id,on = 'PBAPLUS')

number of sample that reported with rack: 1715
number of sample that reported with server: 1918


Categorize each building based on the number of servers/racks

In [5]:
# ----------------------------------------------------------------------------------- #
# Inclusive:
order = type = ['small','midsize','large']

# Server number by DC size # -- assumptions for 2012 data
defs = [(1,25,'small'),
        (26,499,'midsize'),
        (500,1e5,'large')]

# Server number per rack # 
avg_servers_per_rack = {'small': 0.75,
                    'midsize': 3, 
                    'large': 10}

# Rack number
tmp = []
for l, h, t in defs:
    tmp.append([l/avg_servers_per_rack[t], h/avg_servers_per_rack[t], t])

tmp[2][0] += 0.0001
defs2 = []
for i in range(len(tmp)):
    if i == 0:
        l, h = min(tmp[i][0], 1), math.floor(min(tmp[i][1], tmp[i+1][0]))
        defs2.append([l, h, tmp[i][2]])
    elif i == len(tmp) - 1:
        l, h = math.ceil(max(tmp[i][0], tmp[i-1][1])), tmp[i][1]
        defs2.append([l, h, tmp[i][2]])
    else:
        l1, h1 = math.ceil(min(tmp[i][0], tmp[i-1][1])), math.floor(max(tmp[i][0], tmp[i-1][1]))
        defs2.append([l1, h1,  tmp[i-1][2] + "/" + tmp[i][2]])

        l, h = math.ceil(max(tmp[i-1][1], tmp[i][0])), math.floor(min(tmp[i+1][0], tmp[i][1]))
        defs2.append([l, h, tmp[i][2]])

        l2, h2 = math.ceil(min(tmp[i][1], tmp[i+1][0])), math.floor(max(tmp[i][1], tmp[i+1][0]))
        defs2.append([l2, h2,  tmp[i][2] + "/" + tmp[i+1][2]])
# ----------------------------------------------------------------------------------- #

In [6]:
print('Server number by DC size: {}'.format(defs))
print('Rack number by DC size: {}'.format(defs2))
print('avg_servers_per_rack: {}'.format(avg_servers_per_rack))


Server number by DC size: [(1, 25, 'small'), (26, 499, 'midsize'), (500, 100000.0, 'large')]
Rack number by DC size: [[1, 8, 'small'], [9, 33, 'small/midsize'], [34, 50, 'midsize'], [51, 166, 'midsize/large'], [167, 10000.0, 'large']]
avg_servers_per_rack: {'small': 0.75, 'midsize': 3, 'large': 10}


Do categorization

In [7]:
df['space type'] = np.nan
for tup in defs:
    m = (df['SERVERN'] <= tup[1]) & (df['SERVERN']>= tup[0])
    df.loc[m,'space type'] = tup[2]

for tup in defs2:
    m = (df['RACKN'] <= tup[1]) & (df['RACKN']>= tup[0])
    df.loc[m,'space type'] = tup[2]

# Check to make sure we didn't miss any buildings
df['space type'].dropna().count() == (df['RACKN'].count() + df['SERVERN'].count())

True

In [8]:
print("Ambigous classifications: ",  len(df[df['space type'].isin({'midsize/large', 'small/midsize'})]))

# Further classify ambigous 'space type' based on "Data center sqft category" column 
df.loc[df['space type'].isin({'small/midsize'}) & df['DCNTRSFC'] == 1, 'space type'] = 'small' # 1=500 square feet or less
df.loc[df['space type'].isin({'midsize/large', 'small/midsize'}) & df['DCNTRSFC'] == 3, 'space type'] = 'midsize' # 3=1,501 to 3,000 square feet
df.loc[df['space type'].isin({'midsize/large', 'small/midsize'}) & df['DCNTRSFC'] == 4, 'space type'] = 'midsize' # 4=3,001 to 10,000 square feet

print("Ambigous classifications: ", + len(df[df['space type'].isin({'midsize/large', 'small/midsize'})]))

# Further classify ambigous 'space type' based on "Server closet" column: if survey indicate this sample to be "Server closet", classify as small
df.loc[df['space type'].isin({'small/midsize'}) & df['SRVRCLST'] == 1, 'space type'] = 'small'

print("Ambigous classifications: ", + len(df[df['space type'].isin({'midsize/large', 'small/midsize'})]))
# Further classify ambigous 'space type' based on "SQFT" column: calculate the mapping of 'space type' to average 'SQFT' based on data that is not ambigous, then assign the classification based on cloest distance
def map_space_type_mid_or_large(sqft):
    mapping = df[~df['space type'].isin({'midsize/large', 'small/midsize'})].groupby('space type')['SQFT'].mean().to_dict()
    del mapping['small']
    closest_category = min(mapping, key=lambda x: abs(mapping[x] - sqft))
    return closest_category

df.loc[df['space type'].isin({'midsize/large'}), 'space type'] = df.loc[df['space type'].isin({'midsize/large'}), 'SQFT'].apply(map_space_type_mid_or_large)

print("Ambigous classifications: ", + len(df[df['space type'].isin({'midsize/large', 'small/midsize'})]))


Ambigous classifications:  493
Ambigous classifications:  306
Ambigous classifications:  67
Ambigous classifications:  0


For samples reported with rack, get server number by rack number * avg_servers_per_rack;
Similarly for samples reported with server;

In [9]:
df.loc[df['SRVUNIT'] == 2, 'SERVERN'] = df.loc[df['SRVUNIT'] == 2, 'RACKN'] * df.loc[df['SRVUNIT'] == 2, 'space type'].map(avg_servers_per_rack)

In [10]:
df.loc[df['SRVUNIT'] == 1, 'RACKN'] = df.loc[df['SRVUNIT'] == 1, 'SERVERN'] / df.loc[df['SRVUNIT'] == 1, 'space type'].map(avg_servers_per_rack)

In [11]:
print("number of server samples after imputation: %d" % df['SERVERN'].count())
print("number of rack samples after imputation: %d" % df['RACKN'].count())

number of server samples after imputation: 3633
number of rack samples after imputation: 3633


Multiply building weight by the number of racks/servers in each building. 

In [12]:
df['nweight_s'] = df['SERVERN']*df['FINALWT']
df['nweight_r'] = df['RACKN']*df['FINALWT']

Code DATACNTR column as either true or false. Note that specifically refers to whether something is a data center of farm. So data rooms and closets are coded as false.

In [13]:
m_dc = df['DATACNTR'] == 1

df.loc[m_dc,'DATACNTR'] = True
df.loc[~m_dc,'DATACNTR'] = False

Map coded values for server square footage to their actual ranges

In [14]:
data_map = {1:'<= 500 sq. ft.',
            2:'501 to 1500 sq. ft.',
            3:'1501 to 3000 sq. ft.',
            4:'3001 to 10,000 sq. ft.',
            5:'> 10,000 sq. ft.',
            0:np.nan}
df.loc[df['DCNTRSFC'].isnull(),'DCNTRSFC'] = 0
df['data center sq. ft.'] = df.DCNTRSFC.apply(lambda x:data_map[x])

Now, let's replace 9995 values in SERVERN with something more representative based on the expected total number of shipments.

In [15]:
# ----------------------------------------------------------------------------------- #
# tot_serv = 13581642.   #From 2012 value.
tot_serv = 16832576.   #From 2018 value: the calculation below not appliable to 2018 data (due to server rack, leave it here for later modification/extended analysis)
# ----------------------------------------------------------------------------------- #

m = df['SERVERN'] > defs[2][0] # filter out 'large'/ 'Service Provider'
avg_num_he_s = (tot_serv - df.loc[~m,'nweight_s'].sum())/df.loc[m,'FINALWT'].sum()
print(avg_num_he_s)
df.loc[m,'SERVERN'] = avg_num_he_s
df.loc[m,'nweight_s'] = avg_num_he_s*df.loc[m,'FINALWT']


5323.79191542552


In [16]:
# Similarly, do this for rack column
df.loc[m,'RACKN'] = avg_num_he_s/avg_servers_per_rack[defs[2][2]]
df.loc[m,'nweight_r'] = (avg_num_he_s/avg_servers_per_rack[defs[2][2]])*df.loc[m,'FINALWT']

Double check to make sure that the total number of servers matches tot_serv

In [17]:
(df['nweight_s'].sum() == tot_serv) or (abs(df['nweight_s'].sum() - tot_serv) <= 10**(-6))

True

In [18]:
df['nweight_s'].sum() == (df['nweight_r'] * df['space type'].map(avg_servers_per_rack)).sum() or (abs(df['nweight_s'].sum() - (df['nweight_r'] * df['space type'].map(avg_servers_per_rack)).sum()) <= 10**(-6))

True

In [19]:
# ----------------------------------------------------------------------------------- #

# df.to_csv('Datasets/cbecs_servers_cleaned_2018_dummy.csv')

# ----------------------------------------------------------------------------------- #