In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


Read in the data and match each PBA code to its actual name

In [2]:
# ----------------------------------------------------------------------------------- #
## 2012 survey file; the first CSV column is used as the row index
df = pd.read_csv(
    'Datasets/CBECS2012/2012_public_use_data_aug2016.csv',
    header=0,
    index_col=0,
)
# ----------------------------------------------------------------------------------- #

# First figure: rows with any non-null SERVERN (zeros included).
# Second figure: rows whose SERVERN is both non-null and non-zero.
n_reported = df['SERVERN'].count()
n_with_servers = len(df[df['SERVERN'].notnull() & (df['SERVERN'] != 0)])
print("number of sample that reported with server: %d" % n_reported)
print("number of sample that reported with server (not null): %d" % n_with_servers)

# Attach the lookup tables, matching their index against the PBA / PBAPLUS code columns
pba_id = pd.read_csv('pba_id.csv', header=0, index_col=0)
pbaplus_id = pd.read_csv('pbaplus_id.csv', header=0, index_col=0)
df = df.join(pba_id, on='PBA').join(pbaplus_id, on='PBAPLUS')

number of sample that reported with server: 6481
number of sample that reported with server (not null): 3549


Categorize each building based on the number of servers/racks

In [3]:
# ----------------------------------------------------------------------------------- #
# Server-count bands per data-center size class -- assumptions for 2012 data.
# Each entry is (lower bound, upper bound, label); both bounds are inclusive.
defs = [
    (1, 25, 'small'),
    (26, 499, 'midsize'),
    (500, 1e5, 'large'),
]

# Size labels in ascending order, taken from the bands above
order = [label for _, _, label in defs]
# ----------------------------------------------------------------------------------- #

Do categorization

In [4]:
# Create the label column as object dtype up front. Starting from a float64
# all-NaN column and then writing strings into it relies on a silent upcast
# that recent pandas versions deprecate (FutureWarning, slated to raise).
df['space type'] = pd.Series(np.nan, index=df.index, dtype=object)

# Each entry of defs is (lower bound, upper bound, label), bounds inclusive.
for lower, upper, label in defs:
    m = (df['SERVERN'] <= upper) & (df['SERVERN'] >= lower)
    df.loc[m, 'space type'] = label


# Check to make sure we didn't miss any buildings: every row with a non-null,
# non-zero SERVERN should have received a label.
df['space type'].dropna().count() == (len(df[(df['SERVERN'] != 0) & (df['SERVERN'].notnull())]))

True

Multiply building weight by the number of racks/servers in each building. 

In [5]:
# Weighted server count per row: reported servers times the building's final weight
df['nweight_s'] = df['SERVERN'].mul(df['FINALWT'])

Code the DATACNTR column as either true or false. Note that this specifically refers to whether something is a data center or farm, so data rooms and closets are coded as false.

In [6]:
# True only where the survey code equals 1; every other value (including
# missing) becomes False.
m_dc = df['DATACNTR'] == 1

# Assign the boolean mask directly. Writing Python bools into the numeric
# column through two masked .loc assignments relies on a silent dtype upcast
# that recent pandas versions deprecate, and leaves an object-dtype column;
# this yields the same True/False values with a proper bool dtype.
df['DATACNTR'] = m_dc

Map coded values for server square footage to their actual ranges

In [7]:
# Lookup from the DCNTRSFC survey code to a readable square-footage range.
# Code 0 is the stand-in for "no value" and maps to NaN.
data_map = {
    0: np.nan,
    1: '<= 500 sq. ft.',
    2: '501 to 1500 sq. ft.',
    3: '1501 to 3000 sq. ft.',
    4: '3001 to 10,000 sq. ft.',
    5: '> 10,000 sq. ft.',
}

# Missing codes are rewritten to 0 in the source column so the lookup below
# finds a key for every row; any code not in data_map raises KeyError.
df['DCNTRSFC'] = df['DCNTRSFC'].fillna(0)
df['data center sq. ft.'] = df['DCNTRSFC'].apply(data_map.__getitem__)

Now, let's replace 9995 values in SERVERN with something more representative based on the expected total number of shipments.

In [8]:
# ----------------------------------------------------------------------------------- #
tot_serv = 13581642.   #From 2012 value.
# ----------------------------------------------------------------------------------- #

# Select the 'large' / 'Service Provider' buildings. The bands in defs are
# inclusive, so a building with exactly defs[2][0] servers is labelled
# 'large' and must be selected here too -- hence >= rather than >.
# NOTE(review): if any building sits exactly on the boundary, the printed
# average differs from the previously saved output; re-run to refresh it.
m = df['SERVERN'] >= defs[2][0]

# Weighted servers left over once every non-large building is accounted for,
# spread evenly (per unit of building weight) across the large buildings.
avg_num_he_s = (tot_serv - df.loc[~m,'nweight_s'].sum())/df.loc[m,'FINALWT'].sum()
print(avg_num_he_s)

# Overwrite the reported count and the weighted count for the large buildings
# so that the weighted total across all rows equals tot_serv.
df.loc[m,'SERVERN'] = avg_num_he_s
df.loc[m,'nweight_s'] = avg_num_he_s*df.loc[m,'FINALWT']


4559.953872214017


Double check to make sure that the total number of servers matches tot_serv

In [9]:
# The weighted total must equal tot_serv to within 1e-6 (exact equality is the zero-difference case)
abs(df['nweight_s'].sum() - tot_serv) <= 1e-6

True

In [10]:
# ----------------------------------------------------------------------------------- #

# df.to_csv('Datasets/cbecs_servers_cleaned_2012_dummy.csv')

# ----------------------------------------------------------------------------------- #