In [1]:
%matplotlib inline
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Show at most 50 rows when pandas renders a DataFrame or Series.
pd.set_option('display.max_rows', 50)

# Functions

In [3]:
def linreg(X, Y):
    """
    Ordinary least-squares fit of the straight line ``y = a*x + b``.

    Usage
        a, b, RR, Var_a, Var_b = linreg(X, Y)

    Parameters
        X, Y: equal-length sequences of numbers. Anything that supports
            ``len()`` and iteration works (lists, tuples, pandas Series).

    Returns
        a:     slope of the fitted line.
        b:     intercept of the fitted line.
        RR:    R^2 = 1 - residual / meanerror, where ``residual`` is the sum
               of squared residuals and ``meanerror`` is the sum of squared
               deviations of y from its mean. NaN when every y is identical
               (meanerror is zero, so R^2 is undefined).
        Var_a: ss * N / det, with ss = residual / (N - 2).
        Var_b: ss * Sxx / det.
               Both variances are NaN when N == 2, because there are no
               degrees of freedom left to estimate ``ss``.

    Raises
        ValueError: if X and Y differ in length, if fewer than two points
            are given, or if all x values are identical (the normal
            equations are singular and no unique line exists).
    """
    if len(X) != len(Y):
        raise ValueError("unequal length")
    N = len(X)
    if N < 2:
        raise ValueError("need at least two points to fit a line")

    # Accumulate the sums that appear in the normal equations.
    Sx = Sy = Sxx = Sxy = 0.0
    for x, y in zip(X, Y):
        Sx = Sx + x
        Sy = Sy + y
        Sxx = Sxx + x*x
        Sxy = Sxy + x*y

    # det is N^2 times the population variance of x; it is zero exactly when
    # x has no spread, in which case the slope cannot be determined.
    det = Sxx * N - Sx * Sx
    if det == 0:
        raise ValueError("all x values are identical; cannot fit a line")
    a, b = (Sxy * N - Sy * Sx)/det, (Sxx * Sy - Sx * Sxy)/det

    mean_y = Sy / N
    meanerror = residual = 0.0
    for x, y in zip(X, Y):
        meanerror = meanerror + (y - mean_y)**2
        residual = residual + (y - a * x - b)**2

    nan = float('nan')
    # Guard the two remaining divisions instead of letting them raise
    # ZeroDivisionError (plain floats) or emit inf/NaN warnings (numpy floats).
    RR = 1 - residual/meanerror if meanerror != 0 else nan
    if N > 2:
        ss = residual / (N-2)
        Var_a, Var_b = ss * N / det, ss * Sxx / det
    else:
        Var_a = Var_b = nan
    return a, b, RR, Var_a, Var_b

In [4]:
def plot_scatter(df, predicted, predictor='', title=''):
    """
    Scatter-plot column `predicted` (y axis) against column `predictor`
    (x axis) of `df`.

    The plot is produced with ``df.plot(kind='scatter')`` at
    ``figsize=(14, 8)`` inside matplotlib's 'bmh' style context.
    If `title` is empty, the title defaults to '<predicted> vs <predictor>'.
    Nothing is returned.
    """
    # Fall back to an auto-generated heading only when no title was supplied.
    heading = title or '%s vs %s' % (predicted, predictor)
    scatter_options = dict(
        kind='scatter',
        x=predictor,
        y=predicted,
        figsize=(14, 8),
        title=heading,
    )
    with plt.style.context('bmh'):
        df.plot(**scatter_options)

# Read data

In [5]:
# Project data locations. The root defaults to the original author's local
# directory; set the LIGHTTIME_DIR environment variable to run the notebook
# against a different checkout without editing this cell.
project_dir = os.environ.get(
    'LIGHTTIME_DIR',
    '/Users/navaneethan/Documents/projects/lighttime/',
)
# Joining with a trailing '' guarantees each directory ends in a path
# separator, so `processed_dir + energy_fname` style concatenation keeps
# working even if the configured root has no trailing slash.
raw_dir = os.path.join(project_dir, 'raw', '')
processed_dir = os.path.join(project_dir, 'processed', '')

energy_fname = 'energy_usmsa.csv'

In [6]:
# Load the processed energy table (processed_dir + energy_fname) into a DataFrame.
energy = pd.read_csv(processed_dir+energy_fname)

In [8]:
energy.sort_values(by='total_pop', ascending=False).head()['total_pop']

429    6684008.0
816    4751351.0
136    3449096.0
179    3276672.0
576    2728685.0
Name: total_pop, dtype: float64

In [None]:
# NOTE(review): `res_com_ind` is not created in any cell visible above (only
# `energy` is loaded) — presumably it came from a cell that was deleted or
# never saved. Define it explicitly before this point so that
# Restart Kernel -> Run All works.
# Make the 'gisjoin' column the index, in place. Because the column is consumed
# by this call, running the cell a second time fails (no 'gisjoin' column left).
res_com_ind.set_index(keys='gisjoin', inplace=True)

In [None]:
np.arange(0, 1, 0.1)

In [None]:
res_com_ind.sort_values(by='total_pop', ascending=False).head()

In [None]:
res_com_ind.reset_index()['gisjoin'].value_counts(dropna=False)

In [None]:
res_com_ind['total_pop'].describe(percentiles=np.arange(0, 1, 0.05))

In [None]:
# Turn +/-inf into NaN, then drop every row that contains any NaN.
# Both calls mutate res_com_ind in place, so output shown by earlier cells
# no longer reflects the frame's current contents.
res_com_ind.replace([np.inf, -np.inf], value=np.nan, inplace=True)
res_com_ind.dropna(inplace=True)

# Histogram of total_pop after cleaning, drawn in the 'bmh' style.
with plt.style.context('bmh'):
    res_com_ind['total_pop'].plot(kind='hist', figsize=(14, 8))

In [None]:
res_com_ind.head()

In [None]:
# Add a base-10 logarithm column named 'log_<col>' for every existing column.
# Columns that already carry the 'log_' prefix are skipped, so re-running this
# cell does not pile up 'log_log_*' columns.
# log10 yields -inf for 0 and NaN for negative values; those rows are removed
# from the log frame by the later replace/dropna cell.
for pc in [col for col in res_com_ind.columns if not col.startswith('log_')]:
    # The ufunc is applied to the whole column at once rather than per element.
    res_com_ind['log_'+pc] = np.log10(res_com_ind[pc])

In [None]:
res_com_ind.head()

In [None]:
# Keep only the 'log_'-prefixed columns. The explicit .copy() makes this an
# independent frame, so the in-place cleaning done on it later does not touch
# res_com_ind.
res_com_ind_log = res_com_ind[[col for col in res_com_ind.columns if col.startswith('log_')]].copy()

In [None]:
res_com_ind_log.shape

In [None]:
# Replace +/-inf with NaN, then drop every row that has a NaN in any log
# column. Both calls modify res_com_ind_log in place.
res_com_ind_log.replace([np.inf, -np.inf], value=np.nan, inplace=True)
res_com_ind_log.dropna(inplace=True)

In [None]:
res_com_ind_log.shape

In [None]:
# Sanity check: show rows whose log_res_gas_mcf is still infinite. The earlier
# replace/dropna cell turned infinities into NaN and dropped them, so this is
# expected to display an empty frame.
res_com_ind_log[np.isinf(res_com_ind_log['log_res_gas_mcf'])]

In [None]:
res_com_ind_log.notna().sum()

In [None]:
res_com_ind_log.head()

In [None]:
predictor_cols = [
    'log_housing_units', 
    'log_total_pop',
    'log_com_num_establishments',
    'log_ind_num_establishments'
]

In [None]:
# Every log column that is not listed in predictor_cols is treated as a
# response variable. The set difference is sorted because set iteration order
# for strings can change between kernel sessions; sorting makes the list
# identical on every run.
predicted_cols = sorted(set(res_com_ind_log.columns).difference(predictor_cols))

In [None]:
sorted(predicted_cols)

In [None]:
res_com_ind_log[['log_total_pop', 'log_res_elec_mwh']].describe()

In [None]:
# log_housing_units vs log_total_pop: scatter plot plus regression summary.
# Jupyter only echoes the value of a cell's final expression, so the plot is
# drawn first and linreg is called last; otherwise its
# (a, b, R^2, Var_a, Var_b) result is computed and silently discarded.
plot_scatter(df=res_com_ind_log, predictor='log_total_pop', predicted='log_housing_units')
linreg(X=res_com_ind_log['log_total_pop'], Y=res_com_ind_log['log_housing_units'])

In [None]:
# log_com_num_establishments vs log_total_pop: scatter plot plus regression
# summary. Jupyter only echoes the value of a cell's final expression, so the
# plot is drawn first and linreg is called last; otherwise its
# (a, b, R^2, Var_a, Var_b) result is computed and silently discarded.
plot_scatter(df=res_com_ind_log, predictor='log_total_pop', predicted='log_com_num_establishments')
linreg(X=res_com_ind_log['log_total_pop'], Y=res_com_ind_log['log_com_num_establishments'])

In [None]:
linreg(X=res_com_ind_log['log_total_pop'], Y=res_com_ind_log['log_ind_elec_mwh'])

In [None]:
linreg(X=res_com_ind_log['log_ind_num_establishments'], Y=res_com_ind_log['log_ind_elec_mwh'])

In [None]:
linreg(X=res_com_ind_log['log_total_pop'], Y=res_com_ind_log['log_ind_num_establishments'])

In [None]:
linreg(X=res_com_ind_log['log_total_pop'], Y=res_com_ind_log['log_res_elec_mwh'])

In [None]:
linreg(X=res_com_ind_log['log_total_pop'], Y=res_com_ind_log['log_res_gas_mcf'])