# Setup

In [1]:
import pandas as pd
import numpy as np
import random
from load_data import load_xs_data, load_panel_data

from sklearn.decomposition import PCA
from scipy.stats.kde import gaussian_kde

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Define PCA function

In [2]:
def PCA(data, dim_rescaled_data=None):
    """Project observations onto the principal components of standardized data.

    Each column is standardized (mean removed, divided by its standard
    deviation), the covariance matrix of the standardized columns is
    eigen-decomposed, and the standardized observations are projected onto
    the eigenvectors, ordered by decreasing eigenvalue.

    Note: this definition shadows the ``PCA`` class imported from
    ``sklearn.decomposition`` at the top of the notebook.

    Parameters
    ----------
    data : array-like, shape (m, n)
        Observations in rows, variables in columns. The values are copied
        into a float array, so the caller's object is left untouched and
        integer or object-dtype numeric input is accepted.
    dim_rescaled_data : int, optional
        Number of leading components to keep. ``None`` keeps all of them.

    Returns
    -------
    projected : numpy.ndarray, shape (m, k)
        Standardized observations projected onto the kept eigenvectors.
    evals : numpy.ndarray, shape (n,)
        All eigenvalues in descending order (not truncated to ``k``).
    evecs : numpy.ndarray, shape (n, k)
        Kept eigenvectors as columns, ordered like ``evals``.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional.
    """
    import numpy as np
    from scipy import linalg as LA

    # Work on a private float copy: the in-place standardization below must
    # not leak into the caller's array.
    standardized = np.array(data, dtype=float)
    if standardized.ndim != 2:
        raise ValueError('data must be a 2-D array of shape (m, n)')

    standardized -= standardized.mean(axis=0)
    # A constant column has zero standard deviation, so this division
    # produces non-finite values for it.
    standardized /= standardized.std(axis=0)

    R = np.cov(standardized, rowvar=False)
    evals, evecs = LA.eigh(R)

    # Reorder so the largest eigenvalue (and its eigenvector) comes first.
    order = np.argsort(evals)[::-1]
    evecs = evecs[:, order]
    evals = evals[order]

    if dim_rescaled_data is not None:
        evecs = evecs[:, :dim_rescaled_data]

    return standardized @ evecs, evals, evecs

# Simulate latent factor

## Latent factor

In [3]:
# Seed the stdlib generator so the simulated data (and every number derived
# from it further down) is reproducible on Restart & Run All.
random.seed(0)

# Latent factor: 1000 independent draws from Uniform(0, 100), built in one go
# with an explicit float dtype instead of filling an empty Series cell by cell.
factor = pd.Series(
    [random.uniform(0, 100) for _ in range(0, 1000)],
    index=range(0, 1000),
    dtype=float,
)

## Simulate data

In [4]:
# Loading of each simulated column on the latent factor. The same number is
# also passed to random.gauss as the noise standard deviation of that column.
A = [-2, 1, 100, 20, 5, 7]
simulated_columns = ['lhs', 'rhs1', 'rhs2', 'rhs3', 'rhs4', 'rhs5']

# 'pc1' is only a placeholder here (all NaN); it is filled in a later cell.
# An explicit float dtype keeps the columns numeric instead of object.
data = pd.DataFrame(index=factor.index,
                    columns=simulated_columns + ['pc1'],
                    dtype=float)

latent = factor.astype(float)
for loading, cx in zip(A, simulated_columns):
    # One Gaussian draw per row, in index order, then a single vectorized
    # column assignment instead of one .loc write per cell.
    noise = np.array([random.gauss(0, loading) for _ in data.index])
    data[cx] = latent * loading + noise

### Determine 1st PC

In [5]:
# Regressors only: drop the dependent variable and the pc1 placeholder.
rhs_matrix = data.drop(['lhs', 'pc1'], axis=1).values.astype(float)

# Ask for a single component: without dim_rescaled_data, PCA returns one
# projected column per input variable, which cannot be stored in one column.
# NOTE: despite its name, `scores` receives the eigenvectors (third return
# value of PCA); the name is kept because the next cell reads scores[0, 0].
component_scores, _, scores = PCA(rhs_matrix, dim_rescaled_data=1)

# PCA returns a 2-D array of shape (rows, 1); store it as a 1-D column.
data['pc1'] = component_scores[:, 0]

In [6]:
# Multiply pc1 by the sign of scores[0, 0], so the column is flipped when
# that entry is negative.
# Note: re-running this cell on its own flips the column again in that case.
first_entry_sign = np.sign(scores[0, 0])
data['pc1'] = data['pc1'] * first_entry_sign

## Run regression

In [8]:
# Regression specifications: label -> list of right-hand-side columns.
# Case #1 uses every simulated regressor, cases #2-#6 use one regressor each,
# and case #7 uses the first principal component alone.
all_rhs = list(data.drop(['lhs', 'pc1'], axis=1).columns)
cases = dict([
    ('case #1', all_rhs),
    ('case #2', ['rhs1']),
    ('case #3', ['rhs2']),
    ('case #4', ['rhs3']),
    ('case #5', ['rhs4']),
    ('case #6', ['rhs5']),
    ('case #7', ['pc1']),
])

In [9]:
# Empty (all '') table of formatted results: one pair of rows per explanatory
# variable (coefficient, standard error) and one column per regression case.
explanatory_variables = list(data.drop('lhs', axis=1).columns)
row_index = pd.MultiIndex.from_product(
    [explanatory_variables, ['coefficient', 'standard error']],
    names=['variables', 'info'],
)
results_table = pd.DataFrame(data='', index=row_index, columns=list(cases.keys()))

In [10]:
import statsmodels.api as sm
# Short alias used in the regression cell to address MultiIndex rows of
# results_table, e.g. idx[variable, 'coefficient'].
idx = pd.IndexSlice

  from pandas.core import datetools


In [13]:
# Large-sample (normal) two-sided critical values for |t|, strictest first:
# 1% -> '***', 5% -> '**', 10% -> '*'. The 1% cutoff is 2.576; 2.33 would be
# the one-sided 1% value and is inconsistent with the two-sided 1.96 / 1.645.
SIGNIFICANCE_LEVELS = [(2.576, '***'), (1.96, '**'), (1.645, '*')]

for case, rhs_variables in cases.items():
    # NOTE(review): no constant is added to the regressors, so every case is a
    # regression through the origin - confirm that this is intended.
    # Cast to float so object-dtype columns do not reach the solver.
    regression = sm.OLS(data[['lhs']].values.astype(float),
                        data[rhs_variables].values.astype(float)).fit()
    for ix, rhs_var in enumerate(rhs_variables):
        t_abs = np.abs(regression.tvalues[ix])
        # First cutoff exceeded wins; no marker if none is (also for NaN).
        stars = next((mark for cutoff, mark in SIGNIFICANCE_LEVELS if t_abs > cutoff), '')
        results_table.loc[idx[rhs_var, 'coefficient'], case] = \
            '{:,.2f}{}'.format(regression.params[ix], stars)
        results_table.loc[idx[rhs_var, 'standard error'], case] = \
            '({:,.2f})'.format(regression.bse[ix])

In [14]:
results_table

Unnamed: 0_level_0,Unnamed: 1_level_0,case #1,case #2,case #3,case #4,case #5,case #6,case #7
variables,info,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
rhs1,coefficient,-0.36***,-2.00***,,,,,
rhs1,standard error,(0.06),(0.00),,,,,
rhs2,coefficient,-0.00***,,-0.02***,,,,
rhs2,standard error,(0.00),,(0.00),,,,
rhs3,coefficient,-0.02***,,,-0.10***,,,
rhs3,standard error,(0.00),,,(0.00),,,
rhs4,coefficient,-0.08***,,,,-0.40***,,
rhs4,standard error,(0.01),,,,(0.00),,
rhs5,coefficient,-0.06***,,,,,-0.29***,
rhs5,standard error,(0.01),,,,,(0.00),
