In [31]:
from __future__ import print_function, division, unicode_literals
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import bootstrap_utils as bu
import pandas as pd
%matplotlib inline

# bigplanet Imports
from bigplanet import data_extraction as de
from bigplanet import big_ml as bml

# Flags to control functionality
make_new_features = True
sklearn_fit = True
verbose = False

data_loc = "../Data"
df = de.aggregate_data(cache=os.path.join(data_loc,"oct_cache_3sig.pkl"))

Reading data from cache: ../Data/oct_cache_3sig.pkl


In [32]:
df[:10]

Unnamed: 0,b_ArgP,b_Ecce,b_Inc,b_LongA,b_Obli,b_RotPer,b_Semim,b_SurfEnFluxEqtide,b_Time,c_ArgP,c_Ecce,c_Inc,c_LongA,c_Time,star_RotPer,star_Time,b_Ecc_Prob,b_Prob,b_Prob_Frac,b_Semi_Prob
0,228.567703,0.008694,6.828153,285.934937,142.994492,1,0.065404,601.379333,0,358.94931,0.215492,3.53881,105.938454,0,83,0,1,0,0.0,0
1,344.377838,0.248188,4.874215,27.626932,131.948273,1,0.055957,244.057358,0,119.007538,0.149377,1.035888,207.652969,0,83,0,1,1,1.0,1
2,92.495224,0.234157,11.132176,166.647781,76.773628,1,0.060348,37.295586,0,30.063465,0.167904,1.503826,346.642883,0,83,0,1,1,1.0,1
3,359.955505,0.33699,11.15999,155.874954,89.419228,1,0.068124,8.883687,0,156.443817,0.305641,6.981581,335.878845,0,83,0,1,0,0.0,0
4,39.886032,0.132845,1.261827,123.406296,88.46286,1,0.058306,299.277252,0,339.44751,0.138724,5.257622,303.403351,0,83,0,1,1,1.0,1
5,122.628563,0.207033,10.706461,223.839401,54.097195,1,0.063623,127.036362,0,251.439407,0.220054,2.039592,43.855385,0,83,0,1,1,0.819339,1
6,97.881447,0.210864,7.676245,241.457428,88.310143,1,0.070746,64.406525,0,205.160843,0.119601,3.018406,61.459843,0,83,0,1,0,0.0,0
7,131.992844,0.104855,15.745926,191.358505,144.244568,1,0.057739,441.384003,0,98.996452,0.250974,7.850578,11.347752,0,83,0,1,1,1.0,1
8,274.558197,0.34244,14.496427,39.190773,108.370056,1,0.061679,102.051262,0,87.160019,0.148541,1.135495,219.197449,0,83,0,1,0,0.0,0
9,193.0961,0.065192,19.059734,84.568619,95.62886,1,0.060717,203.68541,0,80.535759,0.008298,2.794156,264.535126,0,83,0,1,1,1.0,1


### Make some new (physically motivated) features

In [33]:
# Transform ArgP, LongA to sin of those quantities to handle edge cases
# for all bodies
df["b_sinArgP"] = pd.Series(np.sin(df["b_ArgP"]), index=df.index)
df["c_sinArgP"] = pd.Series(np.sin(df["c_ArgP"]), index=df.index)
df["b_sinLongA"] = pd.Series(np.sin(df["b_LongA"]), index=df.index)
df["c_sinLongA"] = pd.Series(np.sin(df["c_LongA"]), index=df.index)

# Add Delauney variables DelG, DelH defined as follows:
# DelG ~ sqrt(1 - e^2)
# DelH ~ sqrt(1 - e^2)cosi
# Note I drop other constants as they just scale it
df["b_DelG"] = pd.Series(np.sqrt(1.0 - df["b_Ecce"]**2), index=df.index)
df["c_DelG"] = pd.Series(np.sqrt(1.0 - df["c_Ecce"]**2), index=df.index)
df["b_DelH"] = pd.Series(np.cos(df["b_Inc"])*np.sqrt(1.0 - df["b_Ecce"]**2), index=df.index)
df["c_DelH"] = pd.Series(np.cos(df["c_Inc"])*np.sqrt(1.0 - df["c_Ecce"]**2), index=df.index)

# Add mixed Delauney variables like sqrt(1 - e_b^2)cos(i_c)
df["bc_DelH"] = pd.Series(np.cos(df["b_Inc"])*np.sqrt(1.0 - df["c_Ecce"]**2), index=df.index)
df["cb_DelH"] = pd.Series(np.cos(df["c_Inc"])*np.sqrt(1.0 - df["b_Ecce"]**2), index=df.index)

# Add differences of variables, like abs diff between angles, ecc, inc, so on
df["Abs_Ecce_Diff"] = pd.Series(np.fabs(df["b_Ecce"] - df["c_Ecce"]), index=df.index)
df["Abs_Inc_Diff"] = pd.Series(np.fabs(df["b_Inc"] - df["c_Inc"]), index=df.index)
df["Abs_LongA_Diff"] = pd.Series(np.fabs(df["b_LongA"] - df["c_LongA"]), index=df.index)
df["Abs_ArgP_Diff"] = pd.Series(np.fabs(df["b_ArgP"] - df["c_ArgP"]), index=df.index)

In [34]:
df[:5]

Unnamed: 0,b_ArgP,b_Ecce,b_Inc,b_LongA,b_Obli,b_RotPer,b_Semim,b_SurfEnFluxEqtide,b_Time,c_ArgP,...,b_DelG,c_DelG,b_DelH,c_DelH,bc_DelH,cb_DelH,Abs_Ecce_Diff,Abs_Inc_Diff,Abs_LongA_Diff,Abs_ArgP_Diff
0,228.567703,0.008694,6.828153,285.934937,142.994492,1,0.065404,601.379333,0,358.94931,...,0.999962,0.976506,0.855112,-0.900476,0.835053,-0.922106,0.206798,3.289343,179.996483,130.381607
1,344.377838,0.248188,4.874215,27.626932,131.948273,1,0.055957,244.057358,0,119.007538,...,0.968712,0.98878,0.15608,0.504043,0.159313,0.493813,0.098811,3.838327,180.026037,225.3703
2,92.495224,0.234157,11.132176,166.647781,76.773628,1,0.060348,37.295586,0,30.063465,...,0.972199,0.985803,0.132392,0.06597,0.134244,0.06506,0.066253,9.62835,179.995102,62.431759
3,359.955505,0.33699,11.15999,155.874954,89.419228,1,0.068124,8.883687,0,156.443817,...,0.941508,0.952147,0.154103,0.729225,0.155844,0.721077,0.031349,4.178409,180.003891,203.511688
4,39.886032,0.132845,1.261827,123.406296,88.46286,1,0.058306,299.277252,0,339.44751,...,0.991137,0.990331,0.301382,0.513603,0.301137,0.514021,0.005879,3.995795,179.997055,299.561478


In [35]:
# Import models
# LinearRegression == OLS
from sklearn.linear_model import LinearRegression

### Convert the data into numpy arrays to play nice with sklearn

Also remove some useless features and NaNs

In [36]:
# Create data, filter NaNs
features = ['b_Ecce', 'b_Inc','b_Semim','c_Ecce','c_Inc','b_sinArgP',
        'c_sinArgP', 'b_sinLongA', 'c_sinLongA', 'b_DelG', 'c_DelG','b_DelH','c_DelH',
        'bc_DelH','cb_DelH','Abs_Ecce_Diff','Abs_Inc_Diff','Abs_LongA_Diff','Abs_ArgP_Diff']
target = "b_Prob_Frac"

X, y, names = bml.extract_features(df, features, target)

In [37]:


# Create data, filter NaNs
features = ['b_Ecce', 'b_Inc','b_Semim','c_Ecce','c_Inc','b_sinArgP',
        'c_sinArgP', 'b_sinLongA', 'c_sinLongA', 'b_DelG', 'c_DelG','b_DelH','c_DelH',
        'bc_DelH','cb_DelH','Abs_Ecce_Diff','Abs_Inc_Diff','Abs_LongA_Diff','Abs_ArgP_Diff']
target = "b_Prob_Frac"

# turns them into arrays
X, y, names = bml.extract_features(df, features, target)

# Scale data to have 0 mean, unit standard deviation
X_scaled = bml.scale_data(X)

In [38]:
# Bootstrapping Parameters
nboots = 100
seed = 1

# Use simple linear model
est = LinearRegression()

fit_mean, fit_std = bu.bootstrap_error_estimate(est, X_scaled, y, nboots=nboots, seed=seed)
print(r"R^2 = :",est.score(X_scaled,y))
plot = False
if plot:
    # Project into 2d space
    fig, ax = plt.subplots()

    xind = 1
    yind = 2

    cax = ax.scatter(X[:,xind],X[:,yind], c=fit_std, edgecolor="none")
    cbar = fig.colorbar(cax)

    # Format
    #ax.set_xlabel("c Eccentricity")
    #ax.set_ylabel("c Inclination [degrees]")
    ax.set_xlim(X[:,xind].min(),X[:,xind].max())
    ax.set_ylim(X[:,yind].min(),X[:,yind].max())

    # fig.tight_layout()

    plt.show()

R^2 = : 0.572619899178


In [39]:
est = ()

fit_mean, fit_std = bu.bootstrap_error_estimate(est, X_scaled, y, nboots=nboots, seed=seed)
print(r"R^2 = :",est.score(X_scaled,y))

AttributeError: 'tuple' object has no attribute 'fit'

In [43]:
col_names = ['est','training MSE','testing MSE', r'training $R^2$', r'testing $R^2$', 'Mean std']
row_names = ['OLS','Ridge']

AssertionError: 6 columns passed, passed data had 5 columns

In [51]:
data = [['OLS',1, 1, 1, 1, 1],['RR', 1, 1, 1, 1, 1]]

In [53]:
table = pd.DataFrame(data=data, columns=col_names)

In [55]:
table.to_latex()

u'\\begin{tabular}{llrrrrr}\n\\toprule\n{} &  est &  training MSE &  testing MSE &  training \\$R\\textasciicircum2\\$ &  testing \\$R\\textasciicircum2\\$ &  Mean std \\\\\n\\midrule\n0 &  OLS &             1 &            1 &               1 &              1 &         1 \\\\\n1 &   RR &             1 &            1 &               1 &              1 &         1 \\\\\n\\bottomrule\n\\end{tabular}\n'