# Feature Engineering

Baseline: 2.25/

## Ideas
- polynomial 2 - capture interaction features

In [1]:
import time
import sys

import pandas as pd
from tqdm import tqdm
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask.distributed import Client, LocalCluster, progress
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

## Feature Engineering

In [2]:
def haversine_plus(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    #Convert decimal degrees to Radians:
    lon1 = np.radians(lon1)
    lat1 = np.radians(lat1)
    lon2 = np.radians(lon2)
    lat2 = np.radians(lat2)

    #Implementing Haversine Formula: 
    dlon = np.subtract(lon2, lon1)
    dlat = np.subtract(lat2, lat1)

    a = np.add(np.power(np.sin(np.divide(dlat, 2)), 2),  
                          np.multiply(np.cos(lat1), 
                                      np.multiply(np.cos(lat2), 
                                                  np.power(np.sin(np.divide(dlon, 2)), 2))))
    
    haversine = np.multiply(2, np.arcsin(np.sqrt(a)))
    latlon1 = np.subtract(np.multiply(lon1, lat1), np.multiply(lon2, lat2))
    
    return haversine, latlon1


def generate_meta_features(df_tmp):
    '''Function to engineer meta features
    '''
    df_tmp['haversine'], df_tmp['latlon1'] = haversine_plus(df_tmp['ra'].values,
                                                    df_tmp['decl'].values,
                                                    df_tmp['gal_l'].values,
                                                    df_tmp['gal_b'].values)
    
    return df_tmp


def generate_features(df_tmp, df_out):
    '''Function to engineer features
    '''
    # Generate Flux Features
    df_tmp['flux_ratio_sq'] = np.power(df_tmp['flux'] / df_tmp['flux_err'], 2.0)
    df_tmp['flux_by_flux_ratio_sq'] = df_tmp['flux'] * df_tmp['flux_ratio_sq']
    
    '''
    flux & flux_err - by object_id
    '''
    ## By object_id
    flux_mean = df_tmp.groupby(['object_id'])['flux'].mean()  # Mean (NOTE FIGURE OUT HOW TO ADD '_MEAN' to DF)
    flux_median = df_tmp.groupby(['object_id'])['flux'].median() # Median
    flux_std = df_tmp.groupby(['object_id'])['flux'].std()  # Std. Dev.
    flux_max = df_tmp.groupby(['object_id'])['flux'].max()  # Max
    flux_min = df_tmp.groupby(['object_id'])['flux'].min()  # Min
    flux_skew = df_tmp.groupby(['object_id'])['flux'].skew()  # Skew
    flux_kurtosis = df_tmp.groupby(['object_id'])['flux'].apply(pd.DataFrame.kurtosis)  # Kurtosis
    # Flux Err
    flux_err_mean = df_tmp.groupby(['object_id'])['flux_err'].mean()  # Mean (NOTE FIGURE OUT HOW TO ADD '_MEAN' to DF)
    flux_err_median = df_tmp.groupby(['object_id'])['flux_err'].median()  # Median
    flux_err_std = df_tmp.groupby(['object_id'])['flux_err'].std()  # Std. Dev
    flux_err_max = df_tmp.groupby(['object_id'])['flux_err'].max()  # Max
    flux_err_min = df_tmp.groupby(['object_id'])['flux_err'].min()  # Min
    flux_err_skew = df_tmp.groupby(['object_id'])['flux_err'].skew()  # Skew
    flux_err_kurtosis = df_tmp.groupby(['object_id'])['flux_err'].apply(pd.DataFrame.kurtosis)  # Kurtosis
    
    df_out = df_out.join(flux_mean, on='object_id', how='inner', rsuffix='_mean')
    df_out = df_out.join(flux_median, on='object_id', how='inner', rsuffix='_median')
    df_out = df_out.join(flux_std, on='object_id', how='inner', rsuffix='_std')
    df_out = df_out.join(flux_max, on='object_id', how='inner', rsuffix='_max')
    df_out = df_out.join(flux_min, on='object_id', how='inner', rsuffix='_min')
    df_out = df_out.join(flux_skew, on='object_id', how='inner', rsuffix='_skew')
    df_out = df_out.join(flux_kurtosis, on='object_id', how='inner', rsuffix='_kurtosis')

    df_out = df_out.join(flux_err_mean, on='object_id', how='inner', rsuffix='_mean')
    df_out = df_out.join(flux_err_median, on='object_id', how='inner', rsuffix='_median')
    df_out = df_out.join(flux_err_std, on='object_id', how='inner', rsuffix='_std')
    df_out = df_out.join(flux_err_max, on='object_id', how='inner', rsuffix='_max')
    df_out = df_out.join(flux_err_min, on='object_id', how='inner', rsuffix='_min')
    df_out = df_out.join(flux_err_skew, on='object_id', how='inner', rsuffix='_skew')
    df_out = df_out.join(flux_err_kurtosis, on='object_id', how='inner', rsuffix='_kurtosis')

    '''
    flux & flux_err - by object_id, then by passband
    '''
    # Flux
    flux_mean = df_tmp.groupby(['object_id', 'passband'])['flux'].mean().unstack(level='passband').add_suffix('_mean')  # Mean
    flux_median = df_tmp.groupby(['object_id', 'passband'])['flux'].median().unstack(level='passband').add_suffix('_median')  # Median
    flux_std = df_tmp.groupby(['object_id', 'passband'])['flux'].std().unstack(level='passband').add_suffix('_std')  # Std. Dev.
    flux_max = df_tmp.groupby(['object_id', 'passband'])['flux'].max().unstack(level='passband').add_suffix('_max')  # Max
    flux_min = df_tmp.groupby(['object_id', 'passband'])['flux'].min().unstack(level='passband').add_suffix('_min')  # Min
    flux_skew = df_tmp.groupby(['object_id', 'passband'])['flux'].skew().unstack(level='passband').add_suffix('_skew')  # Skew
    flux_kurtosis = df_tmp.groupby(['object_id', 'passband'])['flux'].apply(pd.DataFrame.kurtosis).unstack(level='passband').add_suffix('_kurtosis')  # Kurtosis
    # Flux Err
    flux_err_mean = df_tmp.groupby(['object_id', 'passband'])['flux_err'].mean().unstack(level='passband').add_suffix('_err_mean')  # Mean
    flux_err_median = df_tmp.groupby(['object_id', 'passband'])['flux_err'].median().unstack(level='passband').add_suffix('_err_median')  # Median
    flux_err_std = df_tmp.groupby(['object_id', 'passband'])['flux_err'].std().unstack(level='passband').add_suffix('_err_std')  # Std. Dev
    flux_err_max = df_tmp.groupby(['object_id', 'passband'])['flux_err'].max().unstack(level='passband').add_suffix('_err_max')  # Max
    flux_err_min = df_tmp.groupby(['object_id', 'passband'])['flux_err'].min().unstack(level='passband').add_suffix('_err_min')  # Min
    flux_err_skew = df_tmp.groupby(['object_id', 'passband'])['flux_err'].skew().unstack(level='passband').add_suffix('_err_skew')  # Skew
    flux_err_kurtosis = df_tmp.groupby(['object_id', 'passband'])['flux_err'].apply(pd.DataFrame.kurtosis).unstack(level='passband').add_suffix('_err_kurtosis')  # Kurtosis

    df_out = df_out.join(flux_mean, on='object_id', how='inner')
    df_out = df_out.join(flux_median, on='object_id', how='inner')
    df_out = df_out.join(flux_std, on='object_id', how='inner')
    df_out = df_out.join(flux_max, on='object_id', how='inner')
    df_out = df_out.join(flux_min, on='object_id', how='inner')
    df_out = df_out.join(flux_skew, on='object_id', how='inner')
    df_out = df_out.join(flux_kurtosis, on='object_id', how='inner')

    df_out = df_out.join(flux_err_mean, on='object_id', how='inner')
    df_out = df_out.join(flux_err_median, on='object_id', how='inner')
    df_out = df_out.join(flux_err_std, on='object_id', how='inner')
    df_out = df_out.join(flux_err_max, on='object_id', how='inner')
    df_out = df_out.join(flux_err_min, on='object_id', how='inner')
    df_out = df_out.join(flux_err_skew, on='object_id', how='inner')
    df_out = df_out.join(flux_err_kurtosis, on='object_id', how='inner')

    '''
    detected - by object_id
    '''
    detected_mean = df_tmp.groupby(['object_id'])['detected'].mean()  # Mean (NOTE FIGURE OUT HOW TO ADD '_MEAN' to DF)
    
    df_out = df_out.join(detected_mean, on='object_id', how='inner', rsuffix='_mean')
        
        
    '''
    flux_ratio_sq - by object_id
    '''
    ## By object_id
    flux_rs_mean = df_tmp.groupby(['object_id'])['flux_ratio_sq'].mean()  # Mean (NOTE FIGURE OUT HOW TO ADD '_MEAN' to DF)
    flux_rs_median = df_tmp.groupby(['object_id'])['flux_ratio_sq'].median() # Median
    flux_rs_std = df_tmp.groupby(['object_id'])['flux_ratio_sq'].std()  # Std. Dev.
    flux_rs_max = df_tmp.groupby(['object_id'])['flux_ratio_sq'].max()  # Max
    flux_rs_min = df_tmp.groupby(['object_id'])['flux_ratio_sq'].min()  # Min
    flux_rs_skew = df_tmp.groupby(['object_id'])['flux_ratio_sq'].skew()  # Skew
    flux_rs_kurtosis = df_tmp.groupby(['object_id'])['flux_ratio_sq'].apply(pd.DataFrame.kurtosis)  # Kurtosis
    flux_rs_sum = df_tmp.groupby(['object_id'])['flux_ratio_sq'].sum()  # Sum
    
    df_out = df_out.join(flux_rs_mean, on='object_id', how='inner', rsuffix='_mean')
    df_out = df_out.join(flux_rs_median, on='object_id', how='inner', rsuffix='_median')
    df_out = df_out.join(flux_rs_std, on='object_id', how='inner', rsuffix='_std')
    df_out = df_out.join(flux_rs_max, on='object_id', how='inner', rsuffix='_max')
    df_out = df_out.join(flux_rs_min, on='object_id', how='inner', rsuffix='_min')
    df_out = df_out.join(flux_rs_skew, on='object_id', how='inner', rsuffix='_skew')
    df_out = df_out.join(flux_rs_kurtosis, on='object_id', how='inner', rsuffix='_kurtosis')
    df_out = df_out.join(flux_rs_sum, on='object_id', how='inner', rsuffix='_sum')
    
    '''
    flux_by_flux_ratio_sq - by object_id
    '''
    ## By object_id
    flux_bfrs_mean = df_tmp.groupby(['object_id'])['flux_by_flux_ratio_sq'].mean()  # Mean (NOTE FIGURE OUT HOW TO ADD '_MEAN' to DF)
    flux_bfrs_median = df_tmp.groupby(['object_id'])['flux_by_flux_ratio_sq'].median() # Median
    flux_bfrs_std = df_tmp.groupby(['object_id'])['flux_by_flux_ratio_sq'].std()  # Std. Dev.
    flux_bfrs_max = df_tmp.groupby(['object_id'])['flux_by_flux_ratio_sq'].max()  # Max
    flux_bfrs_min = df_tmp.groupby(['object_id'])['flux_by_flux_ratio_sq'].min()  # Min
    flux_bfrs_skew = df_tmp.groupby(['object_id'])['flux_by_flux_ratio_sq'].skew()  # Skew
    flux_bfrs_kurtosis = df_tmp.groupby(['object_id'])['flux_by_flux_ratio_sq'].apply(pd.DataFrame.kurtosis)  # Kurtosis
    flux_bfrs_sum = df_tmp.groupby(['object_id'])['flux_by_flux_ratio_sq'].sum()  # Sum
    
    df_out = df_out.join(flux_bfrs_mean, on='object_id', how='inner', rsuffix='_mean')
    df_out = df_out.join(flux_bfrs_median, on='object_id', how='inner', rsuffix='_median')
    df_out = df_out.join(flux_bfrs_std, on='object_id', how='inner', rsuffix='_std')
    df_out = df_out.join(flux_bfrs_max, on='object_id', how='inner', rsuffix='_max')
    df_out = df_out.join(flux_bfrs_min, on='object_id', how='inner', rsuffix='_min')
    df_out = df_out.join(flux_bfrs_skew, on='object_id', how='inner', rsuffix='_skew')
    df_out = df_out.join(flux_bfrs_kurtosis, on='object_id', how='inner', rsuffix='_kurtosis')
    df_out = df_out.join(flux_bfrs_sum, on='object_id', how='inner', rsuffix='_sum')
    
    '''
    Diff Features
    '''
    df_out['flux_diff'] = df_out['flux_max'] - df_out['flux_min']
    df_out['flux_diff_mean'] = (df_out['flux_max'] - df_out['flux_min'])/df_out['flux']
    df_out['flux_w_mean'] = df_out['flux_by_flux_ratio_sq_sum'] / df_out['flux_ratio_sq_sum']
    df_out['flux_diff_w_mean'] = (df_out['flux_max'] - df_out['flux_min'])/df_out['flux_w_mean']
    
    '''
    Single Features
    '''
    # mjd_det_diff to separate "one event" objects as supernovae from "cyclic event" objects as cepheids.
    df_det = df_tmp[df_tmp['detected']==1].copy()
    gr_mjd = df_det.groupby('object_id').mjd
    df_out['mjd_det_diff']  = gr_mjd.transform('max') - gr_mjd.transform('min')
    df_out['mjd_det_diff'] = df_out['mjd_det_diff'].fillna(1000)  # 1000 is arbitrary number for not detected
    
    
    # Interpret NaN to be 0
    for c in df_out.columns:
        df_out[c] = df_out[c].fillna(0)
    
    df_out['object_id'] = df_out['object_id'].astype(int)
    
    return df_out

## Train

In [3]:
%%time

df = pd.read_csv('../data/raw/training_set.csv')  # 1 gigs
df_meta = pd.read_csv('../data/cleaned/training_meta_clean.csv')

df_meta = generate_meta_features(df_meta)
df_train = generate_features(df, df_meta)

df_train.to_csv('../data/interim/training_fe.csv', index=False)

print(df_train.columns)
print(df_train.iloc[0, :])

Index(['object_id', 'ra', 'decl', 'gal_l', 'gal_b', 'ddf', 'distmod', 'mwebv',
       'target', 'hostgal_z',
       ...
       'flux_by_flux_ratio_sq_max', 'flux_by_flux_ratio_sq_min',
       'flux_by_flux_ratio_sq_skew', 'flux_by_flux_ratio_sq_kurtosis',
       'flux_by_flux_ratio_sq_sum', 'flux_diff', 'flux_diff_mean',
       'flux_w_mean', 'flux_diff_w_mean', 'mjd_det_diff'],
      dtype='object', length=132)
object_id                         6.150000e+02
ra                                3.490461e+02
decl                             -6.194384e+01
gal_l                             3.207965e+02
gal_b                            -5.175371e+01
ddf                               1.000000e+00
distmod                           3.199610e+01
mwebv                             1.700000e-02
target                            9.200000e+01
hostgal_z                         0.000000e+00
haversine                         3.190063e-01
latlon1                          -1.528827e+00
flux                

## Test

In [4]:
df_meta_test = pd.read_csv('../data/cleaned/testing_meta_clean.csv')
df_meta_test = generate_meta_features(df_meta_test)

# NOTE: Sort test set first

df_test = pd.DataFrame()
chunks = 2500000
remain_df = None
for i, df_chunk in tqdm(enumerate(pd.read_csv('../data/processed/test_set_sorted.csv', chunksize=chunks, iterator=True))):
    # Check object_ids
    # I believe np.unique keeps the order of group_ids as they appear in the file
    unique_ids = np.unique(df_chunk['object_id'])

    new_remain_df = df_chunk.loc[df_chunk['object_id'] == unique_ids[-1]].copy()
    if remain_df is None:
        df_chunk = df_chunk.loc[df_chunk['object_id'].isin(unique_ids[:-1])]
    else:
        df_chunk = pd.concat([remain_df, df_chunk.loc[df_chunk['object_id'].isin(unique_ids[:-1])]], axis=0)
    # Create remaining samples df
    remain_df = new_remain_df

    df_tmp = generate_features(df_chunk, df_meta_test)
    if i == 0:
        df_test = df_tmp
    else:
        df_test = pd.concat([df_test, df_tmp], sort=False)
        
    del df_tmp

# process rest
df_tmp = generate_features(remain_df, df_meta_test)
df_test = pd.concat([df_test, df_tmp], sort=False)
del df_tmp

df_test.to_csv('../data/interim/testing_fe.csv', index=False)

print(len(df_test)) #should be 3492890

df_test.head() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
182it [5:13:04, 96.61s/it] 


3492890


Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,distmod,mwebv,hostgal_z,haversine,...,flux_by_flux_ratio_sq_max,flux_by_flux_ratio_sq_min,flux_by_flux_ratio_sq_skew,flux_by_flux_ratio_sq_kurtosis,flux_by_flux_ratio_sq_sum,flux_diff,flux_diff_mean,flux_w_mean,flux_diff_w_mean,mjd_det_diff
0,13,34.453125,-5.229529,169.987075,-59.956185,1,41.1123,0.019,0.3048,1.851382,...,19077.38,-41.837235,5.396523,31.557895,189634.6,55.445738,13.871398,24.292155,2.282455,1000.0
1,14,33.398438,-4.331149,167.226341,-59.936551,1,42.8774,0.018,0.6323,1.855173,...,1914.014,-30.436253,12.348124,156.645277,5525.817,25.981591,29.389389,6.852393,3.791608,1000.0
2,17,348.529419,-61.75544,321.29398,-51.763351,1,43.6,0.016,0.8297,0.309914,...,1088.702,-109.941461,9.923556,119.717988,4124.4,30.964024,39.143819,5.255113,5.89217,1000.0
3,23,34.804688,-5.829153,171.307861,-60.174401,1,42.964,0.023,0.6533,1.845038,...,2208.811,-69.335479,9.227223,93.100472,8293.673,40.693061,41.934474,9.467365,4.298245,1000.0
4,34,351.321442,-64.198746,317.458993,-50.429931,1,42.054,0.023,0.4557,0.391772,...,1365432.0,-51.048373,10.251332,117.972119,4815012.0,137.715186,30.068359,101.128982,1.361778,1000.0


## Polynomial 2 (Not better with meta, try later with time series)

In [5]:
# # Train
# id_features = df_meta[['object_id', 'ddf', 'target']]
# df_meta = df_meta.drop(['object_id', 'ddf', 'target'], axis=1)

# poly = PolynomialFeatures(2, include_bias=False)
# poly_output = poly.fit_transform(df_meta)
# target_feature_names = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(df_meta.columns,p) for p in poly.powers_]]
# df_meta = pd.DataFrame(poly_output, columns = target_feature_names)

# df_meta = pd.concat([id_features, df_meta],axis=1)

# # Test
# id_features = df_meta_test[['object_id', 'ddf']]
# df_meta_test = df_meta_test.drop(['object_id', 'ddf'], axis=1)

# poly = PolynomialFeatures(2, include_bias=False)
# poly_output = poly.fit_transform(df_meta_test)
# target_feature_names = ['x'.join(['{}^{}'.format(pair[0],pair[1]) for pair in tuple if pair[1]!=0]) for tuple in [zip(df_meta_test.columns,p) for p in poly.powers_]]
# df_meta_test = pd.DataFrame(poly_output, columns = target_feature_names)

# df_meta_test = pd.concat([id_features, df_meta_test],axis=1)