# Full Raw View Data Creation Notebook

### 1. Installing and Importing the necessary libraries

In [None]:
# INSTALLS
!pip install lightkurve

In [None]:
# IMPORTS
# lightkurve provides the light-curve search/download used as lk.search_lightcurve below
import lightkurve as lk

import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
# Let pandas show up to 100 rows before truncating a displayed frame
pd.set_option('display.max_rows', 100)

# IterativeImputer is experimental in scikit-learn: the enable_* import has to
# come first, otherwise the IterativeImputer import on the next line fails.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import linear_model

import numpy as np

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any deprecation or copy warnings raised by the cells below.
warnings.filterwarnings("ignore")

### 2. Processing the KIC data

In [None]:
# READING THE CSV
# The first 9 lines of the file are skipped before the header row is read.
data = pd.read_csv('Kepler_Data_Raw.csv', skiprows=9)

# Removing 'candidate' rows. na=True flags rows with a missing disposition for
# removal as well, so only rows with a known, non-candidate label are kept.
is_candidate = data['koi_disposition'].str.contains('CANDIDATE', na=True)
data = data[~is_candidate].copy()

# Encode the label column only: CONFIRMED -> 1, FALSE POSITIVE -> 0.
# A frame-wide replace would also rewrite any other column that happens to
# contain these strings.
data['koi_disposition'] = data['koi_disposition'].replace({'CONFIRMED': 1, 'FALSE POSITIVE': 0})

In [None]:
# BINARY CLASSIFICATION DF
# One row per kepid. The target is the minimum koi_disposition over all rows
# of that kepid, so (assuming the column is already 0/1 encoded) a single 0
# among a star's entries makes the whole star a 0.
# 'min' is passed by name so pandas uses its own groupby reduction; handing it
# the Python builtin is deprecated in recent pandas releases.
binaryData = data.groupby('kepid').agg(target=('koi_disposition', 'min')).reset_index()

### 3. Main Data Creation Loop

In [None]:
# DOWNLOADING AND PROCESSING THE LIGHTKURVES
#
# For each of the first 500 rows of binaryData: download every long-cadence
# Kepler light curve for that kepid, stitch/clean/normalize it, bin it with a
# bin size of 0.5 starting at time 130, fill the missing flux values and store
# the resulting flux list together with the star's target label.
# A kepid that fails at any step is reported and skipped.

fluxData = []
targetData = []

# kepid and target live on the same binaryData row, so walk them together
# instead of searching the frame again for every id.
subset = binaryData.iloc[0:500]

for i, (kepid, target) in enumerate(zip(subset['kepid'], subset['target'])):
  try:

    # FORMATTING THE KEPID STRING AND DOWNLOADING THE DATA FROM LIGHTKURVE
    KIC = 'KIC ' + str(kepid)
    lc = lk.search_lightcurve(KIC, author='kepler', cadence='long').download_all()

    # PROCESSING THE LIGHTKURVE
    lc_cleaned = lc.stitch().remove_outliers().remove_nans().normalize()
    lc_cleaned_bin = lc_cleaned.bin(time_bin_size = 0.5, time_bin_start = 130)
    df = lc_cleaned_bin.to_pandas().reset_index()
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of df.
    time_flux_df = df[['time', 'flux']].copy()
    time_flux_df['time'] = pd.to_datetime(time_flux_df['time'])
    time_flux_df = time_flux_df.set_index('time')

    # APPLYING MICE FOR TIME-SERIES IMPUTATION
    df_mice = time_flux_df.filter(['flux'], axis=1).copy()
    # Define MICE Imputer and fill missing values
    # NOTE(review): df_mice has a single column, so the imputer has no other
    # feature to regress on; this presumably falls back to a constant fill
    # (the repeated leading values in the output table suggest so) - confirm.
    mice_imputer = IterativeImputer(estimator=linear_model.BayesianRidge(), n_nearest_features=None, imputation_order='ascending')
    df_mice_imputed = pd.DataFrame(mice_imputer.fit_transform(df_mice), columns=df_mice.columns)

    # APPENDING THE DATA TO LISTS
    # Both appends happen last, so a failure above never leaves the two lists
    # with different lengths.
    fluxData.append(df_mice_imputed['flux'].to_list())
    targetData.append(target)

    print('Successful: ', i)

  except Exception as exc:
    # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working,
    # and the reason for the failure is reported instead of being discarded.
    print('Not successful: ', i, '-', type(exc).__name__, exc)
    
  

Successful:  0
Successful:  1
Successful:  2
Successful:  3
Successful:  4


### 4. Processing the data

In [None]:
# PROCESSING DATA

# CONVERTING ALL THE DATA INTO A SINGLE DF
# One row per entry of fluxData, one column per flux sample, plus the label.
# Rows shorter than the longest one are padded with NaN by the constructor.
fullLightkurveData = pd.DataFrame(fluxData)
fullLightkurveData['target'] = targetData

# COUNTING HOW MANY ROWS DF
# NOTE: despite its name, unsuccessful_percent holds the share of attempted
# kepids that ended up as a row in the frame (name kept for compatibility).
attempted = binaryData['kepid'].iloc[0:500].shape[0]
# Guard against an empty binaryData, where the ratio is undefined.
unsuccessful_percent = round(100*(fullLightkurveData.shape[0] / attempted)) if attempted else 0
print('There is ', unsuccessful_percent, '% of the original data which could be used')

# REMOVING ALL NANS FROM THE DF
nan_df = fullLightkurveData[fullLightkurveData.isna().any(axis=1)]
total_rows = fullLightkurveData.shape[0]
# Guard against an empty frame (no kepid was processed successfully).
nan_percent = round(100*(nan_df.shape[0] / total_rows)) if total_rows else 0
print('There is ', nan_percent, '% of the data which contained nans')
fullLightkurveData = fullLightkurveData.dropna()
fullLightkurveData

There is  100 % of the original data which could be used
There is  0 % of the data which contained nans


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2914,2915,2916,2917,2918,2919,2920,2921,2922,target
0,0.999498,0.999498,0.999498,1.000843,0.999239,0.998717,0.999009,1.000242,1.000556,1.004655,...,1.002233,1.001194,1.000569,1.000624,1.000572,1.00041,1.000286,0.999472,0.999181,1
1,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,...,0.999043,0.999329,0.999545,0.999879,0.999941,1.000078,1.000191,1.000228,1.00079,0
2,0.999988,0.999988,0.999988,0.998625,0.998938,0.999561,0.999891,1.00023,1.00096,1.001721,...,0.999728,1.000094,1.000466,1.000359,1.000101,1.000118,1.000004,0.999557,0.999436,0
3,0.98575,0.999616,0.999616,0.974079,0.975111,0.977444,0.980781,0.984821,0.988925,0.992545,...,0.999368,0.999984,1.000376,1.000485,1.000632,1.000815,1.001115,1.001891,1.002667,1
4,0.999999,0.999999,0.999999,1.000008,1.000098,1.000089,0.999967,0.999921,0.999992,0.999881,...,1.000167,1.000161,0.999991,0.999776,0.999548,0.999394,0.999586,0.999746,0.999799,0
