# Project M9: Exoplanet detection using the transit method

Marko Raidlo, Raido Everest

In [None]:
# We will uniformly use random state = 3 for replicability of results.

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Default figure size and font size for every plot in this notebook.
plt.rcParams.update({
    'figure.figsize': (16, 10),
    'font.size': 30,
})
#plt.rcParams['figure.dpi'] = 1000

import numpy as np
import pandas as pd
from scipy import ndimage, fft
from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

## Loading in the datasets

In [None]:
train = pd.read_csv('exoTrain.csv')
test = pd.read_csv('exoTest.csv')

for frame in (train, test):
    # Strip the "FLUX." prefix from every column name (LABEL is unaffected).
    frame.columns = [name.replace("FLUX.", "") for name in frame.columns]
    # Recode the labels: original 1 becomes 0, original 2 becomes 1.
    frame["LABEL"] = frame["LABEL"].replace(1, 0).replace(2, 1)

# Features are every column except LABEL; the target is the LABEL column.
train_X, train_Y = train.drop(columns='LABEL'), train["LABEL"]
test_X, test_Y = test.drop(columns='LABEL'), test["LABEL"]

train_X.head()

## Preprocessing

In [None]:
def fourier(df):
    """Return the magnitude of the discrete Fourier transform of one signal.

    Parameters
    ----------
    df : 1-D array-like (e.g. a pandas Series or NumPy array)
        The samples of a single signal.

    Returns
    -------
    numpy.ndarray
        Absolute values of the FFT coefficients, same length as the input.
    """
    samples = np.asarray(df)
    # Use NumPy's FFT explicitly: `from scipy import fft` binds the scipy.fft
    # *module* in current SciPy releases, which is not callable.
    return np.abs(np.fft.fft(samples, n=samples.size))

def pre_process(df):
    """Turn raw signals into smoothed, standardized frequency features.

    Steps, applied in order:
      1. Fourier magnitude spectrum of every row (via ``fourier``), keeping
         only the first half of the frequency bins.
      2. Row normalization with sklearn's ``normalize`` (default settings).
      3. Gaussian smoothing (sigma=10) along the frequency axis.
      4. Standardization with a ``StandardScaler`` fitted on this data.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample, one column per measurement.

    Returns
    -------
    pandas.DataFrame
        One row per input row and ``df.shape[1] // 2`` columns, with a fresh
        default index and integer column labels.
    """
    # Row-wise magnitude spectrum, collected straight into a 2-D array.
    spectrum = np.array([fourier(row) for row in np.asarray(df, dtype=float)])

    # Keep the first half of the bins (for real-valued input the second half
    # of an FFT magnitude spectrum mirrors the first).
    spectrum = spectrum[:, :spectrum.shape[1] // 2]

    spectrum = normalize(spectrum)

    # Smooth along the frequency axis only. A 2-D gaussian_filter with a
    # scalar sigma would also blur across rows, mixing unrelated samples and
    # making the features depend on row order. This also avoids the
    # deprecated `ndimage.filters` namespace.
    spectrum = ndimage.gaussian_filter1d(spectrum, sigma=10, axis=1)

    # NOTE(review): the scaler is fitted on whichever frame is passed in, so
    # train and test are standardized with their own statistics.
    return pd.DataFrame(StandardScaler().fit_transform(spectrum))

In [None]:
# Raw data: row 1 (titled "Exoplanet") above row 3213 (titled "No exoplanet").
raw_panels = (
    (211, "Exoplanet", 1),
    (212, "No exoplanet", 3213),
)
for position, heading, row in raw_panels:
    plt.subplot(position)
    plt.title(heading)
    train_X.iloc[row].plot()
    plt.ylabel("FLUX")

plt.subplots_adjust(hspace=0.5)
#plt.savefig('fig1.png', dpi = 1000)
plt.show()

In [None]:
# Replace the raw feature frames with their pre-processed versions.
train_X, test_X = pre_process(train_X), pre_process(test_X)

In [None]:
# Same two rows as the raw plot, now showing the pre-processed features.
processed_panels = (
    (211, "Exoplanet", 1),
    (212, "No exoplanet", 3213),
)
for position, heading, row in processed_panels:
    plt.subplot(position)
    plt.title(heading)
    train_X.iloc[row].plot()
    plt.ylabel("Level")
    plt.xlabel("Frequency")

plt.subplots_adjust(hspace=0.5)
#plt.savefig('fig2.png', dpi = 1000)
plt.show()

In [None]:
# Exoplanet star results: a 2x2 grid of processed rows 12, 21, 23 and 32.
for slot, row in enumerate((12, 21, 23, 32), start=1):
    plt.subplot(2, 2, slot)
    train_X.iloc[row].plot()
#plt.savefig('fig3.png', dpi = 1000)

plt.show()

In [None]:
# Non exoplanet star results: a 2x2 grid of processed rows 1111, 2231, 4512 and 1233.
for slot, row in enumerate((1111, 2231, 4512, 1233), start=1):
    plt.subplot(2, 2, slot)
    train_X.iloc[row].plot()
#plt.savefig('fig4.png', dpi = 1000)

plt.show()

## Building models

In [None]:
#Evaluation method:

def _positive_class_scores(model, X, hard_labels):
    """Return continuous positive-class scores for ROC AUC when available."""
    if hasattr(model, "predict_proba"):
        # Assumes binary 0/1 labels, so column 1 is the positive class.
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    # No continuous output available: fall back to the hard predictions.
    return hard_labels


def _split_metrics(model, X, y):
    """Compute the (label, value) metric pairs printed for one data split."""
    predicted = model.predict(X)
    return [
        ("Accuracy:", accuracy_score(y, predicted)),
        ("Precision:", precision_score(y, predicted)),
        ("Recall:", recall_score(y, predicted)),
        ("AUC score:", roc_auc_score(y, _positive_class_scores(model, X, predicted))),
        ("Confusion matrix:", confusion_matrix(y, predicted)),
    ]


def _print_metrics(heading, metrics):
    """Print one split's heading followed by its metrics."""
    print(heading)
    for label, value in metrics:
        if label == "Confusion matrix:":
            # The matrix goes on its own lines below the label.
            print(label)
            print(value)
        else:
            print(label, value)


def evaluate(model):
    """Print classification metrics for a fitted model on train and test data.

    Reads the notebook globals ``train_X``/``train_Y`` and
    ``test_X``/``test_Y``. For each split it prints accuracy, precision,
    recall, ROC AUC and the confusion matrix.

    ROC AUC is computed from continuous scores (``predict_proba`` or
    ``decision_function``) when the model provides them, because hard 0/1
    predictions give only a single-threshold ROC curve.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict``.

    Returns
    -------
    None
    """
    # Compute everything first so a failure prints no partial report.
    train_metrics = _split_metrics(model, train_X, train_Y)
    test_metrics = _split_metrics(model, test_X, test_Y)

    print("------------------Evaluation------------------")
    _print_metrics("On training set:", train_metrics)
    print("----------------------------------------------")
    _print_metrics("On test set:", test_metrics)
    print("--------------End of valuation----------------")

### Logistic Regression

In [None]:
# Logistic regression with the lbfgs solver and the notebook-wide seed.
model_log = LogisticRegression(solver='lbfgs', random_state=3)
model_log.fit(train_X, train_Y)
evaluate(model_log)

### Support Vector Machine

In [None]:
# Linear support vector classifier, capped at 1000 iterations.
model_svm = svm.LinearSVC(max_iter=1000, random_state=3)
model_svm.fit(train_X, train_Y)

evaluate(model_svm)

### K Nearest Neighbours

In [None]:
# 9-nearest-neighbour classifier with distance-weighted votes.
model_knn = KNeighborsClassifier(weights='distance', n_neighbors=9)
model_knn.fit(train_X, train_Y)
evaluate(model_knn)

### Decision tree

In [None]:
# Single decision tree, seeded with the notebook-wide random state.
tree_seed = 3
model_tree = DecisionTreeClassifier(random_state=tree_seed)
model_tree.fit(train_X, train_Y)
evaluate(model_tree)

### Random forest

In [None]:
# Small random forest: 5 trees, each limited to depth 3.
model_forest = RandomForestClassifier(
    n_estimators=5,
    max_depth=3,
    random_state=3,
)
model_forest.fit(train_X, train_Y)
evaluate(model_forest)

### Gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted ensemble of 1000 depth-2 trees with learning rate 1.0.
clf = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=1.0,
    max_depth=2,
    random_state=3,
)
clf.fit(train_X, train_Y)
evaluate(clf)