In [None]:
!pip install catboost
!pip install lightgbm

In [None]:
# Shell out to nvidia-smi to check the GPU before the GPU-based cells further down are run.
!nvidia-smi

In [None]:
from sklearn.metrics import r2_score,accuracy_score, confusion_matrix, classification_report,precision_recall_fscore_support
import numpy as np
import pandas as pd
import sklearn
import dask_ml.model_selection as dcv

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm, CatBoostClassifier

from sklearn.model_selection import RepeatedKFold
from cuml.model_selection import train_test_split
from sklearn.model_selection import train_test_split as skl_train_test_split
from sklearn.preprocessing import RobustScaler

from cuml import preprocessing
import cuml

from cuml.ensemble import RandomForestClassifier as cuRFC
from cuml.neighbors import KNeighborsRegressor
from cuml.ensemble import RandomForestRegressor as curfc


cuml.set_global_output_type('numpy')
import cudf
import numpy as np
from joblib import dump, load
import warnings
warnings.filterwarnings('ignore')

import cupy
import matplotlib.pyplot as plt
from matplotlib import pyplot
plt.rcParams["figure.figsize"] = (20,20)


In [None]:
# Read the SDSS DR15 CSV and keep only the columns used in this notebook.
list_features = ['psfMag_u', 'psfMag_g','psfMag_r','psfMag_i','psfMag_z','cModelMag_r','w1mpro','w2mpro','w3mpro','w4mpro','class','z_photo','z_spec']
# .copy() makes df_train an independent frame, so the column assignments below
# are not writes into a selection of the frame returned by read_csv.
df_train = pd.read_csv('./Data/SDSS_DR15_new_PCunha.csv')[list_features].copy()

print('Data selected from CSV file.\n')

# resolved_r = r-band PSF magnitude minus r-band cModel magnitude
# (added to simulate the features of Clarke et al.).
df_train['resolved_r'] = df_train['psfMag_r'] - df_train['cModelMag_r']

# Encode the class labels as integers.
le = sklearn.preprocessing.LabelEncoder()
df_train['class'] = le.fit_transform(df_train['class'])
# le.classes_[i] is the original label that was encoded as i. Reading it
# directly gives the same array as inverse_transform([0,1,2]) for three
# classes, without raising when the file holds fewer than three.
print('Labels [0,1,2]: ', le.classes_)
print('Data encoded.\n')

In [None]:
def create_colours(data, features):
    """Build every pairwise colour (difference) from the listed columns.

    For each pair of entries of ``features`` taken in list order (earlier entry
    first), one column ``data[first] - data[second]`` is produced and named
    ``"<first>-<second>"``.

    Args:
        data (DataFrame): Dataframe with photometric data
        features (list): List of features from photometric data

    Returns:
        DataFrame: ``len(data)`` rows and ``F*(F-1)/2`` float columns, where
        ``F = len(features)``. The result has a fresh 0..N-1 index; the index
        of ``data`` is not carried over.
    """
    n_rows = len(data)
    n_feat = len(features)

    # Index pairs (i, j) with i < j. Plain integer ranges replace the former
    # np.linspace(..., dtype=int) loops, which depended on float truncation and
    # produced an out-of-range index (F) that only the i < j test filtered out.
    pairs = [(i, j) for i in range(n_feat) for j in range(i + 1, n_feat)]

    values = np.zeros((n_rows, len(pairs)))
    names = []
    for col, (i, j) in enumerate(pairs):
        # Values are written positionally into the pre-allocated float array.
        values[:, col] = data[features[i]] - data[features[j]]
        names.append(features[i] + '-' + features[j])

    return pd.DataFrame(values, columns=names)

def create_ratio(data, features):
    """Build every pairwise ratio from the listed columns.

    For each pair of entries of ``features`` taken in list order (earlier entry
    first), one column ``data[first] / data[second]`` is produced and named
    ``"<first>/<second>"``. Denominators are not checked for zeros.

    Args:
        data (DataFrame): Dataframe with photometric data
        features (list): List of features from photometric data

    Returns:
        DataFrame: ``len(data)`` rows and ``F*(F-1)/2`` float columns, where
        ``F = len(features)``. The result has a fresh 0..N-1 index; the index
        of ``data`` is not carried over.
    """
    n_rows = len(data)
    n_feat = len(features)

    # Index pairs (i, j) with i < j. Plain integer ranges replace the former
    # np.linspace(..., dtype=int) loops, which depended on float truncation and
    # produced an out-of-range index (F) that only the i < j test filtered out.
    pairs = [(i, j) for i in range(n_feat) for j in range(i + 1, n_feat)]

    values = np.zeros((n_rows, len(pairs)))
    names = []
    for col, (i, j) in enumerate(pairs):
        # Values are written positionally into the pre-allocated float array.
        values[:, col] = data[features[i]] / data[features[j]]
        names.append(features[i] + '/' + features[j])

    df_ratios = pd.DataFrame(values, columns=names)
    return df_ratios

# Columns that are combined pairwise into colour features.
list_col = ['psfMag_u', 'psfMag_g','psfMag_r','psfMag_i','psfMag_z','w1mpro','w2mpro','w3mpro','w4mpro']
df_colours = create_colours(df_train, list_col)
#df_ratio = create_ratio(df_train,list_col)

# Photo-z flags: the train and test files are read separately, given the same
# two column names and indexed by 'index', then stacked and ordered by index.
flag_photo_z_train = pd.read_csv('./flag_train_z_photo.csv')
flag_photo_z_train.columns = ['index', 'label_photo_z']
flag_photo_z_train = flag_photo_z_train.set_index('index')

flag_photo_z_test = pd.read_csv('./flag_test_z_photo.csv')
flag_photo_z_test.columns = ['index', 'label_photo_z']
flag_photo_z_test = flag_photo_z_test.set_index('index')

flag_photo = pd.concat([flag_photo_z_train, flag_photo_z_test], axis=0).sort_index()

# Predicted z_spec, indexed the same way as the flags.
z_spec = pd.read_csv('./oof_z_spec_pred.csv')
z_spec.columns = ['index', 'pred_z_spec']
z_spec = z_spec.set_index('index')

# Put photometry and colours side by side, then attach the flag and the
# predicted redshift by index. merge() is left at its default join type.
df = pd.concat([df_train, df_colours.reindex(df_train.index)], axis=1, sort=False)
#df = pd.concat([df,df_ratio.reindex(df.index)], axis=1, sort=False)
df = df.merge(flag_photo, left_index=True, right_index=True)
df = df.merge(z_spec, left_index=True, right_index=True)

# Model inputs: every column of df except the ones removed below.
features = df.columns.values.tolist()
for excluded in ('class', 'cModelMag_r', 'z_photo', 'z_spec', 'pred_z_spec', 'label_photo_z'):
    features.remove(excluded)

targets = ['class']

In [None]:
def metric_scores(x,y,model_name='RF'):
    """Print the classification report and the confusion matrix for predictions.

    Args:
        x: Reference labels, passed as the first argument to
            ``classification_report`` and ``confusion_matrix``.
        y: Predicted labels, passed as the second argument.
        model_name (str): Name shown in the report header. Defaults to 'RF',
            which reproduces the previously hard-coded header, so existing
            two-argument calls print exactly what they did before.
    """
    print("Classification Report " + model_name + ": \n", classification_report(x, y, digits=5))
    print("Confusion Matrix: \n", confusion_matrix(x, y))

    
import seaborn as sns
def plot_feature_importance(importance,names,model_type):
    """Draw a bar chart of feature importances, sorted from largest to smallest.

    Args:
        importance: Sequence of importance values, one per feature.
        names: Sequence of feature names, in the same order as ``importance``.
        model_type (str): Model name, used as the prefix of the plot title.
    """
    # Pair each name with its importance and rank by descending importance.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    # New 10x8 figure; importance goes on the x-axis, names on the y-axis.
    plt.figure(figsize=(10,8))
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])

    plt.title(model_type + ' Feature Importance')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Names')

In [None]:
#df[features] = RobustScaler().fit_transform(df[features])

In [None]:
# Convert the assembled frame to cuDF and split it for the GPU-based models:
# 30% of the rows are held out, shuffled with a fixed seed.
df_cudf = cudf.from_pandas(df)
X_train, X_test, y_train, y_test = train_test_split(
    df_cudf[features],
    df_cudf[targets],
    test_size=0.3,
    shuffle=True,
    random_state=0,
)

In [None]:
# Pandas float32 copy of the held-out labels; passed as the reference labels to metric_scores.
clf_test = y_test.to_pandas().astype(dtype=np.float32)

# Classification using photometric data


## KNN implementation


In [None]:
# cuML k-nearest-neighbours classifier; features and labels are cast to
# float32 before fitting.
n_neighbors = 20
knn_clf = cuml.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
knn_clf.fit(
    X_train.astype(np.float32),
    y_train['class'].astype(np.float32),
)

# Predict the held-out split and print the report and confusion matrix.
knn_pred = knn_clf.predict(X_test.astype(np.float32))
metric_scores(clf_test, knn_pred)

## Random Forest implementation


In [None]:
# cuML random forest classifier with a fixed seed.
n_estimators = 500
# NOTE(review): max_depth is assigned here but is not passed to cuRFC below,
# so it has no effect on the fitted forest — confirm whether
# max_depth=max_depth was intended.
max_depth = 10

model = cuRFC(
    n_estimators=n_estimators,
    n_streams=1,
    random_state=24,
    split_algo=0,
)
model.fit(
    X_train.astype(np.float32),
    y_train['class'].astype(np.int32),
    convert_dtype=True,
)

# Predict the held-out split and print the report and confusion matrix.
rf_pred = model.predict(X_test.astype(np.float32))
metric_scores(clf_test, rf_pred)

## LightGBM Implementation


In [None]:
# LightGBM multiclass classifier; the cuDF splits are converted to pandas
# before being handed to LightGBM.
lgb_clf = lgb.LGBMClassifier(
    n_estimators=500,
    objective='multiclass',
    seed=42,
)
lgb_clf.fit(
    X_train.to_pandas().astype(np.float32),
    y_train['class'].to_pandas().astype(np.int32),
)

# Predict the held-out split and print the report and confusion matrix.
lgb_pred = lgb_clf.predict(X_test.to_pandas().astype(np.float32))
metric_scores(clf_test, lgb_pred)

##  XGBoost Implementation


In [None]:
# XGBoost multiclass classifier configured with the GPU histogram tree method
# and GPU predictor; the cuDF splits are converted to pandas for fit/predict.
xgb_clf = xgb.XGBClassifier(
    n_estimators=500,
    n_jobs=-1,
    max_depth=14,
    use_label_encoder=False,
    objective='multi:softprob',
    eval_metric='mlogloss',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=24,
)
xgb_clf.fit(
    X_train.to_pandas().astype(np.float32),
    y_train['class'].to_pandas().astype(np.int32),
)

# Predict the held-out split and print the report and confusion matrix.
xgb_pred = xgb_clf.predict(X_test.to_pandas().astype(np.float32))
metric_scores(clf_test, xgb_pred)

## CatBoost Implementation

In [None]:
# Wrap the pandas copies of each split in CatBoost Pools, labelled with the
# current feature names.
train_pool = Pool(
    X_train.to_pandas(),
    y_train.to_pandas(),
    feature_names=features,
)
test_pool = Pool(
    X_test.to_pandas(),
    y_test.to_pandas(),
    feature_names=features,
)

In [None]:
# CatBoost classifier trained on the GPU from the training pool.
cb_clf = CatBoostClassifier(
    n_estimators=500,
    max_depth=10,
    task_type="GPU",
    random_state=42,
    verbose=0,
)
cb_clf.fit(train_pool)

# Predict the held-out split and print the report and confusion matrix.
cb_pred = cb_clf.predict(X_test.to_pandas().astype(np.float32))
metric_scores(clf_test, cb_pred)

In [None]:
# Plot the fitted CatBoost model's feature importances against the feature names.
plot_feature_importance(
    cb_clf.get_feature_importance(),
    features,
    'CatBoost',
)

In [None]:
cb_clf = CatBoostClassifier(n_estimators = 500,task_type="GPU", random_state= 42, verbose=0)

# Feature Selection
# Recursive elimination by SHAP values down to 20 features, evaluated on the
# test pool. The candidate range is derived from the pool itself: the
# previously hard-coded '0-47' assumes exactly 48 feature columns and points
# past the last column whenever the pool holds fewer.
summary = cb_clf.select_features(
    train_pool,
    eval_set=test_pool,
    features_for_select=f'0-{train_pool.num_col() - 1}',
    num_features_to_select=20,
    steps=28,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    train_final_model=False,
    logging_level='Silent',
    plot=True
)

# Classification using photometric data and predicted z

In [None]:
# Rebuild the feature list from df; 'pred_z_spec' is not removed this time,
# so the predicted redshift is one of the model inputs in this section.
features = df.columns.values.tolist()
for excluded in ('class', 'cModelMag_r', 'z_photo', 'z_spec', 'label_photo_z'):
    features.remove(excluded)

# Same split settings as before: 30% held out, shuffled with seed 0.
X_train, X_test, y_train, y_test = train_test_split(
    df_cudf[features],
    df_cudf[targets],
    test_size=0.3,
    shuffle=True,
    random_state=0,
)

# Pandas float32 copy of the held-out labels; passed as the reference labels to metric_scores.
clf_test = y_test.to_pandas().astype(dtype=np.float32)

## KNN

In [None]:
# cuML k-nearest-neighbours classifier; features and labels are cast to
# float32 before fitting.
n_neighbors = 20
knn_clf = cuml.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
knn_clf.fit(
    X_train.astype(np.float32),
    y_train['class'].astype(np.float32),
)

# Predict the held-out split and print the report and confusion matrix.
knn_pred = knn_clf.predict(X_test.astype(np.float32))
metric_scores(clf_test, knn_pred)

## RF

In [None]:
# cuML random forest classifier with a fixed seed.
n_estimators = 500
# NOTE(review): max_depth is assigned here but is not passed to cuRFC below,
# so it has no effect on the fitted forest — confirm whether
# max_depth=max_depth was intended.
max_depth = 10

model = cuRFC(
    n_estimators=n_estimators,
    n_streams=1,
    random_state=24,
    split_algo=0,
)
model.fit(
    X_train.astype(np.float32),
    y_train['class'].astype(np.int32),
    convert_dtype=True,
)

# Predict the held-out split and print the report and confusion matrix.
rf_pred = model.predict(X_test.astype(np.float32))
metric_scores(clf_test, rf_pred)

## LightGBM

In [None]:
# LightGBM multiclass classifier; the cuDF splits are converted to pandas
# before being handed to LightGBM.
lgb_clf = lgb.LGBMClassifier(
    n_estimators=500,
    objective='multiclass',
    seed=42,
)
lgb_clf.fit(
    X_train.to_pandas().astype(np.float32),
    y_train['class'].to_pandas().astype(np.int32),
)

# Predict the held-out split and print the report and confusion matrix.
lgb_pred = lgb_clf.predict(X_test.to_pandas().astype(np.float32))
metric_scores(clf_test, lgb_pred)

## XGBoost

In [None]:
# XGBoost multiclass classifier configured with the GPU histogram tree method
# and GPU predictor; the cuDF splits are converted to pandas for fit/predict.
xgb_clf = xgb.XGBClassifier(
    n_estimators=500,
    n_jobs=-1,
    max_depth=14,
    use_label_encoder=False,
    objective='multi:softprob',
    eval_metric='mlogloss',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=24,
)
xgb_clf.fit(
    X_train.to_pandas().astype(np.float32),
    y_train['class'].to_pandas().astype(np.int32),
)

# Predict the held-out split and print the report and confusion matrix.
xgb_pred = xgb_clf.predict(X_test.to_pandas().astype(np.float32))
metric_scores(clf_test, xgb_pred)

## CatBoost

In [None]:
# Wrap the pandas copies of each split in CatBoost Pools, labelled with the
# current feature names.
train_pool = Pool(
    X_train.to_pandas(),
    y_train.to_pandas(),
    feature_names=features,
)
test_pool = Pool(
    X_test.to_pandas(),
    y_test.to_pandas(),
    feature_names=features,
)

In [None]:
# CatBoost classifier trained on the GPU from the training pool.
cb_clf = CatBoostClassifier(
    n_estimators=500,
    max_depth=10,
    task_type="GPU",
    random_state=42,
    verbose=0,
)
cb_clf.fit(train_pool)

# Predict the held-out split and print the report and confusion matrix.
cb_pred = cb_clf.predict(X_test.to_pandas().astype(np.float32))
metric_scores(clf_test, cb_pred)

In [None]:
# Plot the fitted CatBoost model's feature importances against the feature names.
plot_feature_importance(
    cb_clf.get_feature_importance(),
    features,
    'CatBoost',
)

In [None]:
cb_clf = CatBoostClassifier(n_estimators = 500,task_type="GPU", random_state= 42, verbose=0)

# Feature Selection
# Recursive elimination by SHAP values down to 20 features, evaluated on the
# test pool. The candidate range is derived from the pool itself: the
# previously hard-coded '0-47' assumes exactly 48 feature columns and points
# past the last column whenever the pool holds fewer.
summary = cb_clf.select_features(
    train_pool,
    eval_set=test_pool,
    features_for_select=f'0-{train_pool.num_col() - 1}',
    num_features_to_select=20,
    steps=28,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    train_final_model=False,
    logging_level='Silent',
    plot=True
)