In [1]:
import time

import cudf
import cuml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from xgboost import XGBClassifier

In [2]:
# Read the SDSS sample into a cuDF DataFrame and preview the first rows.
csv_path = './data/Skyserver_SQL2_27_2018 6_51_39 PM.csv'
sdss_df = cudf.read_csv(csv_path, skiprows=0)
sdss_df.head()

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.23765e+18,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,752,301,4,267,3.72236e+18,STAR,-9e-06,3306,54922,491
1,1.23765e+18,183.598371,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,752,301,4,267,3.63814e+17,STAR,-5.5e-05,323,51615,541
2,1.23765e+18,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,752,301,4,268,3.23274e+17,GALAXY,0.123111,287,52023,513
3,1.23765e+18,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,752,301,4,269,3.72237e+18,STAR,-0.000111,3306,54922,510
4,1.23765e+18,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,752,301,4,269,3.72237e+18,STAR,0.00059,3306,54922,512


In [3]:
# Identifier and camera-bookkeeping columns that are not used as features.
unused_columns = ['objid', 'run', 'rerun', 'camcol', 'field', 'specobjid']
sdss_df = sdss_df.drop(columns=unused_columns)
sdss_df.head()

Unnamed: 0,ra,dec,u,g,r,i,z,class,redshift,plate,mjd,fiberid
0,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,STAR,-9e-06,3306,54922,491
1,183.598371,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,STAR,-5.5e-05,323,51615,541
2,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,GALAXY,0.123111,287,52023,513
3,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,STAR,-0.000111,3306,54922,510
4,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,STAR,0.00059,3306,54922,512


In [4]:
# Inspect column dtypes, non-null counts and memory usage before any dtype conversion.
sdss_df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   ra        10000 non-null  float64
 1   dec       10000 non-null  float64
 2   u         10000 non-null  float64
 3   g         10000 non-null  float64
 4   r         10000 non-null  float64
 5   i         10000 non-null  float64
 6   z         10000 non-null  float64
 7   class     10000 non-null  object
 8   redshift  10000 non-null  float64
 9   plate     10000 non-null  int64
 10  mjd       10000 non-null  int64
 11  fiberid   10000 non-null  int64
dtypes: float64(8), int64(3), object(1)
memory usage: 946.4+ KB


In [5]:
# Build a {column: dtype} mapping that down-casts every column except the string
# label "class" to float32, which reduces the frame's memory footprint.
df_cols = [col for col in sdss_df.columns if col != "class"]
convert_data_type = {col: "float32" for col in df_cols}

# Display the mapping as the cell output. (Previously this expression was indented
# inside the loop, so it was evaluated on every iteration and never shown.)
convert_data_type

In [6]:
# Apply the per-column dtype mapping; "class" is not in the mapping, so it keeps its dtype.
sdss_df = sdss_df.astype(convert_data_type, copy=False)

In [7]:
# Re-inspect the frame to confirm the numeric columns are now float32.
sdss_df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   ra        10000 non-null  float32
 1   dec       10000 non-null  float32
 2   u         10000 non-null  float32
 3   g         10000 non-null  float32
 4   r         10000 non-null  float32
 5   i         10000 non-null  float32
 6   z         10000 non-null  float32
 7   class     10000 non-null  object
 8   redshift  10000 non-null  float32
 9   plate     10000 non-null  float32
 10  mjd       10000 non-null  float32
 11  fiberid   10000 non-null  float32
dtypes: float32(11), object(1)
memory usage: 516.7+ KB


objid and specobjid are just identifiers for accessing the rows back when they were stored in the original databank. Therefore we will not need them for classification as they are not related to the outcome.

Furthermore, the features 'run', 'rerun', 'camcol' and 'field' describe the state of the camera at the moment the observation was made; e.g. 'run' identifies the scan that captured the object.

These columns were dropped above because any correlation with the outcome would be coincidental.

Source: http://www.sdss3.org/dr9/imaging/imaging_basics.php

In [8]:
# Encode the string class labels as integer codes and store them back in the
# "class" column (in the preview below, e.g. GALAXY -> 0 and STAR -> 2).
# `cuml` is imported in the notebook's top import cell rather than mid-notebook.
le = cuml.preprocessing.LabelEncoder()

y_encoded = le.fit_transform(sdss_df['class'])
sdss_df['class'] = y_encoded

sdss_df.head()

Unnamed: 0,ra,dec,u,g,r,i,z,class,redshift,plate,mjd,fiberid
0,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,2,-9e-06,3306.0,54922.0,491.0
1,183.598373,0.135285,18.6628,17.214491,16.676371,16.48922,16.3915,2,-5.5e-05,323.0,51615.0,541.0
2,183.680206,0.126185,19.38298,18.19169,17.474279,17.08732,16.80125,0,0.123111,287.0,52023.0,513.0
3,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,2,-0.000111,3306.0,54922.0,510.0
4,183.883286,0.102557,17.550249,16.26342,16.43869,16.55492,16.61326,2,0.00059,3306.0,54922.0,512.0


## Data Normalization (scaling)

In [9]:
# cuML's MinMaxScaler is used here, so the sklearn `preprocessing` import is not needed.
scaler = cuml.preprocessing.MinMaxScaler()

In [10]:
# Fit the scaler on, and rescale, every column except the label "class".
# NOTE(review): the scaler is fit on the full dataset before the train/test split below,
# so test rows influence the scaling parameters — consider fitting on the training split only.
sdss = scaler.fit_transform(sdss_df.drop('class', axis=1))

In [84]:
# Hold out 10% of the rows for testing. A fixed random_state makes the shuffle — and
# therefore every accuracy reported further down — reproducible across kernel restarts.
X_train, X_test, y_train, y_test = cuml.model_selection.train_test_split(
    sdss, sdss_df['class'], test_size=0.1, random_state=42
)

## KNN

In [12]:
# cuML k-nearest-neighbours classifier, constructed with its default hyperparameters.
knn = cuml.neighbors.KNeighborsClassifier()

In [13]:
# Time the fit on the training split.
training_start = time.perf_counter()
knn.fit(X_train, y_train)
training_end = time.perf_counter()
knn_train_time = training_end - training_start

# Time inference on the held-out split.
prediction_start = time.perf_counter()
preds = knn.predict(X_test)
prediction_end = time.perf_counter()
knn_prediction_time = prediction_end - prediction_start

# Accuracy as the percentage of test rows whose prediction equals the label.
n_correct = (preds == y_test).sum().astype(float)
acc_knn = n_correct / len(preds) * 100

In [14]:
# Report accuracy and timings for the KNN model. The classifier is cuML's
# (cuml.neighbors.KNeighborsClassifier), so the message names cuML, not scikit-learn.
print("cuML's K Nearest Neighbors Classifier's prediction accuracy is: %3.2f" % (acc_knn))
print("Time consumed for training: %4.3f seconds" % (knn_train_time))
print("Time consumed for prediction: %6.5f seconds" % (knn_prediction_time))

Scikit-Learn's K Nearest Neighbors Classifier's prediction accuracy is: 90.30
Time consumed for training: 1.249 seconds
Time consumed for prediction: 0.64266 seconds


## Random Forest Classifier

In [15]:
# cuML random forest classifier with 100 estimators (trees).
rfc = cuml.ensemble.RandomForestClassifier(n_estimators=100)

In [16]:
# Time the fit on the training split.
training_start = time.perf_counter()
rfc.fit(X_train, y_train)
training_end = time.perf_counter()
rfc_train_time = training_end - training_start

# Time inference on the held-out split.
prediction_start = time.perf_counter()
preds = rfc.predict(X_test)
prediction_end = time.perf_counter()
rfc_prediction_time = prediction_end - prediction_start

# Accuracy as the percentage of test rows whose prediction equals the label.
n_correct = (preds == y_test).sum().astype(float)
acc_rfc = n_correct / len(preds) * 100

In [17]:
# Report accuracy and timings for the random forest. The classifier is cuML's
# (cuml.ensemble.RandomForestClassifier), so the message names cuML, not scikit-learn.
print("cuML's Random Forest Classifier's prediction accuracy is: %3.2f" % (acc_rfc))
print("Time consumed for training: %4.3f seconds" % (rfc_train_time))
print("Time consumed for prediction: %6.5f seconds" % (rfc_prediction_time))

Scikit-Learn's Random Forest Classifier's prediction accuracy is: 98.60
Time consumed for training: 0.759 seconds
Time consumed for prediction: 0.59565 seconds


## XGBoost

In [86]:
# XGBoost classifier with 100 boosting rounds, trained with the GPU histogram tree method.
# NOTE(review): `tree_method="gpu_hist"` and `use_label_encoder` are deprecated in
# XGBoost >= 2.0 (replaced by `tree_method="hist", device="cuda"`) — confirm the installed
# version before changing.
xgb = XGBClassifier(n_estimators=100, tree_method="gpu_hist", use_label_encoder=False)

In [87]:
# Time the fit on the training split.
training_start = time.perf_counter()
xgb.fit(X_train, y_train)
training_end = time.perf_counter()
xgb_train_time = training_end - training_start

# Time inference on the held-out split.
prediction_start = time.perf_counter()
preds = xgb.predict(X_test)
prediction_end = time.perf_counter()
xgb_prediction_time = prediction_end - prediction_start

# Accuracy as the percentage of test rows whose prediction equals the label.
n_correct = (preds == y_test).sum().astype(float)
acc_xgb = n_correct / len(preds) * 100



In [88]:
# Report accuracy and timings for XGBoost. The training-time line now carries the
# " seconds" unit so it matches the reports printed for the other models.
print("XGBoost's prediction accuracy is: %3.2f" % (acc_xgb))
print("Time consumed for training: %4.3f seconds" % (xgb_train_time))
print("Time consumed for prediction: %6.5f seconds" % (xgb_prediction_time))

XGBoost's prediction accuracy is: 98.70
Time consumed for training: 0.410
Time consumed for prediction: 0.00868 seconds


## Logistic Regression

In [122]:
# Create the estimator through the public `cuml.linear_model` namespace, the same way
# the other cuML models in this notebook are created (no internal-module import).
lr = cuml.linear_model.LogisticRegression()

# Time a single fit. The earlier extra, untimed `lr.fit` call was removed: it meant the
# timed call was a second (warm) fit, which no other model in the comparison received.
training_start = time.perf_counter()
lr.fit(X_train, y_train)
training_end = time.perf_counter()
lr_train_time = training_end - training_start

# Time inference on the held-out split.
prediction_start = time.perf_counter()
preds = lr.predict(X_test)
prediction_end = time.perf_counter()
lr_prediction_time = prediction_end - prediction_start

# Accuracy as the percentage of test rows whose prediction equals the label.
acc_lr = (preds == y_test).sum().astype(float) / len(preds) * 100

print("Logistic Regression's prediction accuracy is: %3.2f" % (acc_lr))
print("Time consumed for training: %4.3f seconds" % (lr_train_time))
print("Time consumed for prediction: %6.5f seconds" % (lr_prediction_time))

Logistic Regression's prediction accuracy is: 89.10
Time consumed for training: 0.073 seconds
Time consumed for prediction: 0.00308 seconds


In [123]:
# Gather each model's accuracy and timings into one comparison table.
model_names = ['KNN', 'XGBoost', 'Random Forest', 'Logistic Regression']
scores = [acc_knn, acc_xgb, acc_rfc, acc_lr]
train_times = [knn_train_time, xgb_train_time, rfc_train_time, lr_train_time]
prediction_times = [
    knn_prediction_time,
    xgb_prediction_time,
    rfc_prediction_time,
    lr_prediction_time,
]

results = cudf.DataFrame({
    'Model': model_names,
    'Score': scores,
    'Runtime Training': train_times,
    'Runtime Prediction': prediction_times,
})

# Rank by score (best first) and label the rows by model name.
result_df = results.sort_values(by='Score', ascending=False).set_index('Model')
result_df

Unnamed: 0_level_0,Score,Runtime Training,Runtime Prediction
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
XGBoost,98.7,0.410075,0.008675
Random Forest,98.6,0.759215,0.595646
KNN,90.3,1.249325,0.642659
Logistic Regression,89.1,0.073388,0.003076



<br>
<div align="center"><h2>Conclusion on GPU model classification</h2></div>


XGBoost and the Random Forest Classifier both achieve very high accuracy. As expected, there is a huge improvement in training runtime; however, prediction runtime is slightly longer.
Logistic Regression is the fastest to train, so it is worth considering when the dataset has a very large number of rows.

From this, we can see that after converting the models to run on the GPU, training runtime is reduced by a factor of several compared with the CPU. Prediction runtime, however, may not improve as much.
In conclusion, XGBoost achieves the best score on both CPU and GPU.


<br>
<div align="center"><h2>Please Restart the Kernel</h2></div>

In [66]:
# Optional: uncomment the three lines below to shut down and restart the kernel from code
# (`do_shutdown(True)` requests a restart) — presumably to release GPU memory; TODO confirm.
#import IPython
#app = IPython.Application.instance()
#app.kernel.do_shutdown(True)