# Load the data and the required modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('data/stars_train.csv')
test = pd.read_csv('data/stars_test.csv')
object_id = np.loadtxt('data/obj_ID.csv') #Loading with numpy to avoid the roundy error made by pandas

# Data cleaning

In [98]:
train

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,redshift,label
0,1.237659e+18,232.502422,45.121027,25.65923,22.36003,21.22009,20.18024,19.43329,0.547620,0
1,1.237654e+18,129.290210,48.672374,20.28340,20.09267,19.96514,19.58195,19.42377,1.673636,2
2,1.237656e+18,256.078422,35.629789,22.09653,22.06431,21.81846,21.55368,20.71617,1.141341,2
3,1.237679e+18,15.237270,11.871627,22.53424,21.76865,21.77098,21.59392,22.15125,0.000447,1
4,1.237665e+18,128.023708,25.454899,22.99427,21.43313,19.53053,18.75837,18.35530,0.453795,0
...,...,...,...,...,...,...,...,...,...,...
52290,1.237661e+18,140.043373,35.237612,25.30457,25.57575,21.71407,20.48397,19.74290,0.885847,0
52291,1.237662e+18,226.644371,44.653687,20.97036,20.30314,20.06663,20.23149,19.99547,-0.000457,1
52292,1.237668e+18,223.750703,13.223071,24.54803,23.03375,21.34179,20.19960,19.65182,0.694280,0
52293,1.237674e+18,115.159285,42.273436,22.77362,20.81738,20.32921,20.41112,20.88642,-0.000160,1


In [99]:
test

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,redshift
0,1.237671e+18,44.705691,33.993858,19.51527,18.40343,18.03881,17.87942,17.85744,-0.000176
1,1.237661e+18,132.101279,31.861286,25.98949,22.63758,21.74283,20.60178,19.82636,0.699970
2,1.237680e+18,338.396724,24.538435,24.28447,24.52921,22.09690,20.87217,19.96444,-0.000218
3,1.237679e+18,322.095785,0.677901,23.73190,22.65734,22.27631,22.68605,21.90804,0.471956
4,1.237668e+18,242.657170,12.065346,21.43320,21.30294,20.41931,19.64881,19.24359,0.647077
...,...,...,...,...,...,...,...,...,...
25753,1.237672e+18,302.926269,58.816365,19.85081,18.50319,17.92703,17.68414,17.55972,-0.000185
25754,1.237665e+18,151.798957,29.685161,22.53443,20.93578,18.99101,18.35113,17.89552,0.369260
25755,1.237658e+18,188.994240,52.102761,23.80252,21.97555,20.84109,19.98111,19.74125,0.595525
25756,1.237664e+18,11.560214,0.000378,19.87143,17.87807,16.83970,16.42124,16.07215,0.111719


In [100]:
#print the number of labels in each category
print(train['label'].value_counts())

0    31900
1    11019
2     9376
Name: label, dtype: int64


In [101]:
#count the number of nan values in each column
train.isnull().sum()

obj_ID      0
alpha       0
delta       0
u           0
g           0
r           0
i           0
z           0
redshift    0
label       0
dtype: int64

# Training a random forest

In [102]:
#split the data into a training set and a validation set
from sklearn.model_selection import train_test_split
X = train.drop('label', axis=1)
y = train['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [103]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(35037, 9) (17258, 9) (35037,) (17258,)


In [104]:
#try a multiclass classification with a random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(accuracy_score(y_test, y_pred))   

0.9760690694170819


In [105]:
#print the confusion matrix
print(confusion_matrix(y_test, y_pred))

[[10443    18   131]
 [    4  3543     0]
 [  259     1  2859]]


In [106]:
#print the number of 0 1 and 2 y_pred
print(np.unique(y_pred, return_counts=True))

(array([0, 1, 2], dtype=int64), array([10706,  3562,  2990], dtype=int64))


# Submit the result

In [107]:
#export x_pred['obj_ID'] and y_pred to a csv file
y_pred = rfc.predict(test)
df = pd.DataFrame({'obj_ID':object_id, 'label':y_pred})
df.to_csv('submission.csv', index=False)

: 