The **Random Forest** algorithm has been chosen to solve this classification problem:

In [1]:
# Libraries
#-- data treatment
import pandas as pd

#-- Random Forest classifier  (sklearn)
from sklearn.ensemble import RandomForestClassifier

#-- model goodness
from sklearn.metrics import confusion_matrix


In [2]:
# Load the training set; the first CSV column is used as the row index
data = pd.read_csv(filepath_or_buffer='space_X_train.csv', index_col=[0])
data

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,target
0,0.474335,0.881640,-2.050543,-1.251935,-1.035151,-1.934367,4
1,-1.034675,1.741801,-1.660629,-1.555989,-0.337553,-2.473838,0
2,-0.563221,-0.688381,-0.318415,-1.845172,0.352366,-0.912928,1
3,-1.268179,2.770688,1.054193,2.830389,0.395093,0.677715,3
4,-1.216380,-0.391267,-2.898931,-0.913074,-2.171857,-2.367490,0
...,...,...,...,...,...,...,...
2095,-0.947002,0.174969,-0.180249,0.118236,-0.143420,-0.359509,0
2096,-1.224148,0.029956,0.536684,0.888406,0.119460,0.378946,1
2097,-0.847593,1.538574,0.962409,1.966467,0.379934,0.737907,3
2098,1.472437,1.187695,1.015766,-1.103522,1.586722,0.182569,4


In [3]:
# True if at least one cell anywhere in the frame is missing
data.isnull().values.any()

False

In [4]:
# Share of rows belonging to each target class (similar shares => balanced classes)
data['target'].value_counts().div(len(data))

0    0.203333
3    0.202857
1    0.201429
2    0.199524
4    0.192857
Name: target, dtype: float64

There are no null values and the classes are balanced (roughly equally represented). Great!<br>

In [5]:
# Split the frame into the feature matrix X (the six sensor columns)
# and the label vector y (the target column), both as NumPy arrays
X = data.loc[:, ["sensor_1", "sensor_2", "sensor_3", "sensor_4", "sensor_5", "sensor_6"]].values
y = data.loc[:, "target"].values

### RF Algorithm

Hyperparameters: number of trees (`n_estimators`) and `max_features` (the square root or log2 of the number of features is usually a good choice)

In [6]:
# Random forest configuration.
# oob_score=True (default False) makes scikit-learn use the out-of-bag samples
# to estimate how well the model generalizes, so no separate validation split
# is needed for a first estimate.
rf = RandomForestClassifier(criterion='gini',
                            # different n_estimators have been tested
                            n_estimators=100,
                            min_samples_split=10,
                            min_samples_leaf=1,
                            max_features='log2',
                            oob_score=True,
                            random_state=1,  # fixed seed so the fit is reproducible
                            n_jobs=-1)
rf.fit(X, y)

# Out-of-bag score computed during fitting, shown with four decimals
print(f"{rf.oob_score_:.4f}")

0.7786


In [7]:
# Compare the forest's predictions with the true labels on X, i.e. on the very
# rows the forest was fitted on. This is therefore a *training-set* accuracy
# and is optimistic; the out-of-bag score printed after fitting is the figure
# to use as an estimate of performance on unseen data.
resultados = pd.DataFrame({"real": data["target"], "pred": rf.predict(X)})
n_correct = int((resultados["real"] == resultados["pred"]).sum())
print(f'Training accuracy: {n_correct / len(resultados)}')

Accuracy: 0.9304761904761905


In [8]:
# Confusion matrix of true labels vs. the predictions stored in resultados
# (rows: true class, columns: predicted class)
confusion_matrix(y_true=data["target"], y_pred=resultados["pred"])

array([[400,  10,   9,   6,   2],
       [  2, 400,  11,  10,   0],
       [  5,  10, 383,   7,  14],
       [  4,  11,   9, 388,  14],
       [  6,   1,   7,   8, 383]])

In [9]:
# Load the data to predict on (first CSV column as row index) and use every
# remaining column, in file order, as the feature matrix
test = pd.read_csv(filepath_or_buffer='space_X_test.csv', index_col=[0])
X_test = test.values

In [10]:
# Predict a class for every test row and store it in the "final_status" column
predictions = pd.DataFrame({"final_status": rf.predict(X_test)})

In [11]:
# Save the predictions to disk, leaving the row index out of the file
predictions.to_csv(path_or_buf="predictions.csv", index=False)