# DS3 Datathon 2024 - Celestial Bodies

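This notebook trains a scikit-learn pipeline (scaling, feature selection and a random forest) on the DS3 Datathon celestial-labelling training data and writes the predicted class of each test object to a CSV file for submission.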

### Imports

In [None]:
# Numbers
import pandas as pd
import numpy as np

# Graphs
import seaborn as sns
import matplotlib.pyplot as plt

# ML
import sklearn
import sklearn.model_selection
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from xgboost import XGBClassifier
from xgboost import plot_importance

from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
import random

# Pipeline
from sklearn.model_selection import train_test_split # For train/test splits
from sklearn.feature_selection import VarianceThreshold # Feature selector
from sklearn.pipeline import Pipeline # For setting up pipeline

# Various pre-processing steps
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV # For optimization

from sklearn.metrics import balanced_accuracy_score

Setting final/development mode

In [None]:
# True  -> train on the full training set; the hold-out evaluation and grid search are skipped.
# False -> development mode: part of the training data is held out for evaluation.
generating_final_result = True

Fixing seeds
[Source](https://sklearn-genetic-opt.readthedocs.io/en/stable/tutorials/reproducibility.html)

In [None]:
# One seed shared by Python's and NumPy's global random number generators.
random_seed = 5643
random.seed(random_seed)
np.random.seed(random_seed)

## Loading Data
[Instruction for getting API key](https://www.kaggle.com/discussions/general/74235)

Upload your kaggle.json file with API key for authentication to download the file.

In [None]:
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
! kaggle competitions download -c ds3-datathon-celestial-labelling
! unzip "ds3-datathon-celestial-labelling.zip" -d space
! rm "ds3-datathon-celestial-labelling.zip"

Downloading ds3-datathon-celestial-labelling.zip to /content
  0% 0.00/6.40M [00:00<?, ?B/s] 78% 5.00M/6.40M [00:00<00:00, 50.8MB/s]
100% 6.40M/6.40M [00:00<00:00, 49.9MB/s]
Archive:  ds3-datathon-celestial-labelling.zip
  inflating: space/celestial_sample_submission.csv  
  inflating: space/celestial_test.csv  
  inflating: space/celestial_train.csv  


Loading training data

In [None]:
# The archive is extracted into ./space relative to the working directory, so read it
# from there instead of assuming the working directory is /content.
data_train = pd.read_csv("space/celestial_train.csv")

Separating X values for data_train

In [None]:
# Boolean mask that is True for every column except the target.
is_feature_column = data_train.columns != 'class'
data_trainX = data_train.loc[:, is_feature_column]

Separating Y values for data_train

In [None]:
# Target column: the class label of every training row.
data_trainY = data_train.loc[:, "class"]

## Data Cleaning
[Source](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html)

Label encoding (each class name is mapped to an integer)

In [None]:
# Encode the class names as integers. Fitting on the raw 'class' column rather than on
# data_trainY keeps this cell safe to re-run: data_trainY is overwritten below, and
# re-fitting on the already-encoded integers would make le.inverse_transform return
# integers instead of class names.
le = LabelEncoder()
data_trainY = le.fit_transform(data_train["class"])
data_trainY

array([0, 0, 0, ..., 0, 0, 0])

### Dropping insignificant columns (Determined by feature importance)

In [None]:
# Columns excluded from the feature matrix.
insignificant_columns = [
    'id', 'fiber_ID', 'cam_col', 'rerun_ID',
    'alpha', 'delta', 'run_ID', 'field_ID',
]
data_trainX = data_trainX.drop(columns=insignificant_columns)

# Pipeline

In [None]:
if generating_final_result:
  # Final run: fit on every labelled row; no hold-out set is created.
  X_train, y_train = data_trainX, data_trainY
else:
  # Development run: keep one third of the rows aside for evaluation.
  X_train, X_test, y_train, y_test = train_test_split(
      data_trainX, data_trainY, test_size=1/3, random_state=0)

  print(X_train.shape)
  print(X_test.shape)

(33333, 9)
(16667, 9)


In [None]:
# Three stages: scale the features, select features with an L1-penalised LinearSVC,
# then classify with a random forest.
feature_selector = SelectFromModel(LinearSVC(C=0.1, penalty="l1", dual=False))
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('selector', feature_selector),
    ('classifier', RandomForestClassifier()),
])

pipe.fit(X_train, y_train)
print('Training set score: ' + str(pipe.score(X_train,y_train)))

# The hold-out metrics only exist in development mode, where X_test / y_test are defined.
if not generating_final_result:
  y_pred = pipe.predict(X_test)
  print('Test set score: ' + str(pipe.score(X_test,y_test)))
  print()
  print("Accuracy on test data:", accuracy_score(y_test, y_pred))
  print("Ballanced accuracy on test data:", balanced_accuracy_score(y_test, y_pred))



Training set score: 1.0
Test set score: 0.9788204235915282

Accuracy on test data: 0.9788204235915282
Ballanced accuracy on test data: 0.9716476991795141


# Optimization

In [None]:
# Grid-search space: alternatives for the 'scaler' step plus two random-forest settings.
parameters = {
    'scaler': [StandardScaler(), MinMaxScaler(), Normalizer(), MaxAbsScaler()],
    'classifier__max_depth': [2, 4, 6],
    'classifier__min_samples_leaf': list(range(1, 10)),
}

In [None]:
if not generating_final_result:
  # 2-fold cross-validated search over `parameters`, fitted on the training split.
  grid = GridSearchCV(pipe, parameters, cv=2)
  grid.fit(X_train, y_train)

  y_pred = grid.predict(X_test)

  train_score = grid.score(X_train, y_train)
  test_score = grid.score(X_test, y_test)
  print('Training set score: ' + str(train_score))
  print('Test set score: ' + str(test_score))
  print()
  print("Accuracy on test data:", accuracy_score(y_test, y_pred))
  print("Ballanced accuracy on test data:", balanced_accuracy_score(y_test, y_pred))

8 fits failed out of a total of 216.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 345, in fit
    X, y = self._validate_data(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 584, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "

Training set score: 0.971229712297123
Test set score: 0.9698206035879282

Accuracy on test data: 0.9698206035879282
Ballanced accuracy on test data: 0.9600508025367246


In [None]:
# The grid search only runs in development mode, so `grid` exists only then;
# without this guard the cell raises NameError when generating the final result.
if not generating_final_result:
  # Access the best set of parameters
  best_params = grid.best_params_
  print(best_params)
  # Stores the optimum model in best_pipe
  best_pipe = grid.best_estimator_
  print(best_pipe)

{'classifier__max_depth': 6, 'classifier__min_samples_leaf': 2, 'scaler': MaxAbsScaler()}
Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('selector',
                 SelectFromModel(estimator=LinearSVC(C=0.1, dual=False,
                                                     penalty='l1'))),
                ('classifier',
                 RandomForestClassifier(max_depth=6, min_samples_leaf=2))])


## Creating data for submission

Loading, standardizing and preparing results

### Loading test data

In [None]:
# The archive is extracted into ./space relative to the working directory, so read it
# from there instead of assuming the working directory is /content.
data_test = pd.read_csv("space/celestial_test.csv")

### Cleaning test data

Dropping columns determined by feature importance

In [None]:
# Keep the ids for the submission file; 'id' is not one of the model's features.
ids = data_test['id']
# Restrict the test frame to exactly the columns (and column order) used for training;
# passing the raw frame raises "feature names should match those passed during fit".
data_testX = data_test[data_trainX.columns]
y_pred = pipe.predict(data_testX)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- alpha
- cam_col
- delta
- fiber_ID
- field_ID
- ...


In [None]:
# Map the integer predictions back to the original class labels using the fitted encoder.
y_pred = le.inverse_transform(y_pred)

### Saving predictions to CSV

In [None]:
# Two-column submission table: the test id and its predicted class label.
submission_data = {'id': ids, 'output': y_pred}
return_dataset = pd.DataFrame(submission_data, columns=['id', 'output'])
return_dataset

Unnamed: 0,id,output
0,50000,QSO
1,50001,GALAXY
2,50002,QSO
3,50003,GALAXY
4,50004,STAR
...,...,...
49995,99995,GALAXY
49996,99996,STAR
49997,99997,QSO
49998,99998,GALAXY


In [None]:
# Write the submission file without the DataFrame index column.
return_dataset.to_csv("celestial_solutions.csv", index=False)