In [3]:
# Step 1: Setup and Download Dataset

# Install necessary packages
!pip install kaggle tsfresh xgboost imbalanced-learn shap





In [1]:
import os
import json
import pandas as pd
import numpy as np



In [3]:
# Kaggle API credentials
#api_token = {"username":"pearlnarang","key":"91d31420bce9fea48964d3e3bfb2b028"}

# Create Kaggle directory if it doesn't exist
#+os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)

# Save API token to kaggle.json
#with open(os.path.expanduser("~/.kaggle/kaggle.json"), "w") as file:
 #   json.dump(api_token, file)

# Set permissions for the kaggle.json file
#os.chmod(os.path.expanduser("~/.kaggle/kaggle.json"), 0o600)



In [5]:
#import os
#import pandas as pd

# Set the directory containing your files
#directory = 'Desktop/My Practice/PLAsTiCC-2018.zip'

# Loop through all files in the directory
#for filename in os.listdir(directory):
 #   if filename.endswith(".csv"):  # Check if the file is a .csv file
  #      file_path = os.path.join(directory, filename)  # Full path to the file
        
        # Read the CSV file into a DataFrame
   #     df = pd.read_csv(file_path)
        
    #    print(f"Loaded: {filename}")  # Optional: Print which file was loaded
        # Do something with the DataFrame, e.g., store it, process it, etc.


In [7]:
# Download the PLAsTiCC dataset
#!kaggle competitions download -c PLAsTiCC-2018
#!unzip -o '*.zip' -d ./

In [9]:
# Step 2: Load and Preprocess the Dataset

# Load the dataset
train_data = pd.read_csv('training_set.csv')
train_metadata = pd.read_csv('training_set_metadata.csv')

# Merge light curves with their metadata
merged_data = pd.merge(train_data, train_metadata, on='object_id')



In [10]:
# Feature Engineering using tsfresh
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters

# Preserve the 'object_id' before extraction
object_ids = merged_data[['object_id']].drop_duplicates().reset_index(drop=True)

# Extract features automatically
extracted_features = extract_features(merged_data, column_id="object_id", column_sort="mjd", 
                                      column_value="flux", default_fc_parameters=ComprehensiveFCParameters())

# Reset index and manually add 'object_id'
extracted_features.reset_index(drop=True, inplace=True)
extracted_features = pd.concat([object_ids, extracted_features], axis=1)

# Ensure that 'object_id' is now included
print("First few rows of extracted_features:")
print(extracted_features.head())
print("Columns in extracted_features after adding object_id:")
print(extracted_features.columns)

# Ensure 'object_id' is present in both DataFrames before merging
print("Columns in train_metadata before merge:")
print(train_metadata.columns)

# Merge with metadata for labels
try:
    features = pd.merge(extracted_features, train_metadata[['object_id', 'target']], on='object_id')
    print("Merge successful. Columns in features:")
    print(features.columns)
except KeyError as e:
    print(f"KeyError: {e}. Check if 'object_id' exists in both DataFrames.")
    features = None  # Set features to None to avoid proceeding with errors

Feature Extraction: 100%|██████████| 40/40 [16:27<00:00, 24.70s/it]  


First few rows of extracted_features:
   object_id  flux__variance_larger_than_standard_deviation  \
0        615                                            1.0   
1        713                                            1.0   
2        730                                            1.0   
3        745                                            1.0   
4       1124                                            1.0   

   flux__has_duplicate_max  flux__has_duplicate_min  flux__has_duplicate  \
0                      0.0                      0.0                  0.0   
1                      0.0                      0.0                  0.0   
2                      0.0                      0.0                  0.0   
3                      0.0                      0.0                  0.0   
4                      0.0                      0.0                  0.0   

   flux__sum_values  flux__abs_energy  flux__mean_abs_change  \
0     -43330.143249      5.985203e+07             202.114067  

In [13]:
# Step 3: Model Selection using XGBoost
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [19]:
# Proceed only if merge was successful
if features is not None:
   # Split data into training and testing sets
    X = features.drop(['target', 'object_id'], axis=1)
    y = features['target']

    # Create a mapping from original labels to consecutive integers
    unique_labels = np.unique(y)
    label_mapping = {label: idx for idx, label in enumerate(sorted(unique_labels))}
    
    # Apply the mapping to encode the labels
    y_encoded = y.map(label_mapping)

    # Verify the classes after encoding
    print(f"Classes after encoding: {sorted(label_mapping.keys())}")
    print(f"Encoded classes: {sorted(label_mapping.values())}")


    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # Initialize XGBoost model
    xgb_model = xgb.XGBClassifier(objective="multi:softprob", eval_metric="mlogloss")

    # Train the model
    xgb_model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = xgb_model.predict(X_test)

    # Decode the predictions back to original labels
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}
    y_pred_decoded = np.array([reverse_label_mapping[p] for p in y_pred])
    y_test_decoded = np.array([reverse_label_mapping[t] for t in y_test])

    # Evaluate the model
    print(f"Accuracy: {accuracy_score(y_test_decoded, y_pred_decoded)}")
    print(classification_report(y_test_decoded, y_pred_decoded, zero_division=0))

else:
    print("Skipping model training due to previous errors.")

# Step 4: Hyperparameter Tuning using GridSearchCV

from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300]
}



Classes after encoding: [6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95]
Encoded classes: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
Accuracy: 0.6898089171974522
              precision    recall  f1-score   support

           6       0.75      0.56      0.64        32
          15       0.54      0.29      0.38        97
          16       0.97      0.96      0.97       198
          42       0.48      0.42      0.45       222
          52       0.00      0.00      0.00        46
          53       1.00      0.91      0.95        11
          62       0.64      0.24      0.34        89
          64       0.50      0.22      0.31        18
          65       0.89      0.95      0.92       195
          67       0.50      0.03      0.06        32
          88       0.97      0.92      0.94        73
          90       0.57      0.87      0.69       471
          92       0.98      1.00      0.99        43
          95       0.91      0.23      0.37        43

    accuracy   