### refining 3/5 timestep classifiers
#### rows 0-2, ~27.4 years

In [25]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix 

In [46]:
planets = pd.read_csv('3ts.csv', skipinitialspace=True)
print(planets.columns)

Index(['run', 'planet', 'initial e', 'final e', 'mean e', 'min e', 'max e',
       'sd e', 'initial pericenter', 'final pericenter', 'mean pericenter',
       'min pericenter', 'max pericenter', 'sd pericenter', 'initial jacobi',
       'final jacobi', 'mean jacobi', 'min jacobi', 'max jacobi', 'sd jacobi',
       'initial mhr1', 'final mhr1', 'mean mhr1', 'min mhr1', 'max mhr1',
       'sd mhr1', 'initial mhr2', 'final mhr2', 'mean mhr2', 'min mhr2',
       'max mhr2', 'sd mhr2', 'initial a1_ratio', 'final a1_ratio',
       'mean a1_ratio', 'min a1_ratio', 'max a1_ratio', 'sd a1_ratio',
       'initial a2_ratio', 'final a2_ratio', 'mean a2_ratio', 'min a2_ratio',
       'max a2_ratio', 'sd a2_ratio', 'initial mass1_ratio',
       'final mass1_ratio', 'mean mass1_ratio', 'min mass1_ratio',
       'max mass1_ratio', 'sd mass1_ratio', 'initial mass2_ratio',
       'final mass2_ratio', 'mean mass2_ratio', 'min mass2_ratio',
       'max mass2_ratio', 'sd mass2_ratio', 'end time', 'fate'],


###### started at ~84% accurate

In [42]:
fates = list(set(planets['fate']))

types = {fates[i] : i for i in range(len(fates))}

classes = []

for i in range(len(planets['fate'])):
    if planets['fate'][i] == "remaining":
        classes += [0]
    else:
        classes += [1]
        
classes = np.array(classes)

###### Training

In [47]:
features_train, features_test, classes_train, classes_test = train_test_split(planets, classes, test_size=0.3, random_state=7)

ids_train = features_train['planet'].to_numpy()
features_train.drop(['run', 'planet', 'fate', 'end time'], axis=1, inplace=True)

cols = features_train.columns
features_train = features_train.to_numpy()

ids_test = features_test['planet'].to_numpy()
features_test.drop(['run', 'planet', 'fate', 'end time'], axis=1, inplace=True)
features_test = features_test.to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [13]:
classifier = GradientBoostingClassifier(random_state=7)
classifier.fit(features_train, classes_train)

GradientBoostingClassifier(random_state=7)

In [48]:
classes_predict = classifier.predict( features_test )
print('Classifier is ', accuracy_score(classes_test, classes_predict) * 100, '% accurate on testing set' )

Classifier is  82.66666666666667 % accurate on testing set


###### Refining

In [22]:
param_grid = {#'loss': ['deviance’,‘exponential'],
              'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
              'n_estimators': [1, 5, 10, 25, 50, 75, 100, 125, 150],
              'max_features': ['auto','sqrt','log2']}
              #'criterion': ['friedman_mse','mse','mae']}  
  
grid = GridSearchCV(GradientBoostingClassifier(), param_grid, refit = True, verbose = 1, n_jobs=-1) 
  
# fitting the model for grid search 
grid.fit(features_train, classes_train)

# print best parameter after tuning 
print(grid.best_params_) 
grid_predictions = grid.predict(features_train)
  
# print classification report 
print(classification_report(classes_train, grid_predictions)) 

Fitting 5 folds for each of 270 candidates, totalling 1350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:   31.8s
[Parallel(n_jobs=-1)]: Done 638 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1116 tasks      | elapsed:  2.0min


{'learning_rate': 0.1, 'max_features': 'sqrt', 'n_estimators': 75}
              precision    recall  f1-score   support

           0       0.95      0.89      0.92       232
           1       0.95      0.98      0.96       468

    accuracy                           0.95       700
   macro avg       0.95      0.94      0.94       700
weighted avg       0.95      0.95      0.95       700



[Parallel(n_jobs=-1)]: Done 1350 out of 1350 | elapsed:  2.4min finished


In [26]:
param_grid = {'learning_rate': [0.02, 0.05, 0.07, 0.1, 0.12, 0.15, 0.17, 0.2],
              'max_depth': [4, 5, 6, 7, 8],
              'n_estimators': [50, 55, 60, 65, 70, 75, 80, 85, 90],
              'max_features': ['auto','sqrt','log2']}  
  
grid = GridSearchCV(GradientBoostingClassifier(), param_grid, refit = True, verbose = 1, n_jobs=-1) 
  
# fitting the model for grid search 
grid.fit(features_train, classes_train)

# print best parameter after tuning 
print(grid.best_params_) 
grid_predictions = grid.predict(features_train)
  
# print classification report 
print(classification_report(classes_train, grid_predictions)) 

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   57.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 16.6min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 20.9min


{'learning_rate': 0.12, 'max_depth': 4, 'max_features': 'log2', 'n_estimators': 55}
              precision    recall  f1-score   support

           0       0.99      0.94      0.96       232
           1       0.97      0.99      0.98       468

    accuracy                           0.98       700
   macro avg       0.98      0.97      0.97       700
weighted avg       0.98      0.98      0.98       700



[Parallel(n_jobs=-1)]: Done 5400 out of 5400 | elapsed: 23.2min finished


In [70]:
classifier = GradientBoostingClassifier(learning_rate = 0.12, max_depth = 4, max_features = 'log2', n_estimators = 55, random_state=7)
classifier.fit(features_train, classes_train)

GradientBoostingClassifier(learning_rate=0.12, max_depth=4, max_features='log2',
                           n_estimators=55, random_state=7)

In [72]:
classes_predict = classifier.predict(features_test)
print('Classifier is ', accuracy_score(classes_test, classes_predict) * 100, '% accurate on testing set' )

Classifier is  85.66666666666667 % accurate on testing set




###### Feature importance (from old file)

In [None]:
feats=classifier.feature_importances_
inds=np.argsort(feats)[::-1]

for i in range(len(inds)):
    print(cols[inds[i]], feats[inds[i]])

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pylab


f=plt.figure(figsize=(15,7))

for i in range(len(inds)):
    plt.bar(cols[inds[i]], feats[inds[i]] * 100)
    

 
plt.ylabel('% Importance', size=16)
plt.xlabel('Feature', size=16)
plt.title('Feature importance', size=20)
plt.xticks(rotation = 90)
plt.show()

In [None]:
stable_x = []
unstable_x = []
misclassified_x = []
stable_y = []
unstable_y = []
misclassified_y = []
    
    
f=plt.figure(figsize=(15,7))

for i in range(len(features_test)):
    if (classes_test[i] == 0 and classes_predict[i] == 0):
        stable_x.append(features_test[i][5])
        stable_y.append(features_test[i][39])
    elif (classes_test[i] == 1 and classes_predict[i] == 1):
        unstable_x.append(features_test[i][5])
        unstable_y.append(features_test[i][39])
    else:
        misclassified_x.append(features_test[i][5])
        misclassified_y.append(features_test[i][39])

        
plt.scatter(stable_x, stable_y, color = 'thistle')
plt.scatter(misclassified_x, misclassified_y, color = 'firebrick', zorder = 2)
plt.scatter(unstable_x, unstable_y, color = 'lightsteelblue')

        
plt.ylabel('min a2 ratio (2)')
plt.xlabel('standard deviation e (1)')
plt.title('sd e vs min a2')