In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import warnings; warnings.simplefilter('ignore')

In [2]:
# Import the data
df = pd.read_csv('C:/Users/student/Documents/pulsar_stars.csv')

In [3]:
# Assign input variables
X = df.loc[:,[' Mean of the integrated profile',' Standard deviation of the integrated profile',' Excess kurtosis of the integrated profile',' Skewness of the integrated profile',' Mean of the DM-SNR curve',' Standard deviation of the DM-SNR curve',' Excess kurtosis of the DM-SNR curve',' Skewness of the DM-SNR curve']]

In [4]:
# Assign target variable
y = df['target_class']

In [5]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [6]:
# Create a decision tree and train
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 1.0
Testing Accuracy: 0.9684357541899441


In [7]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'max_depth': range(1,10), 'criterion':['gini', 'entropy']}

# Create a list of trees
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(DecisionTreeClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'criterion': 'gini', 'max_depth': 5}
Training Accuracy: 0.9827489872887275
Testing Accuracy: 0.9762569832402235


In [8]:
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.9793267216091633
Testing Accuracy: 0.9773743016759776


In [11]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_estimators': np.arange(5,20, 2), 'learning_rate':np.linspace(0.001,1,5)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(AdaBoostClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'learning_rate': 0.25075, 'n_estimators': 17}
Training Accuracy: 0.9786283000419053
Testing Accuracy: 0.9773743016759776


In [12]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.9866601480653723
Testing Accuracy: 0.9779329608938547


In [13]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_estimators': np.arange(5,20, 2), 'learning_rate':np.linspace(0.001,1,5)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(GradientBoostingClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'learning_rate': 0.25075, 'n_estimators': 17}
Training Accuracy: 0.9831680402290823
Testing Accuracy: 0.9782122905027933


In [14]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.9976253666713228
Testing Accuracy: 0.9776536312849162


In [15]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_estimators': np.arange(2,20, 2), 'max_features':np.arange(2,5)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(RandomForestClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'max_features': 4, 'n_estimators': 16}
Training Accuracy: 0.9976253666713228
Testing Accuracy: 0.9798882681564246


In [16]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.9786283000419053
Testing Accuracy: 0.9720670391061452


In [17]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_neighbors': np.arange(2,20, 2)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(KNeighborsClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'n_neighbors': 14}
Training Accuracy: 0.9745774549518089
Testing Accuracy: 0.973463687150838


In [18]:
from sklearn.linear_model import SGDClassifier
model = SGDClassifier(loss='log', penalty='elasticnet', alpha=1, l1_ratio=.1)

model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.9629836569353262
Testing Accuracy: 0.9631284916201117


In [19]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'alpha':np.linspace(0.1,10, 10), 'l1_ratio':np.linspace(0,1,10)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(SGDClassifier(loss='log', penalty='elasticnet'), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'alpha': 0.1, 'l1_ratio': 0.0}
Training Accuracy: 0.9724821902500349
Testing Accuracy: 0.9723463687150838


In [20]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.9793267216091633
Testing Accuracy: 0.9762569832402235


In [21]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score


# Create a decision tree and train
model = DecisionTreeRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 1.0
Rsquared on Testing 0.6200170007804986


In [22]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'max_depth': np.arange(1,100,5)}

# Create a list of trees
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(DecisionTreeRegressor(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Print the Rsquared on training and testing
print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'max_depth': 6}
Rsquared on Training 0.8357202915262836
Rsquared on Testing 0.7692993724210477


In [23]:
from sklearn.ensemble import AdaBoostRegressor

# Create a decision tree and train
model = AdaBoostRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.7795268406974502
Rsquared on Testing 0.7488971534890441


In [25]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_estimators': np.arange(5,20, 2), 'learning_rate':np.linspace(0.001,1,5)}

# Create a list of trees
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(AdaBoostRegressor(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Print the Rsquared on training and testing
print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'learning_rate': 0.001, 'n_estimators': 9}
Rsquared on Training 0.7894852446184798
Rsquared on Testing 0.7694433623137225


In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score


# Create a decision tree and train
model = RandomForestRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.961025792903358
Rsquared on Testing 0.7761703591666506


In [28]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_estimators': np.arange(2,20, 2), 'max_features':np.arange(2,5)}

# Create a list of trees
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(RandomForestRegressor(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Print the Rsquared on training and testing
print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'max_features': 4, 'n_estimators': 16}
Rsquared on Training 0.9669144582782493
Rsquared on Testing 0.7865410697407151


In [29]:
from sklearn.ensemble import GradientBoostingRegressor

# Create a decision tree and train
model = GradientBoostingRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.8423932960801697
Rsquared on Testing 0.7872467855172145


In [30]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_estimators': np.arange(5,20, 2), 'learning_rate':np.linspace(0.001,1,5)}

# Create a list of trees
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(GradientBoostingRegressor(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Print the Rsquared on training and testing
print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'learning_rate': 0.25075, 'n_estimators': 19}
Rsquared on Training 0.8240522471189999
Rsquared on Testing 0.7862971311796101


In [31]:
from sklearn.linear_model import ElasticNet
model = ElasticNet()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.5270632431630402
Rsquared on Testing 0.5408410903843017


In [32]:
param_grid = {'alpha':np.linspace(0,10, 10), 'l1_ratio':np.linspace(0,1,10)}
model = GridSearchCV(ElasticNet(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Print the Rsquared on training and testing
print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'alpha': 0.0, 'l1_ratio': 0.0}
Rsquared on Training 0.6793396474544517
Rsquared on Testing 0.697307835296276


In [33]:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor()
model.fit(x_train, y_train)

print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Rsquared on Training 0.8119978200055324
Rsquared on Testing 0.7030891461271067


In [34]:
param_grid = {'n_neighbors':np.arange(1,10)}
model = GridSearchCV(KNeighborsRegressor(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Print the Rsquared on training and testing
print('Rsquared on Training', r2_score(y_train, model.predict(x_train)))
print('Rsquared on Testing', r2_score(y_test, model.predict(x_test)))

Best parameters are: {'n_neighbors': 9}
Rsquared on Training 0.7819276848387421
Rsquared on Testing 0.7287626675430826
