In [32]:
# Import required libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Read the csv

pulsar_stars = pd.read_csv("pulsar_stars.csv", delimiter=",")
pulsar_stars.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [4]:
# Print the columns

pulsar_stars.columns

Index([' Mean of the integrated profile',
       ' Standard deviation of the integrated profile',
       ' Excess kurtosis of the integrated profile',
       ' Skewness of the integrated profile', ' Mean of the DM-SNR curve',
       ' Standard deviation of the DM-SNR curve',
       ' Excess kurtosis of the DM-SNR curve', ' Skewness of the DM-SNR curve',
       'target_class'],
      dtype='object')

In [7]:
# Trim the column names (remove leading/trailing whitespaces).

trimmed_columns = [x.strip() for x in pulsar_stars.columns]
pulsar_stars.columns = trimmed_columns
pulsar_stars.columns

Index(['Mean of the integrated profile',
       'Standard deviation of the integrated profile',
       'Excess kurtosis of the integrated profile',
       'Skewness of the integrated profile', 'Mean of the DM-SNR curve',
       'Standard deviation of the DM-SNR curve',
       'Excess kurtosis of the DM-SNR curve', 'Skewness of the DM-SNR curve',
       'target_class'],
      dtype='object')

In [8]:
# Rename columns

pulsar_stars.columns = [
  "mean",
  "stand_deviation",
  "kurtosis",
  "skewness",
  "mean_DM.SNR",
  "stand_deviation_DM.SNR",
  "kurtosis_DM.SNR",
  "skewness_DM.SNR",
  "class",
]
pulsar_stars.head()

Unnamed: 0,mean,stand_deviation,kurtosis,skewness,mean_DM.SNR,stand_deviation_DM.SNR,kurtosis_DM.SNR,skewness_DM.SNR,class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [13]:
# Defining training variables, we're taking all the columns
# except the last two (skewness_DM.SNR and class)

pulsar_stars2 = pulsar_stars.iloc[:, :-2]
pulsar_stars2.columns

Index(['mean', 'stand_deviation', 'kurtosis', 'skewness', 'mean_DM.SNR',
       'stand_deviation_DM.SNR', 'kurtosis_DM.SNR'],
      dtype='object')

In [27]:
# Creating train and test sets

train, test, train_labels, test_labels = train_test_split(
    pulsar_stars2, 
    pulsar_stars2.iloc[:, -2:],
    train_size=0.75, 
    random_state=42
)

print(f"Train size: {len(train)}.\nTest size: {len(test)}.")

Train size: 13423.
Test size: 4475.


In [28]:
# Initializing Random Forest Regressor and training the model.

rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(train, train_labels)


RandomForestRegressor(n_estimators=1000, random_state=42)

In [30]:
# Predict the test set

predictions = rf.predict(test)

In [41]:
# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae), it shows how the average estimate is off by the real value.
print(f'Mean Absolute Error of "{errors.columns[0]}":', round(np.mean(errors.iloc[:, 0]), 2))
print(f'Mean Absolute Error of "{errors.columns[1]}":', round(np.mean(errors.iloc[:, 1]), 2))

Mean Absolute Error of "stand_deviation_DM.SNR": 0.02
Mean Absolute Error of "kurtosis_DM.SNR": 0.04


In [44]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: stand_deviation_DM.SNR    99.89
kurtosis_DM.SNR           99.88
dtype: float64 %.
