In [2]:
# Import required libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the pulsar candidates table from the local CSV file and preview
# the first five rows.

pulsar_stars = pd.read_csv("pulsar_stars.csv", sep=",")
pulsar_stars.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [4]:
# Inspect the raw column names as loaded from the CSV. The output shows that
# most headers carry a leading space, which the next cell strips.

pulsar_stars.columns

Index([' Mean of the integrated profile',
       ' Standard deviation of the integrated profile',
       ' Excess kurtosis of the integrated profile',
       ' Skewness of the integrated profile', ' Mean of the DM-SNR curve',
       ' Standard deviation of the DM-SNR curve',
       ' Excess kurtosis of the DM-SNR curve', ' Skewness of the DM-SNR curve',
       'target_class'],
      dtype='object')

In [5]:
# Strip leading/trailing whitespace from every column name, then display
# the cleaned header index to confirm the result.

trimmed_columns = list(map(str.strip, pulsar_stars.columns))
pulsar_stars.columns = trimmed_columns
pulsar_stars.columns

Index(['Mean of the integrated profile',
       'Standard deviation of the integrated profile',
       'Excess kurtosis of the integrated profile',
       'Skewness of the integrated profile', 'Mean of the DM-SNR curve',
       'Standard deviation of the DM-SNR curve',
       'Excess kurtosis of the DM-SNR curve', 'Skewness of the DM-SNR curve',
       'target_class'],
      dtype='object')

In [6]:
# Replace the verbose headers with short names. The assignment is positional,
# so the order must match the frame: four integrated-profile statistics, the
# same four statistics for the DM-SNR curve, then the label column.

short_names = [
    "mean", "stand_deviation", "kurtosis", "skewness",
    "mean_DM.SNR", "stand_deviation_DM.SNR", "kurtosis_DM.SNR", "skewness_DM.SNR",
    "class",
]
pulsar_stars.columns = short_names
pulsar_stars.head()

Unnamed: 0,mean,stand_deviation,kurtosis,skewness,mean_DM.SNR,stand_deviation_DM.SNR,kurtosis_DM.SNR,skewness_DM.SNR,class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [7]:
# Separate the label from the predictors: "class" is the target we want to
# predict, and every other column is used as a feature.

y = pulsar_stars["class"]
X = pulsar_stars.drop(columns=["class"])

In [8]:
# Hold out 25% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

print(f"Train size: {len(X_train)}.\nTest size: {len(X_test)}.")

Train size: 13423.
Test size: 4475.


In [9]:
# Build a random forest classifier (1000 trees, fixed seed) and fit it on
# the training split. The fit call stays on its own line so the cell still
# displays the fitted estimator.

rf = RandomForestClassifier(
    n_estimators=1000,
    random_state=42,
)
rf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=42)

In [10]:
# Predict class labels for the held-out test set with the fitted forest.

y_predictions = rf.predict(X_test)

In [11]:
# Print the accuracy score on the test set (a fraction between 0 and 1,
# not a percentage).

test_accuracy = accuracy_score(y_test, y_predictions)

# Take the tree count from the fitted model instead of hard-coding it, so the
# message cannot drift from the n_estimators the forest was built with.
print(f"Model accuracy score with {rf.n_estimators} decision-trees : {test_accuracy:0.4f}")

Model accuracy score with 10 decision-trees : 0.9799


In [12]:
# Rank the features by the importance the fitted forest assigns to them,
# most important first.

importances = pd.Series(rf.feature_importances_, index=X_train.columns)
feature_scores = importances.sort_values(ascending=False)
feature_scores

In [15]:
# Print the per-class precision, recall, F1-score and support on the test set.

print(classification_report(y_test, y_predictions))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4070
           1       0.94      0.83      0.88       405

    accuracy                           0.98      4475
   macro avg       0.96      0.91      0.94      4475
weighted avg       0.98      0.98      0.98      4475



In [16]:
# Export the classification report to a CSV file.
# NOTE: despite the "conf_matrix" file name, the file holds the classification
# report table (one row per class/average), not a confusion matrix.
report = classification_report(y_test, y_predictions, output_dict=True)
# Transpose so each class/average becomes a row. The scalar "accuracy" entry is
# repeated across every column of its row, including "support".
df = pd.DataFrame(report).T
print(df)
df.to_csv(path_or_buf='rf_conf_matrix_python.csv')

              precision    recall  f1-score      support
0              0.983479  0.994595  0.989006  4070.000000
1              0.938719  0.832099  0.882199   405.000000
accuracy       0.979888  0.979888  0.979888     0.979888
macro avg      0.961099  0.913347  0.935602  4475.000000
weighted avg   0.979428  0.979888  0.979339  4475.000000
