In [32]:
# Import the required libraries

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [14]:
# Load the pulsar stars CSV from the working directory into a DataFrame and
# preview its first rows.

pulsar_stars = pd.read_csv("pulsar_stars.csv", sep=",")
pulsar_stars.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [15]:
# Replace the verbose CSV headers with short names. The names are assigned by
# position, so they must follow the file's column order: four statistics of
# the integrated profile, the same four statistics of the DM-SNR curve, then
# the target label.

profile_stats = ["mean", "stand_deviation", "kurtosis", "skewness"]
dm_snr_stats = [f"{stat}_DM.SNR" for stat in profile_stats]

pulsar_stars.columns = profile_stats + dm_snr_stats + ["class"]
pulsar_stars.head()

Unnamed: 0,mean,stand_deviation,kurtosis,skewness,mean_DM.SNR,stand_deviation_DM.SNR,kurtosis_DM.SNR,skewness_DM.SNR,class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [16]:
# Separate the frame into the label we want to predict (`class`) and the
# feature matrix made of every remaining column.

y = pulsar_stars["class"]
X = pulsar_stars.drop(columns="class")

In [20]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
# NOTE(review): the split is not stratified and the positive class appears to
# be a small minority (~9% of the test rows) — consider stratify=y; TODO confirm.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

n_train, n_test = len(X_train), len(X_test)
print(f"Train size: {n_train}.\nTest size: {n_test}.")

Train size: 13423.
Test size: 4475.


In [25]:
# Fit a logistic-regression classifier on the training split. The iteration
# cap is raised far above scikit-learn's default, presumably so the solver can
# converge on these unscaled features.

log_reg = LogisticRegression(max_iter=10000).fit(X_train, y_train)
log_reg

LogisticRegression(max_iter=10000)

In [26]:
# Produce class predictions for the held-out test features.

y_pred = log_reg.predict(X=X_test)

In [29]:
# Print the test-set accuracy of the logistic-regression model as a fraction
# in [0, 1] with four decimals (not as a percentage).
# NOTE(review): the classes look heavily imbalanced, so accuracy alone can be
# flattering — read it together with the per-class report.

print(f'Model accuracy score with logistic regression : {accuracy_score(y_test, y_pred):0.4f}')

Model accuracy score with 10 decision-trees : 0.9781


In [35]:
# Get the fitted coefficients: row 0 of coef_ holds one weight per feature
# column for this binary problem.
# NOTE(review): the features were not standardized before fitting, so raw
# coefficient magnitudes depend on each feature's scale and are not directly
# comparable as "importance" — consider scaling the inputs first.
importance = log_reg.coef_[0]

# Summarize the coefficients, pairing each column name with its weight.
# A single f-string is used (instead of mixing f-string and %-formatting) so a
# column name containing '%' cannot break the formatting.
for feature, score in zip(X_train.columns, importance):
    print(f'Feature: {feature}, Score: {score:.4f}')

Feature: mean, Score: 0.0193
Feature: stand_deviation, Score: -0.0312
Feature: kurtosis, Score: 5.8458
Feature: skewness, Score: -0.5218
Feature: mean_DM.SNR, Score: -0.0327
Feature: stand_deviation_DM.SNR, Score: 0.0614
Feature: kurtosis_DM.SNR, Score: 0.0556
Feature: skewness_DM.SNR, Score: -0.0041


In [36]:
# Show per-class precision, recall, F1 and support for the test predictions.

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4070
           1       0.94      0.81      0.87       405

    accuracy                           0.98      4475
   macro avg       0.96      0.90      0.93      4475
weighted avg       0.98      0.98      0.98      4475

