In [1]:
import pandas as pd

PULSAR = '/kaggle/input/pulsar-classification-for-class-prediction-cleaned/Pulsar_cleaned.csv'

df = pd.read_csv(filepath_or_buffer=PULSAR)
df.head()

Unnamed: 0,Mean_Integrated,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class
0,140.5625,1.502969,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,0.788423,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,0.323558,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,0.958983,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,1.232198,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [2]:
import warnings
from plotly import express

warnings.filterwarnings(action='ignore', category=FutureWarning)

COLUMNS = ['Mean_Integrated', 'EK', 'Skewness', 'Mean_DMSNR_Curve','SD_DMSNR_Curve', 'EK_DMSNR_Curve', 'Skewness_DMSNR_Curve',]
for column in COLUMNS:
    express.histogram(data_frame=df, x=column, color='Class', facet_col='Class').show()

Our classes are unbalanced, obviously, and some of our variables are distributed differently depending on the class. This is probably a tough classification problem.

In [3]:
df['Class'].value_counts(normalize=True)

Class
0    0.979449
1    0.020551
Name: proportion, dtype: float64

In [4]:
import arrow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

TARGET = 'Class'
X_train, X_test, y_train, y_test = train_test_split(df[COLUMNS], df[TARGET], test_size=0.20, random_state=2024, stratify=df[TARGET])

time_start = arrow.now()
regression = LogisticRegression(max_iter=1000, tol=1e-12).fit(X=X_train, y=y_train)
print('model fit in {} iterations took {}'.format(regression.n_iter_[0], arrow.now() - time_start))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=regression.predict(X=X_test))))
print('model done in {}'.format(arrow.now() - time_start))

model fit in 133 iterations took 0:00:00.164984
accuracy: 0.9833
model done in 0:00:00.169964


In [5]:
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=regression.predict(X=X_test)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2936
           1       0.88      0.23      0.36        62

    accuracy                           0.98      2998
   macro avg       0.93      0.61      0.68      2998
weighted avg       0.98      0.98      0.98      2998



This is a tough problem; we can get 0.97+ accuracy and not get any of the class = 1 correct.

In [6]:
express.histogram(x=COLUMNS, y=regression.coef_[0]).show()

In [7]:
import arrow
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score


time_start = arrow.now()
tree = DecisionTreeClassifier(max_depth=7, random_state=2024)
tree.fit(X=X_train, y=y_train)
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=tree.predict(X=X_test))))
print('f1: {:5.4f}'.format(f1_score(y_true=y_test, y_pred=tree.predict(X=X_test))))
print('done in {}'.format(arrow.now() - time_start,))


accuracy: 0.9850
f1: 0.5361
done in 0:00:00.092501


In [8]:
from sklearn.metrics import classification_report

print(classification_report(y_true=y_test, y_pred=tree.predict(X=X_test)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2936
           1       0.74      0.42      0.54        62

    accuracy                           0.98      2998
   macro avg       0.87      0.71      0.76      2998
weighted avg       0.98      0.98      0.98      2998

