In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
data1 = pd.read_csv('pulsar_stars.csv')
data1.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [3]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0    Mean of the integrated profile                17898 non-null  float64
 1    Standard deviation of the integrated profile  17898 non-null  float64
 2    Excess kurtosis of the integrated profile     17898 non-null  float64
 3    Skewness of the integrated profile            17898 non-null  float64
 4    Mean of the DM-SNR curve                      17898 non-null  float64
 5    Standard deviation of the DM-SNR curve        17898 non-null  float64
 6    Excess kurtosis of the DM-SNR curve           17898 non-null  float64
 7    Skewness of the DM-SNR curve                  17898 non-null  float64
 8   target_class                                   17898 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


In [4]:
data1["target_class"].value_counts()

0    16259
1     1639
Name: target_class, dtype: int64

In [5]:
# Stratified Sampling using Scikit-learn's Stratified Shuffle Split Class
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
for train_index, test_index in split.split(data1, data1["target_class"]):
    strat_train_set = data1.loc[train_index]
    strat_test_set = data1.loc[test_index]

In [6]:
 strat_test_set["target_class"].value_counts()

0    4065
1     410
Name: target_class, dtype: int64

In [7]:
 strat_train_set["target_class"].value_counts()

0    12194
1     1229
Name: target_class, dtype: int64

In [8]:
train_set = strat_train_set.drop("target_class", axis=1) # drop labels for training set
train_labels = strat_train_set["target_class"].copy()
test_set = strat_test_set.drop("target_class", axis=1) # drop labels for testing set
test_labels = strat_test_set["target_class"].copy()

In [9]:
train_labels

15026    0
14795    0
5940     0
3002     0
12431    0
        ..
6349     0
12584    0
15650    0
4596     0
5560     1
Name: target_class, Length: 13423, dtype: int64

In [10]:
from sklearn.cluster import KMeans
tr_set=train_set.T
kmeans = KMeans(n_clusters=8, random_state=42).fit(tr_set)
tr_fin=kmeans.cluster_centers_.T
te_set=test_set.T
kmeans1 = KMeans(n_clusters=8, random_state=42).fit(te_set)
te_fin=kmeans1.cluster_centers_.T

In [11]:
tr_fin.shape

(13423, 8)

In [12]:
train_labels.shape

(13423,)

In [13]:
from sklearn.metrics import matthews_corrcoef
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
parameters = {"loss" : ["hinge", "log", "squared_hinge", "modified_huber", "perceptron"],
              "alpha" : [0.0001, 0.001, 0.01, 0.1],
              "penalty" : ["l2", "l1", "elasticnet", "none"],
              "tol": [1.e-3]
             }

clf2 = SGDClassifier(early_stopping= True, max_iter=10000, random_state=42)
clf = GridSearchCV(clf2, parameters, scoring = 'balanced_accuracy', cv=10)
res1=clf.fit(tr_fin, train_labels)
res1.best_estimator_

SGDClassifier(alpha=0.01, early_stopping=True, loss='log', max_iter=10000,
              penalty='l1', random_state=42)

In [14]:
res1.best_params_

{'alpha': 0.01, 'loss': 'log', 'penalty': 'l1', 'tol': 0.001}

In [15]:
res1.best_score_

0.9066848497133823

In [16]:
y_test_pred=res1.best_estimator_.predict(te_fin)

In [17]:
import sklearn
from sklearn.metrics import accuracy_score
sklearn.metrics.accuracy_score(test_labels, y_test_pred)*100

59.41899441340782

In [18]:
#error rate=100-accuracy
100-(sklearn.metrics.accuracy_score(test_labels, y_test_pred)*100)

40.58100558659218

In [19]:
matthews_corrcoef(test_labels, y_test_pred)*100

27.88869191546068

In [20]:
from sklearn.metrics import precision_score
sklearn.metrics.precision_score(test_labels, y_test_pred, pos_label=1)*100

17.483811285846436

In [21]:
from sklearn.metrics import recall_score
sklearn.metrics.recall_score(test_labels, y_test_pred, pos_label=1)*100

92.19512195121952

In [22]:
from sklearn.metrics import f1_score
sklearn.metrics.f1_score(test_labels, y_test_pred, pos_label=1)*100

29.39346811819596