In [1]:
# Data Processing
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the pulsar training set (path is relative to the current working directory).
df = pd.read_csv(filepath_or_buffer='dataset/pulsar_data_train.csv')

In [3]:
# Preview the first five rows to sanity-check how the CSV was parsed.
df.head(n=5)

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,121.15625,48.372971,0.375485,-0.013165,3.168896,18.399367,7.449874,65.159298,0.0
1,76.96875,36.175557,0.712898,3.388719,2.399666,17.570997,9.414652,102.722975,0.0
2,130.585938,53.229534,0.133408,-0.297242,2.743311,22.362553,8.508364,74.031324,0.0
3,156.398438,48.865942,-0.215989,-0.171294,17.471572,,2.958066,7.197842,0.0
4,84.804688,36.117659,0.825013,3.274125,2.790134,20.618009,8.405008,76.291128,0.0


In [4]:
# (rows, columns) of the raw frame.
np.shape(df)

(12528, 9)

In [5]:
# Show the dtype pandas inferred for each column.
df.dtypes

 Mean of the integrated profile                  float64
 Standard deviation of the integrated profile    float64
 Excess kurtosis of the integrated profile       float64
 Skewness of the integrated profile              float64
 Mean of the DM-SNR curve                        float64
 Standard deviation of the DM-SNR curve          float64
 Excess kurtosis of the DM-SNR curve             float64
 Skewness of the DM-SNR curve                    float64
target_class                                     float64
dtype: object

In [6]:
# Show the exact column labels (handy for spotting stray whitespace in the header).
df.columns

Index([' Mean of the integrated profile',
       ' Standard deviation of the integrated profile',
       ' Excess kurtosis of the integrated profile',
       ' Skewness of the integrated profile', ' Mean of the DM-SNR curve',
       ' Standard deviation of the DM-SNR curve',
       ' Excess kurtosis of the DM-SNR curve', ' Skewness of the DM-SNR curve',
       'target_class'],
      dtype='object')

In [7]:
# Trim leading/trailing whitespace from every column label.
df.columns = [label.strip() for label in df.columns]

In [8]:
# Short symbolic aliases keyed by the whitespace-stripped CSV header names.
# Renaming by label (rather than assigning a positional list to df.columns)
# means a reordered CSV header can no longer silently mislabel a feature.
column_aliases = {
    'Mean of the integrated profile': 'μ_IP',
    'Standard deviation of the integrated profile': 'σ_IP',
    'Excess kurtosis of the integrated profile': 'β2_IP',
    'Skewness of the integrated profile': 'Skp_IP',
    'Mean of the DM-SNR curve': 'μ_DM-SNR',
    'Standard deviation of the DM-SNR curve': 'σ_DM-SNR',
    'Excess kurtosis of the DM-SNR curve': 'β2_DM-SNR',
    'Skewness of the DM-SNR curve': 'Skp_DM-SNR',
    'target_class': 'target_class',
}

# Same list of short names, in the same order, that this cell has always exposed.
rename_cols = list(column_aliases.values())

# Labels that are not keys of the mapping are left untouched, so re-running
# this cell on an already-renamed frame is a no-op.
df = df.rename(columns=column_aliases)

# Fail loudly if the header did not contain exactly the expected columns.
column_mismatch = set(df.columns) ^ set(rename_cols)
assert not column_mismatch, 'Unexpected or missing columns: {}'.format(sorted(column_mismatch))

# Summary statistics rounded to two decimals; `count` exposes per-column missing values.
round(df.describe(), 2)

Unnamed: 0,μ_IP,σ_IP,β2_IP,Skp_IP,μ_DM-SNR,σ_DM-SNR,β2_DM-SNR,Skp_DM-SNR,target_class
count,12528.0,12528.0,10793.0,12528.0,12528.0,11350.0,12528.0,11903.0,12528.0
mean,111.04,46.52,0.48,1.78,12.67,26.35,8.33,105.53,0.09
std,25.67,6.8,1.06,6.21,29.61,19.61,4.54,107.4,0.29
min,5.81,24.77,-1.74,-1.79,0.21,7.37,-3.14,-1.98,0.0
25%,100.87,42.36,0.02,-0.19,1.91,14.4,5.8,35.2,0.0
50%,115.18,46.93,0.22,0.2,2.79,18.41,8.45,83.13,0.0
75%,127.11,50.98,0.47,0.93,5.41,28.34,10.73,140.0,0.0
max,189.73,91.81,8.07,68.1,222.42,110.64,34.54,1191.0,1.0


In [9]:
# Count rows per class label to check how balanced the target is.
df.loc[:, 'target_class'].value_counts()

0.0    11375
1.0     1153
Name: target_class, dtype: int64

In [10]:
# Number of missing values in each column.
df.isnull().sum()

μ_IP               0
σ_IP               0
β2_IP           1735
Skp_IP             0
μ_DM-SNR           0
σ_DM-SNR        1178
β2_DM-SNR          0
Skp_DM-SNR       625
target_class       0
dtype: int64

In [11]:
# Listwise deletion: discard every row that has at least one missing value.
# Rebinding `df` (instead of inplace=True) avoids mutating the object that
# earlier cells displayed; the resulting frame is the same.
df = df.dropna()

# Confirm that no missing values remain in any column.
df.isna().sum()

μ_IP            0
σ_IP            0
β2_IP           0
Skp_IP          0
μ_DM-SNR        0
σ_DM-SNR        0
β2_DM-SNR       0
Skp_DM-SNR      0
target_class    0
dtype: int64

In [12]:
from sklearn.model_selection import train_test_split

# Target is the label column; features are every remaining column.
y = df['target_class']
X = df.drop(columns=['target_class'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [13]:
from sklearn.preprocessing import StandardScaler

# Feature names, reused to label the scaled arrays below.
cols = X.columns

scaler = StandardScaler()

# Fit the scaler on the training split only, then apply the same transform to
# the test split, so no test-set statistics leak into the scaling.
#
# The scaler returns plain NumPy arrays, so wrap them back into DataFrames:
#   * columns=cols (not [cols]): wrapping the Index in a list makes pandas
#     build MultiIndex columns, which breaks ordinary X_train['name'] access.
#   * index=...: keep the original row labels so X_* stays aligned with y_*.
# The right-hand side is evaluated before rebinding, so `.index` still refers
# to the unscaled frame.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=cols, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=cols, index=X_test.index)

In [14]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Baseline model: SVC with every hyperparameter left at its library default,
# trained on the scaled training split (fit() returns the estimator itself).
svc = SVC().fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred = svc.predict(X_test)

# Report hold-out accuracy to four decimal places.
print('Model accuracy score with default hyperparameters: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

Model accuracy score with default hyperparameters: 0.9730


In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
# Hyperparameter grid: 7 x 4 x 2 x 2 = 112 candidates, each cross-validated
# with cv=5, i.e. 560 SVC fits plus the final refit.
# NOTE(review): scikit-learn documents `decision_function_shape` as ignored for
# binary classification, so the 'ovo'/'ovr' candidates likely duplicate each
# other here -- confirm. The key is kept so best_params_ keeps the same keys.
params = {
    'C': [0.001, 0.02, 0.01, 0.1, 0.101, 1.0, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma' : ['scale', 'auto'],
    'decision_function_shape' : ['ovo', 'ovr']
}

svc = SVC()

# n_jobs=-1 spreads the independent fits over all available cores; it should
# affect only wall-clock time, not the selected parameters or score.
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy of the best candidate.
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 10, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'linear'}
Best Accuracy: 0.9795088269025596
