In [1]:
# Importing the needed modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.exceptions import DataConversionWarning
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import silhouette_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from mpl_toolkits.mplot3d import Axes3D

# Silence noisy library warnings so that cell outputs stay readable.
# DataConversionWarning is filtered explicitly in addition to the broad
# UserWarning category.
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=DataConversionWarning)

# To display plots inside the iPython Notebook itself
%matplotlib inline

In [2]:
# Peek at the first few raw lines of the file to see how the data is organized
# (i.e. which delimiter it uses) before choosing the matching reader function,
# e.g. the data could be in .csv, .tsv, Excel format, etc.
pathOfDataFile = "data/HTRU_2.csv"
noOfLinesToView = 2
firstFewLines = []

# Read only the requested number of lines; the file is closed as soon as the
# `with` block exits.
with open(pathOfDataFile) as dataFile:
    firstFewLines = [next(dataFile) for lineNo in range(noOfLinesToView)]

# Each line still carries its own trailing newline, so print() leaves a blank
# line between the rows in the output.
for line in firstFewLines:
    print(line)

140.5625,55.68378214,-0.234571412,-0.699648398,3.199832776,19.11042633,7.975531794,74.24222492,0

102.5078125,58.88243001,0.465318154,-0.515087909,1.677257525,14.86014572,10.57648674,127.3935796,0



In [3]:
# Load the comma-delimited data file into a pandas DataFrame.
# Column names are passed explicitly via `names`, so every line of the file
# (including the first) is read as data.
pulsars_Df = pd.read_csv(
    pathOfDataFile,
    sep=",",
    names=[
        "mean_intg",
        "sd_intg",
        "exc_kurt_intg",
        "skw_intg",
        "mean_dmsnr",
        "sd_dmsnr",
        "exc_kurt_dmsnr",
        "skw_dmsnr",
        "class",
    ],
)

pulsars_Df.head(2)

Unnamed: 0,mean_intg,sd_intg,exc_kurt_intg,skw_intg,mean_dmsnr,sd_dmsnr,exc_kurt_dmsnr,skw_dmsnr,class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0


In [4]:
# Build the modelling frame from the raw one without mutating `pulsars_Df`:
# add natural-log versions of the two DM-SNR columns, then drop the originals.
imputedPulsarDf = (
    pulsars_Df
    .assign(
        log_mean_dmsnr=lambda frame: np.log(frame['mean_dmsnr']),
        log_sd_dmsnr=lambda frame: np.log(frame['sd_dmsnr']),
    )
    .drop(columns=['sd_dmsnr', 'mean_dmsnr'])
)
imputedPulsarDf.describe()

Unnamed: 0,mean_intg,sd_intg,exc_kurt_intg,skw_intg,exc_kurt_dmsnr,skw_dmsnr,class,log_mean_dmsnr,log_sd_dmsnr
count,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0,17898.0
mean,111.079968,46.549532,0.477857,1.770279,8.303556,104.857709,0.091574,1.390669,3.079164
std,25.652935,6.843189,1.06404,6.167913,4.506092,106.51454,0.288432,1.235812,0.572203
min,5.8125,24.772042,-1.876011,-1.791886,-3.13927,-1.976976,0.0,-1.545474,1.997476
25%,100.929688,42.376018,0.027098,-0.188572,5.781506,34.960504,0.0,0.653926,2.669817
50%,115.078125,46.947479,0.22324,0.19871,8.433515,83.064556,0.0,1.030276,2.915677
75%,127.085938,51.023202,0.473325,0.927783,10.702959,139.309331,0.0,1.698228,3.347378
max,192.617188,98.778911,8.069522,68.101622,34.539844,1191.000837,1.0,5.408929,4.706302


In [5]:
# Standardize the predictor columns (everything except "class").
# The fitted scaler is kept in `scaler` so it can be reused later.

scaler = StandardScaler()
scaledPulsarData = scaler.fit_transform(imputedPulsarDf.drop(columns="class"))

In [6]:
# Perform PCA on the scaled predictors.

# Start with n_components equal to the number of features; the variance
# explained by each component (scree plot) can then be used to choose how
# many PCA components to keep.

pca = PCA(n_components=8)
pca.fit(scaledPulsarData)
X_pca = pca.transform(scaledPulsarData)

# Sanity check: shapes of the scaled input and of the projected output.
print(scaledPulsarData.shape)
print(X_pca.shape)

(17898, 8)
(17898, 8)


In [7]:
### Separate the predictor variables (X) from the class labels (y)
X, y = scaledPulsarData, imputedPulsarDf['class'].values

In [8]:
## The classes are imbalanced, so build a stratified shuffle-split CV object:
## 10 random 80/20 train/test splits, seeded so the splits are reproducible.
num_cv_iterations = 10
stratified_cv_object = StratifiedShuffleSplit(
    n_splits=num_cv_iterations,
    test_size=0.2,
    random_state=999,
)

print(stratified_cv_object)

StratifiedShuffleSplit(n_splits=10, random_state=999, test_size=0.2,
            train_size=None)


In [9]:
###### Function Taken from Dr. Larsons Notebook ##############################

# this function definition just formats the weights into readable strings
# you can skip it without loss of generality to the Data Science content
def get_feature_names_from_weights(weights, names):
    """Format component weights as readable linear-combination strings.

    Parameters
    ----------
    weights : iterable of indexable sequences of numbers
        One sequence per component; ``comp[i]`` is the weight applied to
        ``names[i]``. Each sequence must have at least ``len(names)`` entries.
    names : iterable
        Feature names, in the same order as the weights. Any object that can
        be formatted with ``%s`` is accepted (strings, integer labels, ...).

    Returns
    -------
    list of str
        One string per component, e.g. ``'0.50*a -0.25*b '``. Every term is
        followed by a single space, and every non-negative weight except the
        first one is prefixed with ``'+'``.

    Raises
    ------
    IndexError
        If a component has fewer weights than there are names.
    """
    # Materialize once so `names` may be any iterable (even a one-shot
    # iterator) and can be walked again for every component.
    names = list(names)

    formatted = []
    for comp in weights:
        terms = []
        for fidx, name in enumerate(names):
            # Negative numbers already carry their own '-' sign; only
            # non-negative weights after the first need an explicit '+'.
            sign = '+' if fidx > 0 and comp[fidx] >= 0 else ''
            terms.append('%s%.2f*%s ' % (sign, comp[fidx], name))
        formatted.append(''.join(terms))
    return formatted
  
# Now let's get to the data analytics!
# Describe each principal component as a readable weighted sum of the
# original predictor columns (every column except "class").
pca_weight_strings = get_feature_names_from_weights(
    pca.components_,
    imputedPulsarDf.columns.drop("class"),
)

# Wrap the transformed output in a pandas DataFrame and attach the class labels.
# NOTE(review): `pca_weight_strings` is wrapped in an extra list here, which
# presumably makes pandas build multi-level column labels — confirm intended.
df_pca = pd.DataFrame(X_pca, columns=[pca_weight_strings])
df_pca["class"] = imputedPulsarDf["class"]


## Objects
- DataFrames
    - pulsars_Df
    - imputedPulsarDf
    - scaledPulsarData (NumPy array returned by the scaler, not a DataFrame)

- Class and Predictor Variables
    - X = scaledPulsarData
    - y = imputedPulsarDf['class'].values

- CV Object
    - stratified_cv_object