### Exercise 151: Preparing the Pulsar Dataset and Checking for Null Values
The goal of this exercise is to prepare the pulsar dataset for machine learning. 

In [1]:
import numpy as np
import pandas as pd

In [2]:
pulsar_df = pd.read_csv('HTRU_2.csv')

In [3]:
pulsar_df.head()

Unnamed: 0,140.5625,55.68378214,-0.234571412,-0.699648398,3.199832776,19.11042633,7.975531794,74.24222492,0
0,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
1,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
2,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
3,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0
4,93.570312,46.698114,0.531905,0.416721,1.636288,14.545074,10.621748,131.394004,0


In [8]:
pulsar_df.tail(1)

Unnamed: 0,140.5625,55.68378214,-0.234571412,-0.699648398,3.199832776,19.11042633,7.975531794,74.24222492,0
17896,57.0625,85.79734,1.406391,0.08952,188.30602,64.712562,-1.597527,1.429475,0


In [10]:
pulsar_df.shape

(17897, 9)

In [12]:
pulsar_df.columns = [ ['Mean of integrated profile', 'Standard deviation of integrated profile', 'Excess kurtosis of integrated profile', 'Skewness of integrated profile', 'Mean of DM-SNR curve', 'Standard deviation of DM-SNR curve','Excess kurtosis of DM-SNR curve', 'Skewness of DM-SNR curve','Class']]

In [14]:
pulsar_df.head()

Unnamed: 0,Mean of integrated profile,Standard deviation of integrated profile,Excess kurtosis of integrated profile,Skewness of integrated profile,Mean of DM-SNR curve,Standard deviation of DM-SNR curve,Excess kurtosis of DM-SNR curve,Skewness of DM-SNR curve,Class
0,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
1,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
2,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
3,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0
4,93.570312,46.698114,0.531905,0.416721,1.636288,14.545074,10.621748,131.394004,0


In [15]:
pulsar_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17897 entries, 0 to 17896
Data columns (total 9 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   (Mean of integrated profile,)                17897 non-null  float64
 1   (Standard deviation of integrated profile,)  17897 non-null  float64
 2   (Excess kurtosis of integrated profile,)     17897 non-null  float64
 3   (Skewness of integrated profile,)            17897 non-null  float64
 4   (Mean of DM-SNR curve,)                      17897 non-null  float64
 5   (Standard deviation of DM-SNR curve,)        17897 non-null  float64
 6   (Excess kurtosis of DM-SNR curve,)           17897 non-null  float64
 7   (Skewness of DM-SNR curve,)                  17897 non-null  float64
 8   (Class,)                                     17897 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


In [17]:
len(pulsar_df)

17897

### Exercise 152: Using Logistic Regression to Predict Data Accuracy
The goal of this exercise is to use logistic regression to predict the classification of pulsar stars

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [21]:
# Set up matrices X and y to store the predictors and response variables, respectively
X = pulsar_df.iloc[:, 0:8]
y = pulsar_df.iloc[:, 8]

In [23]:
# Write a classifier function that takes a model as its input
def clf_model(model):
    clf = model
    scores = cross_val_score(clf, X, y)
    print('Scores:', scores)
    print('Mean score:', scores.mean())

In [25]:
# Run the clf_model function with LogisticRegression() as input
clf_model(LogisticRegression(max_iter=1000))

Scores: [0.97486034 0.97988827 0.9818385  0.97736798 0.9782062 ]
Mean score: 0.9784322576919054


### Exercise 153: Using GaussianNB, KneighborsClassifier, DecisionTree Classifier, and RandomForestClassifier to predict accuracy in our Dataset

In [26]:
from sklearn.naive_bayes import GaussianNB

In [27]:
clf_model(GaussianNB())

Scores: [0.96061453 0.92374302 0.94272143 0.92847164 0.96451523]
Mean score: 0.9440131680613636


In [28]:
from sklearn.neighbors import KNeighborsClassifier
clf_model(KNeighborsClassifier())

Scores: [0.96955307 0.96927374 0.97317687 0.9706622  0.97289746]
Mean score: 0.9711126668446134


In [29]:
from sklearn.tree import DecisionTreeClassifier
clf_model(DecisionTreeClassifier())

Scores: [0.96787709 0.96731844 0.96954457 0.963677   0.9709416 ]
Mean score: 0.9678717409594453


In [30]:
from sklearn.ensemble import RandomForestClassifier
clf_model(RandomForestClassifier())

Scores: [0.97653631 0.98212291 0.97904443 0.97625035 0.9782062 ]
Mean score: 0.978432039160778


### Exercise 154: Finding the Pulsar Percentage from the Dataset
The goal of this exercise is to count the percentage of pulsars in our dataset

In [32]:
pulsar_df.Class.count()

Class    17897
dtype: int64

In [34]:
pulsar_df[pulsar_df.Class == 1].Class.count()

Class    1639
dtype: int64

In [35]:
# Obtain the percentage of pulsars
pulsar_df[pulsar_df.Class == 1].Class.count()/pulsar_df.Class.count()

Class    0.09158
dtype: float64

### Exercise 155: Confusion Matrix and Classification Report for the Pulsar Dataset
The goal of this exercise is to build a function that displays the confusion matrix along with the classification report

In [38]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [41]:
def confusion(model):
    clf = model
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Classification Report:', classification_report(y_test, y_pred))
    return clf

In [43]:
confusion(LogisticRegression(max_iter=1000))

Confusion Matrix: [[4034   24]
 [  63  354]]
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4058
           1       0.94      0.85      0.89       417

    accuracy                           0.98      4475
   macro avg       0.96      0.92      0.94      4475
weighted avg       0.98      0.98      0.98      4475



In [44]:
confusion(KNeighborsClassifier())

Confusion Matrix: [[4028   30]
 [  81  336]]
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4058
           1       0.92      0.81      0.86       417

    accuracy                           0.98      4475
   macro avg       0.95      0.90      0.92      4475
weighted avg       0.97      0.98      0.97      4475



In [45]:
confusion(GaussianNB())

Confusion Matrix: [[3860  198]
 [  57  360]]
Classification Report:               precision    recall  f1-score   support

           0       0.99      0.95      0.97      4058
           1       0.65      0.86      0.74       417

    accuracy                           0.94      4475
   macro avg       0.82      0.91      0.85      4475
weighted avg       0.95      0.94      0.95      4475



In [46]:
confusion(RandomForestClassifier())

Confusion Matrix: [[4032   26]
 [  56  361]]
Classification Report:               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4058
           1       0.93      0.87      0.90       417

    accuracy                           0.98      4475
   macro avg       0.96      0.93      0.94      4475
weighted avg       0.98      0.98      0.98      4475



### Exercise 156: Using AdaBoost to Predict the Best Optimal Values
The goal of this exercise is to predict pulsars and median housing prices in Boston using AdaBoost

In [47]:
from sklearn.ensemble import AdaBoostClassifier

In [49]:
clf_model(AdaBoostClassifier())



Scores: [0.97430168 0.97988827 0.98127969 0.97597094 0.97708857]
Mean score: 0.9777058290056365


In [50]:
confusion(AdaBoostClassifier())



Confusion Matrix: [[4032   26]
 [  63  354]]
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4058
           1       0.93      0.85      0.89       417

    accuracy                           0.98      4475
   macro avg       0.96      0.92      0.94      4475
weighted avg       0.98      0.98      0.98      4475

