In [None]:
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [None]:
# Where the dataset comes from and where the local copy is written.
DATA_URL = "https://raw.githubusercontent.com/alexandrehsd/Predicting-Pulsar-Stars/master/pulsar_stars.csv"
DATA_FILE_PATH = "radiopulsars.csv"

# Download the CSV next to the notebook. The trailing semicolon keeps the
# (filename, headers) tuple returned by urlretrieve out of the cell output.
urllib.request.urlretrieve(DATA_URL, DATA_FILE_PATH);

In [None]:
# Replace the header names from the CSV with short labels: four statistics for
# each of the two "IP" and "DM-SNR" groups, followed by the label column.
column_names = [
    "IP Mean", "IP Sd", "IP Kurtosis", "IP Skewness",
    "DM-SNR Mean", "DM-SNR Sd", "DM-SNR Kurtosis", "DM-SNR Skewness",
    "target_class",
]

df = pd.read_csv(DATA_FILE_PATH)
# Assigning to .columns raises if the file does not have exactly nine columns.
df.columns = column_names

In [None]:
# Preview the first five rows to sanity-check the renamed columns.
df.head(n=5)

In [None]:
# Number of rows per label value, to see how balanced the two classes are.
class_counts = df["target_class"].value_counts()
class_counts

In [None]:
# Separate the label column from the eight feature columns.
y = df["target_class"]
X = df.drop(columns=["target_class"])

In [None]:
# Standardize every feature, learning the scaling statistics from all rows of X.
scaler = StandardScaler().fit(X)

# transform() returns a bare array, so wrap it to keep the feature names.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

In [None]:
# Project the standardized features onto their first two principal components
# to get a visual impression of how separable the two classes are.
pca = PCA(n_components=2)
pca_repr = pca.fit_transform(X_scaled)
pc1_share, pc2_share = pca.explained_variance_ratio_

fig, ax = plt.subplots()
# One scatter call per class so that every colour gets its own legend entry.
for class_value, color in [(0, "blue"), (1, "orange")]:
    # Positional boolean mask: the rows of pca_repr are in the same order as df.
    in_class = (df["target_class"] == class_value).values
    ax.scatter(
        pca_repr[in_class, 0],
        pca_repr[in_class, 1],
        color=color,
        alpha=0.5,
        label="target_class = {}".format(class_value),
    )

# Label everything so the figure can be read on its own.
ax.set_xlabel("PC1 ({:.1%} of variance)".format(pc1_share))
ax.set_ylabel("PC2 ({:.1%} of variance)".format(pc2_share))
ax.set_title("PCA projection of the standardized features")
ax.legend();

In [None]:
# Base estimator; the search below overrides tol, C and solver per candidate.
clf = LogisticRegression()

# Candidate grid: 5 tolerances x 5 regularization strengths, one solver.
params = {
    "tol": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2],
    "C": [0.1, 1, 10, 50, 100],
    "solver": ["newton-cg"],
}

# Exhaustive cross-validated search, ranking candidates by F1 and running
# up to 8 jobs in parallel.
grid_search = GridSearchCV(clf, params, scoring="f1", n_jobs=8)

In [None]:
# Run the search on the full standardized dataset. fit() returns the search
# object itself, so `cv` is simply a second name for the fitted `grid_search`.
grid_search.fit(X_scaled, y)
cv = grid_search

# Hyperparameter combination with the best mean cross-validated F1.
cv.best_params_

In [None]:
# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, train_size=0.8, random_state=1
)

# Tune on the training rows only. `grid_search` was fit on the full dataset,
# so reusing its result (or hard-coding values copied from its output) would
# let the held-out rows influence model selection and would silently go stale
# whenever the data or the grid changes. clone() yields an unfitted copy with
# the same settings and leaves the already-fitted `grid_search` untouched.
train_search = clone(grid_search).fit(X_train, y_train)

# grid_search was built with the default refit=True, so the winning
# configuration has been refit on all of X_train and is ready to predict.
best_clf = train_search.best_estimator_
pred = best_clf.predict(X_test)

# NOTE(review): X_scaled was standardized with statistics computed from every
# row, so a small amount of test-set information still reaches training.
# Moving StandardScaler into a Pipeline fitted on X_train would close that gap.

In [None]:
# F1 of the held-out predictions (f1_score's default treats label 1 as positive).
f1_score(y_true=y_test, y_pred=pred)

In [None]:
# confusion_matrix() returns rows = true labels and columns = predicted labels,
# ordered by `labels` (sorted, i.e. [0, 1], when omitted). The row/column names
# used here expect the opposite layout: predictions on the rows, actual values
# on the columns, positive class first. Class 1 is the positive class, matching
# the default pos_label of the F1 metric used in this notebook.
#   labels=[1, 0] -> positive class first
#   .T            -> predictions on the rows, actual values on the columns
# Without both adjustments the two diagonal cells (TP and TN) are swapped.
cm = pd.DataFrame(data=confusion_matrix(y_test, pred, labels=[1, 0]).T,
                  columns=["Actual P", "Actual N"],
                  index=["Predict P", "Predict N"])

# Trailing semicolon suppresses the Axes repr in the cell output.
sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu');