In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Column names applied to the header-less data file; the last entry is the label.
# NOTE(review): only 10 names are supplied. The UCI MAGIC04 file is usually
# described as 10 features (including "fConc") plus the class column; if the file
# really has 11 columns, pandas will use the first one as the index and every
# name below is shifted by one -- confirm against the data file.
cols = [
    "fLength", "fWidth", "fSize", "fConc1", "fAsym",
    "fM3Long", "fM3Trans", "fAlpha", "fDist", "class",
]
df = pd.read_csv("data/magic04.data", names=cols)
df.head()

In [None]:
# Encode the label as an integer: "g" (gamma) -> 1, everything else (hadron) -> 0.
# Matching on both "g" and 1 keeps this cell idempotent: re-running it after the
# column has already been converted leaves the 0/1 labels untouched instead of
# silently turning every row into 0.
df["class"]=df["class"].isin(["g",1]).astype(int)
df.head()

In [None]:
# Overlay the per-class distribution of every feature (all columns but the label).
# density=True normalises each histogram so the two classes are comparable even
# though they have different row counts.
gamma_rows = df[df["class"] == 1]
hadron_rows = df[df["class"] == 0]
for label in cols[:-1]:
  plt.hist(gamma_rows[label], color='blue', label='gamma', alpha=0.7, density=True)
  plt.hist(hadron_rows[label], color='red', label='hadron', alpha=0.7, density=True)
  plt.ylabel("probability")
  plt.xlabel(label)
  plt.legend()
  plt.show()

# Train, validation and test datasets

In [None]:
# Shuffle once with a fixed seed so the split is reproducible across kernel
# restarts, then cut the rows 60% / 20% / 20% into train / validation / test.
# The shuffled copy gets its own name (df is left untouched) so re-running this
# cell always yields the same split.
# Positional slicing with .iloc replaces np.split, which relies on the deprecated
# DataFrame.swapaxes when handed a DataFrame.
# The features are standardised later so that they share a common range
# (fLength is in the 100s whereas fWidth is in the 10s, which can skew results).
SPLIT_SEED = 42
df_shuffled = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
train_end = int(0.6*len(df_shuffled))
valid_end = int(0.8*len(df_shuffled))
train = df_shuffled.iloc[:train_end]
valid = df_shuffled.iloc[train_end:valid_end]
test = df_shuffled.iloc[valid_end:]
print(len(train),len(valid),len(test))

In [None]:
def scale_dataset(dataframe,oversample=False,scaler=None):
  """Standardise the feature columns of `dataframe` and optionally oversample.

  The last column of `dataframe` is treated as the label; every other column
  is treated as a feature.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Features followed by the label in the last column.
  oversample : bool, default False
      When True, resample with RandomOverSampler so the classes end up with
      equal row counts.
  scaler : fitted scaler or None, default None
      When None (the original behaviour) a new StandardScaler is fitted on
      this dataframe. Pass a scaler that was already fitted on the training
      features to apply the *same* mean/std to validation or test data;
      it is then only used to transform, never re-fitted.

  Returns
  -------
  data : numpy.ndarray
      Scaled features with the label appended as the last column.
  X : numpy.ndarray
      Scaled (and possibly oversampled) features.
  Y : numpy.ndarray
      Labels aligned with X.
  """
  X=dataframe[dataframe.columns[:-1]].values  # every column except the last
  Y=dataframe[dataframe.columns[-1]].values   # last column holds the label
  if scaler is None:
    scaler=StandardScaler()
    X=scaler.fit_transform(X)
  else:
    # Reuse the caller's statistics so this split lands in the same feature
    # space as the data the scaler was fitted on.
    X=scaler.transform(X)
  if oversample:
    ros=RandomOverSampler()
    X,Y=ros.fit_resample(X,Y)
  # hstack needs two 2-D arrays: turn Y into a single column; -1 lets numpy
  # infer the number of rows.
  data=np.hstack((X,np.reshape(Y,(-1,1))))
  return data ,X,Y


In [None]:
# Class balance of the training split (1 = gamma, 0 = hadron). The counts show
# why the training set is oversampled afterwards: hadrons are under-represented.
train_labels = train["class"]
print((train_labels == 1).sum())  # gammas
print((train_labels == 0).sum())  # hadrons

In [None]:
# Scale every split; only the training split is oversampled.
# Note: each call fits its own StandardScaler inside scale_dataset, so the
# validation and test features are standardised with their own statistics.
# `train`, `valid` and `test` are rebound from DataFrames to numpy arrays here.
train, X_train, Y_train = scale_dataset(pd.DataFrame(train), oversample=True)
valid, X_valid, Y_valid = scale_dataset(pd.DataFrame(valid))
test, X_test, Y_test = scale_dataset(pd.DataFrame(test))

In [None]:
# Number of gamma rows (label 1) in the training labels after oversampling.
(Y_train == 1).sum()

## K-nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# Fit a k-nearest-neighbours classifier (k=3) on the scaled training features.
knn_model=KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train,Y_train)

In [None]:
# Predict labels for the test split with the fitted k-NN model and display them.
Y_pred=knn_model.predict(X_test)
Y_pred

In [None]:
# Summarise the k-NN test predictions against the true test labels.
print(classification_report(Y_test,Y_pred))

### We can apply Naive Bayes to classification

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes on the scaled training set; fit() returns the estimator
# itself, so construction and fitting can be chained.
nb_model = GaussianNB().fit(X_train, Y_train)

In [None]:
# Score the Naive Bayes model on the test split.
Y_pred = nb_model.predict(X_test)
nb_report = classification_report(Y_test, Y_pred)
print(nb_report)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with scikit-learn's default settings.
# A `penalty` argument can be passed to choose the regularisation term that is
# added to the loss; an l2 penalty is the quadratic (sum of squared weights) one.
lg_model = LogisticRegression().fit(X_train, Y_train)

In [None]:
# Predict with the logistic-regression model fitted in this section
# (this cell previously reused the Naive Bayes model by mistake).
Y_pred=lg_model.predict(X_test)

In [None]:
# Score the logistic-regression model on the test split.
Y_pred = lg_model.predict(X_test)
lg_report = classification_report(Y_test, Y_pred)
print(lg_report)

## SVM

In [None]:
from sklearn.svm import SVC#support vector classifier

In [None]:
# Support-vector classifier with default settings; fit() returns the estimator,
# so the fitted model is bound in a single step.
svm_model = SVC().fit(X_train, Y_train)

In [None]:
# Score the SVM on the test split and summarise the predictions.
Y_pred=svm_model.predict(X_test)
print(classification_report(Y_test,Y_pred))

## Neural Network

In [None]:
import tensorflow as tf

In [None]:
def plot_loss(history):
  """Plot training and validation loss per epoch.

  `history` is expected to expose a `.history` mapping with the keys
  'loss' and 'val_loss' (as returned by the model's fit call in this notebook).
  """
  for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve], label=curve)
  plt.xlabel('Epoch')
  plt.ylabel('Binary crossentropy')
  plt.legend()
  plt.grid(True)
  plt.show()

def plot_accuracy(history):
  """Plot training and validation accuracy per epoch.

  `history` is expected to expose a `.history` mapping with the keys
  'accuracy' and 'val_accuracy'.
  """
  for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve], label=curve)
  plt.xlabel('Epoch')
  plt.ylabel('Accuracy')
  plt.legend()
  plt.grid(True)
  plt.show()

In [None]:
def plot_history(history):
  """Plot loss (left) and accuracy (right) per epoch side by side.

  `history` is expected to expose a `.history` mapping with the keys
  'loss', 'val_loss', 'accuracy' and 'val_accuracy'.
  """
  fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,4))  # one row, two panels

  # Left panel: training vs. validation loss.
  ax1.plot(history.history['loss'],label='loss')
  ax1.plot(history.history['val_loss'],label='val_loss')
  # Axes objects use set_xlabel/set_ylabel; the pyplot-style xlabel/ylabel
  # attributes do not exist on an Axes and raise AttributeError.
  ax1.set_xlabel('Epoch')
  ax1.set_ylabel('Binary crossentropy')
  ax1.legend()
  ax1.grid(True)

  # Right panel: training vs. validation accuracy.
  ax2.plot(history.history['accuracy'],label='accuracy')
  ax2.plot(history.history['val_accuracy'],label='val_accuracy')
  ax2.set_xlabel('Epoch')
  ax2.set_ylabel('Accuracy')
  ax2.legend()
  ax2.grid(True)

  plt.show()

In [None]:
# nn_model=tf.keras.Sequential(
#     [
#         tf.keras.layers.Dense(32,activation='relu',input_shape=(9,)),
#         tf.keras.layers.Dropout(),#randomly choose at some rates certain nodes and dont train them,to prevent overfitting
#         tf.keras.layers.Dense(32,activation='relu'),
#         tf.keras.layers.Dense(1,activation='sigmoid')
#     ]
# )
# nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.001),loss='binary_crossentropy',
#                  metrics=['accuracy'])# we provide all the layers in here



In [None]:
def train_model(X_train,Y_train,num_nodes,dropout_prob,lr,batch_size,epochs):
  """Build, compile and fit a small binary classifier.

  Architecture: Dense(num_nodes, relu) -> Dropout -> Dense(num_nodes, relu)
  -> Dropout -> Dense(1, sigmoid), trained with Adam on binary cross-entropy.

  Parameters
  ----------
  X_train : 2-D array of shape (n_samples, n_features)
  Y_train : 1-D array of 0/1 labels aligned with X_train
  num_nodes : width of both hidden layers
  dropout_prob : dropout rate applied after each hidden layer
  lr : Adam learning rate
  batch_size, epochs : forwarded to fit()

  Returns
  -------
  (nn_model, history) : the fitted model and the object returned by fit().
  """
  # Take the input width from the data instead of hard-coding it, so the model
  # keeps working if the number of feature columns changes.
  num_features=X_train.shape[1]
  nn_model=tf.keras.Sequential(
    [
        tf.keras.layers.Dense(num_nodes,activation='relu',input_shape=(num_features,)),
        # Dropout randomly disables a fraction of the nodes at each training
        # step, which helps to prevent overfitting.
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes,activation='relu'),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(1,activation='sigmoid')
    ]
  )
  nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr),loss='binary_crossentropy',
                 metrics=['accuracy'])
  # validation_split=0.2 holds out part of the training data for the val_*
  # curves; an explicit validation_data=(X_valid, Y_valid) could be passed instead.
  history=nn_model.fit(
    X_train,Y_train,epochs=epochs,batch_size=batch_size,validation_split=0.2,verbose=0
  )
  return nn_model,history

In [None]:
# history=nn_model.fit(
#     X_train,Y_train,epochs=100,batch_size=32,validation_split=0.2,verbose=0
# )

In [None]:
# Train one network with explicit hyperparameters (keywords make the numbers readable).
nn_model, history = train_model(
    X_train,
    Y_train,
    num_nodes=32,
    dropout_prob=0.01,
    lr=0.001,
    batch_size=16,
    epochs=100,
)

In [None]:
# Show the loss and accuracy curves recorded while training nn_model.
plot_loss(history)
plot_accuracy(history)

In [None]:
# Evaluate the trained network on the separate validation split.
# NOTE(review): despite its name, `val_loss` holds everything evaluate() returns;
# with metrics=['accuracy'] compiled in, this is presumably [loss, accuracy] -- confirm.
val_loss=nn_model.evaluate(X_valid,Y_valid)
print(val_loss)

In [None]:
# least_val_loss=float('inf')
# least_loss_model=None
# epochs=100
# for num_nodes in [16,32,64]:
#   for dropout_prob in [0,0.2]:
#     for lr in [0.01 , 0.005 , 0.001]:
#       for batch_size in [32, 64, 128]:
#         print(f"{num_nodes} nodes, dropour {dropout_prob}, lr {lr}, batchsize {batch_size}")
#         model, history = train_model(X_train,Y_train,num_nodes,dropout_prob,lr,batch_size,epochs)
#         plot_history(history)
#         val_loss=model.evaluate(X_valid, Y_valid)
#         if val_loss[0]<least_val_loss:
#           least_val_loss=val_loss[0]
#           least_loss_model=model

In [None]:
# Turn the network's sigmoid outputs into hard 0/1 labels (threshold 0.5) and
# flatten them to 1-D. Swap in least_loss_model here if the grid search was run.
Y_pred = (nn_model.predict(X_test) > 0.5).astype(int).reshape(-1)
Y_pred

In [None]:
# Summarise the neural network's test predictions against the true test labels.
print(classification_report(Y_test,Y_pred))

## Linear Regression