<a href="https://colab.research.google.com/github/jbae99/midterm_Fall2022/blob/main/DataSciMidterm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive/; the data file path used later in this
# notebook ('/content/gdrive/MyDrive/...') lives under this mount point.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
#installations
!pip install -U scikit-learn
!pip install pandas
#for data handling
import pandas as pd
import numpy as np
#for stats tests
import scipy as sp
#for plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
#for machine learning
from sklearn import preprocessing, model_selection, feature_selection, ensemble, linear_model, metrics, decomposition, neighbors, svm, naive_bayes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector as selector
#for metric evaluations
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
class MLModels:
  """Small classification pipeline around a ';'-delimited data file.

  Intended call order (each step stores its results on the instance for the next):
  MLModels(dataPath) -> splitTestTrain(ratio) -> scaleData(scaleType)
  -> classifyData(algorithm) -> showResults().
  """

  ##global attributes for convenience
  scalerTypes = ['standard', 'minmax', 'robust']
  algorithms = ['GaussNB', 'Perceptron', 'SVM']

  ##columns read from the data file: features (x) and target (y)
  featureColumns = ['Signi070', 'Sp070', 'e_Sp070',
       'Sp070/Sbg070', 'Sconv070', 'Stot070', 'e_Stot070', 'FWHMa070',
       'FWHMb070', 'PA070', 'Signi160', 'Sp160', 'e_Sp160', 'Sp160/Sbg160',
       'Sconv160', 'Stot160', 'e_Stot160', 'FWHMa160', 'FWHMb160', 'PA160',
       'Signi250', 'Sp250', 'e_Sp250', 'Sp250/Sbg250', 'Sconv250', 'Stot250',
       'e_Stot250', 'FWHMa250', 'FWHMb250', 'PA250', 'Signi350', 'Sp350',
       'e_Sp350', 'Sp350/Sbg350', 'Sconv350', 'Stot350', 'e_Stot350',
       'FWHMa350', 'FWHMb350', 'PA350', 'Signi500', 'Sp500', 'e_Sp500',
       'Sp500/Sbg500', 'Stot500', 'e_Stot500', 'FWHMa500', 'FWHMb500', 'PA500',
       'SigniNH2', 'NpH2', 'NpH2/Nbg', 'NconvH2', 'NbgH2', 'FWHMaNH2',
       'FWHMbNH2', 'PANH2', 'NSED']
  targetColumn = 'Coretype'

  ##label encoder for the target; a fresh one is created per instance in splitTestTrain
  ##so that two MLModels objects never share fitted encoder state
  __encoder = None

  ##constructor method opens, assigns, and gives a basic summary of data file given in parameter
  def __init__(self, dataPath):
    """Load the data file, pick out feature/target columns, clean and summarise them.

    Parameters
    ----------
    dataPath : str, path object or file-like
        Passed unchanged to pandas.read_csv (delimiter ';', lines starting with '#'
        are treated as comments).

    Sets self.x (feature DataFrame) and self.y (target Series).
    TODO: the plot mentioned in the original notes is not implemented yet.
    """
    dataset = pd.read_csv(dataPath, delimiter=';', comment='#')
    #.copy() gives an independent frame, so the cleaning below never writes into a view of dataset
    self.x = dataset[self.featureColumns].copy()
    self.y = dataset[self.targetColumn]

    #describe has to be called; interpolating the bare attribute prints the bound-method repr
    print(f'Summary Stats of Loaded Dataframe \n {self.x.describe()} \n {self.y.describe()}')

    #stripping whitespace (result is assigned back; a bare .str.strip() is a no-op)
    #non-string cells are passed through untouched
    for col in self.x.columns:
      column = self.x[col]
      if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        self.x[col] = column.map(lambda cell: cell.strip() if isinstance(cell, str) else cell)
    #replacing empty values with NaN
    self.x = self.x.replace(r'^\s*$', np.nan, regex=True)

    #print summary statistics of dataframe
    print(dataset.describe(include='all'))

    #make a plot (TODO: not implemented)


  ##method splits the data into a training set and testing set based on parameter
  def splitTestTrain(self, ratio, random_state=None):
    """Split x/y into train and test parts, impute missing features, encode the target.

    Parameters
    ----------
    ratio : float or int
        Forwarded to train_test_split as test_size.
    random_state : int or None, optional
        Forwarded to train_test_split; pass an int for a reproducible split.
        The default (None) keeps the original unseeded behaviour.

    Sets self.x_train, self.x_test (imputed arrays) and self.y_train, self.y_test
    (integer-encoded labels).
    """
    #split data and print shape of train and test values
    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
        self.x, self.y, test_size = ratio, random_state = random_state)
    print(f'Shape of original Dataframe: {self.x.shape} {self.y.shape} \n\
    Shape of training data: {self.x_train.shape} {self.y_train.shape} \n\
    Shape of testing data: {self.x_test.shape} {self.y_test.shape}')

    #imputing values in training and test data (statistics learned from training data only)
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
    self.x_train = imputer.fit_transform(self.x_train)
    self.x_test = imputer.transform(self.x_test)

    #encoding target (y) values
    #the encoder is fitted on every label in self.y: the split is not stratified, so a rare
    #class can land only in the test part and transform() would then reject it as unseen
    self.__encoder = preprocessing.LabelEncoder()
    self.__encoder.fit(self.y)
    self.y_train = self.__encoder.transform(self.y_train)
    self.y_test = self.__encoder.transform(self.y_test)

  ##method scales the data based on the parameter given
  def scaleData(self, scaleType):
    """Fit a scaler on the training features and apply it to train and test features.

    Parameters
    ----------
    scaleType : str
        One of MLModels.scalerTypes. Any other value triggers an interactive
        input() prompt that repeats until a valid name is entered.

    Overwrites self.x_train / self.x_test with the scaled arrays and stores the
    scaled training data as a DataFrame in self.x_train_df.
    """
    while scaleType not in self.scalerTypes:
      scaleType = input(f'\nPlease select a valid scaler type: {self.scalerTypes}')

    #selecting scaler object to utilize
    scalerClasses = {
      'standard': preprocessing.StandardScaler,
      'minmax': preprocessing.MinMaxScaler,
      'robust': preprocessing.RobustScaler,
    }
    self.__scaler = scalerClasses[scaleType]()

    #scaling x data (scaler is fitted on training data only, then reused for test data)
    print(f'Scaling test and training x data using {scaleType}')
    self.__scaler.fit(self.x_train)
    self.x_train = self.__scaler.transform(self.x_train)
    self.x_test = self.__scaler.transform(self.x_test)
    self.x_train_df = pd.DataFrame(self.x_train)
    #describe() is called so the statistics are printed rather than the method repr
    print(f'\nSummary of dataframe scaled with {scaleType}:\n{self.x_train_df.describe()}')


  ##method chooses the algorithm to use for the model based on parameter given

  def classifyData(self, algorithm):
    """Fit the chosen classifier on the training data and predict the test data.

    Parameters
    ----------
    algorithm : str
        One of MLModels.algorithms. Any other value triggers an interactive
        input() prompt that repeats until a valid name is entered.

    Stores the predictions for self.x_test in self.y_pred.
    """
    while algorithm not in self.algorithms:
      algorithm = input(f'Please enter one of the available inputs: {self.algorithms}')

    if algorithm == 'GaussNB':
      classifier = GaussianNB()
    elif algorithm == 'Perceptron':
      classifier = Perceptron(tol = 0.001, random_state = 0)
    else:
      #only 'SVM' is left once the name has been validated above
      classifier = svm.SVC()

    classifier.fit(self.x_train, self.y_train)
    self.y_pred = classifier.predict(self.x_test)


  ##method analyzes predicted values generated from classifyData. Prints Confusion matrix, classification report, and overall accuracy of the algorithm
  def showResults(self):
    """Print confusion matrix, classification report, accuracy and decoded labels.

    Reads self.y_test and self.y_pred, so splitTestTrain and classifyData must
    have been called first.
    """
    print(f'Confusion Matrix and full Classification Report: \n{confusion_matrix(self.y_test, self.y_pred)}')
    print(classification_report(self.y_test, self.y_pred))

    # Evaluate label (subsets) accuracy
    print(f'Overall Accuracy of MLM: {accuracy_score(self.y_test, self.y_pred)}')

    #inverse_transform maps the integer codes back to the original class names
    print(f'Predicted classes: {self.__encoder.inverse_transform(self.y_pred)} \n\
    Actual classes: {self.__encoder.inverse_transform(self.y_test)}')

In [None]:
##Location of the ';'-delimited data file on the mounted Google Drive
dataPath = '/content/gdrive/MyDrive/Topics Data/ophiuchus_tablea1(1).tsv'

##Build the pipeline object: the constructor reads dataPath, separates x from y, and prints a basic summary
myMachine = MLModels(dataPath)

##Hold out 30% of the rows as the testing set; the rest is used for training
myMachine.splitTestTrain(ratio=0.3)

##Scale the features (valid choices are 'standard', 'minmax', and 'robust', all listed in myMachine.scalerTypes)
myMachine.scaleData(scaleType='robust')

##Run every classifier listed in myMachine.algorithms in turn: fit, predict, then report
for model in myMachine.algorithms:
  print(f'Classifying data with {model}')
  myMachine.classifyData(algorithm=model)
  print(f'Results of {model}')
  myMachine.showResults()


Summary Stats of Loaded Dataframe 
 <bound method NDFrame.describe of      Signi070    Sp070  e_Sp070  Sp070/Sbg070  Sconv070  Stot070  e_Stot070  \
0         1.6 -0.01810    0.015         -0.59   -0.0656 -0.00282      0.022   
1         0.0  0.00595    0.015          0.03    0.1070 -0.77100      0.110   
2        47.0  0.75800    0.017          3.55    0.8610  1.48000      0.038   
3         0.4 -0.00339    0.015         -0.01   -0.0338  0.28400      0.027   
4         0.0  0.00435    0.015          0.01    0.0830  4.38000      0.110   
..        ...      ...      ...           ...       ...      ...        ...   
508      48.6  0.76900    0.017          7.51    1.1900  1.18000      0.027   
509       0.0  0.00517    0.015          0.06    0.0978  0.68600      0.120   
510       0.0  0.00714    0.015          0.08    0.1310  0.72600      0.090   
511       2.9 -0.01480    0.015         -0.22   -0.2350 -0.62300      0.033   
512      39.3  0.51900    0.016          4.79    1.0700  1.28