<a href="https://colab.research.google.com/github/jbae99/midterm_Fall2022/blob/main/DataSciMidterm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [33]:
#installations
!pip install -U scikit-learn
!pip install pandas
!pip install scipy
#for data handling
import pandas as pd
import numpy as np
#for stats tests
import scipy
#for plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
#for saving optimized algorithms
import pickle
from joblib import dump,load
#for machine learning
import scipy.sparse
import scipy.optimize
import scipy.linalg
from scipy.sparse.linalg import cg, lsqr
from scipy.optimize import minimize
from sklearn import preprocessing, model_selection, feature_selection, ensemble, linear_model, metrics, decomposition, svm, naive_bayes, discriminant_analysis
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer#import Perceptron, ElasticNet, RidgeClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import make_column_selector as selector
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils import class_weight
from sklearn.gaussian_process import kernels
#for metric evaluations
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [34]:
class DataCleaner:
  scalerTypes = {'standard':preprocessing.StandardScaler(), 'minmax':preprocessing.MinMaxScaler(), \
                 'robust':preprocessing.RobustScaler(with_centering = True, unit_variance = True), \
                 'quantileTransform':preprocessing.QuantileTransformer()}
  __encoder = preprocessing.LabelEncoder()

  def __init__(self, dataPath):
    dataset = pd.read_csv(dataPath, delimiter=';',comment='#')
    self.x = dataset[['Signi070', 'Sp070', 'e_Sp070',
       'Sp070/Sbg070', 'Sconv070', 'Stot070', 'e_Stot070', 'FWHMa070',
       'FWHMb070', 'PA070', 'Signi160', 'Sp160', 'e_Sp160', 'Sp160/Sbg160',
       'Sconv160', 'Stot160', 'e_Stot160', 'FWHMa160', 'FWHMb160', 'PA160',
       'Signi250', 'Sp250', 'e_Sp250', 'Sp250/Sbg250', 'Sconv250', 'Stot250',
       'e_Stot250', 'FWHMa250', 'FWHMb250', 'PA250', 'Signi350', 'Sp350',
       'e_Sp350', 'Sp350/Sbg350', 'Sconv350', 'Stot350', 'e_Stot350',
       'FWHMa350', 'FWHMb350', 'PA350', 'Signi500', 'Sp500', 'e_Sp500',
       'Sp500/Sbg500', 'Stot500', 'e_Stot500', 'FWHMa500', 'FWHMb500', 'PA500',
       'SigniNH2', 'NpH2', 'NpH2/Nbg', 'NconvH2', 'NbgH2', 'FWHMaNH2',
       'FWHMbNH2', 'PANH2', 'NSED']]
    self.y = dataset['Coretype']
  
    print(f'Summary Stats of Loaded Dataframe \n {self.x.describe} \n {self.y.describe}')
    
    #print summary statistics of dataframe
    print(dataset.describe(include='all'))

    """
    #make a plot 
    dataset.hist(figsize=(20,20))
    
    #produce heatmap 
    pear_corr = dataset.corr(method='pearson')
    fig, ax = plt.subplots(figsize=(8,8))
    im = ax.imshow(pear_corr, interpolation='nearest')
    fig.colorbar(im, orientation='vertical', fraction = 0.05)
    """

  ##Method preprocessces the data 
  def cleanAndScaleData(self, scaleType):
    #stripping whitespace and replacing empty values with NaN
    cat_col_sel = selector(dtype_include = object)
    for col in cat_col_sel(self.x):
      self.x[col].str.strip()
    self.x = self.x.replace(r'^\s*$', np.nan, regex=True)

    #imputing missing data (x) values
    imputer = KNNImputer(missing_values = np.nan, add_indicator = True, n_neighbors = 10)
    self.x = imputer.fit_transform(self.x)

    #encoding class (y) values
    self.y = self.__encoder.fit_transform(self.y)

    while scaleType not in self.scalerTypes.keys():
      scaleType = input(f'\nPlease select a valid scaler type: {self.scalerTypes.keys}')
    
    self.__scaler = self.scalerTypes[scaleType]
    #scaling x data
    print(f'Scaling test and training x data using {scaleType}')
    self.x = self.__scaler.fit_transform(self.x)
    self.x_df = pd.DataFrame(self.x)
    print(f'\nSummary of dataframe scaled with {scaleType}:')
    print(self.x_df.describe)

In [45]:
from IPython.terminal.shortcuts import newline_autoindent_outer

class MLModels:
  ##classwide dictionaries for Classifiers and for parameter dictionaries
  algorithms = {'NB':ComplementNB(), 'Perceptron':linear_model.Perceptron(), 'SVM':svm.SVC(), \
                 'LogReg':linear_model.LogisticRegression(), 'SGD':linear_model.SGDClassifier(), \
                'PassiveAggressive':linear_model.PassiveAggressiveClassifier(), 'LinearDisc':discriminant_analysis.LinearDiscriminantAnalysis(), \
                'QuadDisc':discriminant_analysis.QuadraticDiscriminantAnalysis() \
                }

  parameterDict = {"NB":{'alpha':[1e-9, 1e-6, 1e-3, 0.1, 0.5, 1, 2, 3], 'norm':[True, False]}, \
                   "Perceptron":{'penalty':[None, 'l2', 'l1'], 'alpha':[1e-9, 1e-7, 1e-5, 1e-3, 0.1], 'fit_intercept':[True, False], \
                      'shuffle':[True, False], 'n_iter_no_change':[5, 10, 15], 'class_weight':['balanced', None]}, \
                  "SVM":{'C':[0.1, 0.25, 0.5, 0.75, 1, 2, 3], 'kernel':['linear', 'poly', 'rbf', 'sigmoid', 'rbf'], \
                      'degree':[1, 2, 3, 4], 'gamma':['scale', 'auto'], 'shrinking':[True, False], \
                      'class_weight':['balanced', None]},\
                  "LogReg":{'penalty':['l2', 'none'], 'tol':[1e-5, 1e-4, 1e-2, 1e-1], 'C':[0.25, 0.5, 1, 2], \
                      'fit_intercept':[True, False], 'class_weight':[None, 'balanced'], \
                      'solver':['newton-cg', 'lbfgs', 'sag', 'saga'], 'multi_class':['auto', 'ovr', 'multinomial']}, \
                   "SGD":{'loss':['hinge', 'log_loss', 'modified_huber', 'squared_error'], 'penalty':['elasticnet', 'l1', 'l2'], \
                      'alpha':[1e-4, 1e-2, 0.1, 1, 2], 'shuffle':[True, False], 'learning_rate':['constant', 'optimal', 'invscaling', 'adaptive'], \
                      'class_weight':['balanced', None]}, \
                  "PassiveAggressive":{'C':[0.25, 0.5, 1, 2], 'n_iter_no_change':[5, 10, 15], 'shuffle':[True, False], \
                      'class_weight':['balanced', None]}, \
                    "LinearDisc":{'solver':['svd', 'lsqr', 'eigen'], 'shrinkage':[None, 'auto', 0.1, 0.5], 'tol':[1e-4, 1e-2, 0.1]}, \
                   "QuadDisc":{'reg_param':[0, 0,1, 0.5, 1], 'tol':[1e-4, 1e-3, 1e-2, 1e-1]}  \
                   }

  #Filepath for .joblib files that contain optimized classifiers 
  savePath = "/content/gdrive/MyDrive/Topics Data/Optimized Classifiers/"

  ##internal use, avoiding unecessary runs on classifiers that have already been parametricized
  __testedClassifiers = ['NB', 'Perceptron', 'SVM', 'LogReg', 'SGD', 'PassiveAggressive', 'LinearDisc']

  ##constructor method assigns X and Y data to be utilized by algorithms
  def __init__(self, X, Y):
    self.x = X
    self.y = Y

  ##method splits the data into a training set and testing set based on parameter
  def splitTestTrain(self, ratio):
    #split data and print shape of train and test sets
    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x, self.y, test_size = ratio)
    print(f'Shape of original Dataframe: {self.x.shape} {self.y.shape} \n\
    Shape of training data: {self.x_train.shape} {self.y_train.shape} \n\
    Shape of testing data: {self.x_test.shape} {self.y_test.shape}')

    #calculating and creating list of class weights (in case is it needed)
    self.y_weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(self.y_train), y = self.y_train)
    self.y_full_weights = []
    for value in self.y_train:
      self.y_full_weights.append(self.y_weights[value])

  #Method utilizes Scikitlearns GridSearchCV object to construct best parameters settings for any given classification algorithm
  def findBestParams(self):

    for algo in (self.algorithms.keys() - self.__testedClassifiers):
      myClassifier = self.algorithms[algo]
      myGrid = GridSearchCV(myClassifier, self.parameterDict[algo], scoring = 'accuracy', cv = 10)
      myGrid.fit(self.x_train, self.y_train)
      optimizedClassifier = myGrid.best_estimator_
      print(f'Optimized Parameters for {algo}: \n {optimizedClassifier.get_params}')
      print(f'Classifying Data with optimized {algo}')

      ##saving the best estimator into a joblibFile
      self.saveAlgo(algo, optimizedClassifier)
      ##utilize the best estimator on the holdout data
      self.classifyData(optimizedClassifier)

  ##method classifies data and prints results of algorithms. Algorithm parameters accepts SciKitLearn Estimator Objects
  def classifyData(self, algorithm):

    myCalibrator = CalibratedClassifierCV(algorithm)
    myCalibrator.fit(self.x_train, self.y_train)
    self.y_pred = myCalibrator.predict(self.x_test)
    self.showResults()

    ##method analyzes predicted values generated from classifyData. Prints Confusion matrix, classification report, and overall accuracy of the algorithm
  def showResults(self):
      print(f'Confusion Matrix and full Classification Report: \n{confusion_matrix(self.y_test, self.y_pred)}')
      print(classification_report(self.y_test, self.y_pred)) 

      # Evaluate label (subsets) accuracy
      print(f'Overall Accuracy of Model: {accuracy_score(self.y_test, self.y_pred)}\n')

  #saves algorithms as .joblib files
  def saveAlgo(self, algoName, algo):
    saveFile = open(self.savePath + algoName + '.joblib', 'wb')
    dump(algo, saveFile)
    saveFile.close()

  #loads and returns an algorithm with its best parameters (currently unused)
  def loadAlgo(self, algoName):
    saveFile = open(self.savePath + algoName + '.joblib', 'rb')
    loadedAlgo = load(saveFile)
    return loadedAlgo

In [46]:
dataPath = '/content/gdrive/MyDrive/Topics Data/ophiuchus_tablea1(1).tsv'

cleaner = DataCleaner(dataPath)

cleaner.cleanAndScaleData('quantileTransform')

myMachine = MLModels(cleaner.x, cleaner.y) 

myMachine.splitTestTrain(0.3)

myMachine.findBestParams()

Summary Stats of Loaded Dataframe 
 <bound method NDFrame.describe of      Signi070    Sp070  e_Sp070  Sp070/Sbg070  Sconv070  Stot070  e_Stot070  \
0         1.6 -0.01810    0.015         -0.59   -0.0656 -0.00282      0.022   
1         0.0  0.00595    0.015          0.03    0.1070 -0.77100      0.110   
2        47.0  0.75800    0.017          3.55    0.8610  1.48000      0.038   
3         0.4 -0.00339    0.015         -0.01   -0.0338  0.28400      0.027   
4         0.0  0.00435    0.015          0.01    0.0830  4.38000      0.110   
..        ...      ...      ...           ...       ...      ...        ...   
508      48.6  0.76900    0.017          7.51    1.1900  1.18000      0.027   
509       0.0  0.00517    0.015          0.06    0.0978  0.68600      0.120   
510       0.0  0.00714    0.015          0.08    0.1310  0.72600      0.090   
511       2.9 -0.01480    0.015         -0.22   -0.2350 -0.62300      0.033   
512      39.3  0.51900    0.016          4.79    1.0700  1.28

  "n_samples." % (self.n_quantiles, n_samples)



Summary of dataframe scaled with quantileTransform:
<bound method NDFrame.describe of            0         1         2         3         4         5         6   \
0    0.627930  0.154297  0.000000  0.013672  0.219727  0.322266  0.164062   
1    0.000000  0.394531  0.000000  0.447266  0.504883  0.044922  0.801758   
2    0.904297  0.921875  0.634766  0.937500  0.868164  0.851562  0.401367   
3    0.551758  0.267578  0.000000  0.268555  0.253906  0.591797  0.232422   
4    0.000000  0.345703  0.000000  0.364258  0.464844  0.933594  0.801758   
..        ...       ...       ...       ...       ...       ...       ...   
508  0.910156  0.924805  0.634766  0.970703  0.900391  0.814453  0.232422   
509  0.000000  0.371094  0.000000  0.561523  0.494141  0.718750  0.833008   
510  0.000000  0.421875  0.000000  0.620117  0.531250  0.728516  0.745117   
511  0.716797  0.196289  0.000000  0.075195  0.090820  0.054688  0.326172   
512  0.898438  0.908203  0.585938  0.953125  0.889648  0.833984  0



Optimized Parameters for QuadDisc: 
 <bound method BaseEstimator.get_params of QuadraticDiscriminantAnalysis(reg_param=1)>
Classifying Data with optimized QuadDisc
Confusion Matrix and full Classification Report: 
[[30  0 15]
 [ 4  8  5]
 [ 2  0 90]]
              precision    recall  f1-score   support

           0       0.83      0.67      0.74        45
           1       1.00      0.47      0.64        17
           2       0.82      0.98      0.89        92

    accuracy                           0.83       154
   macro avg       0.88      0.71      0.76       154
weighted avg       0.84      0.83      0.82       154

Overall Accuracy of Model: 0.8311688311688312



