# Analysis of data set

In this notebook, the dataset is loaded and inspected, both visually and manually, to better understand the data we have been given.

### imports

In [1]:
import pandas as pd
import numpy as np

# for stats tests
import scipy as sp

# for plotting
import matplotlib as mpl
import matplotlib.pyplot as plt

# for machine learning
from sklearn import preprocessing, model_selection, feature_selection, ensemble, linear_model, metrics, decomposition, svm, naive_bayes
from sklearn.model_selection import train_test_split

# from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import Perceptron, ElasticNet
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import make_column_selector as selector
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils import class_weight
from sklearn.gaussian_process import kernels

# for metric evaluations
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

### loading data set

In [2]:
# Location of the catalogue table; note that despite the .tsv extension the
# fields are parsed with ';' as the separator.
dataPath = 'ophiuchus_table.tsv'

# Read the table, skipping anything after a '#' as a comment, and keep only
# the rows that have no missing values in any column.
dataset = (
    pd.read_csv(dataPath, sep=';', comment='#')
    .dropna()
)

### quick table description/analysis of dataset

In [3]:
# Summary statistics for every column. include='all' also covers the
# non-numeric (object) columns, which report count/unique/top/freq rather
# than mean/std/quantiles.
dataset.describe(include='all')

Unnamed: 0,Seq,Name,RAJ2000,DEJ2000,Signi070,Sp070,e_Sp070,Sp070/Sbg070,Sconv070,Stot070,...,SigniNH2,NpH2,NpH2/Nbg,NconvH2,NbgH2,FWHMaNH2,FWHMbNH2,PANH2,NSED,Coretype
count,513.0,513,513,513,513.0,513.0,513.0,513.0,513.0,513.0,...,513.0,513.0,513.0,513.0,513.0,513.0,513.0,513.0,513.0,513
unique,,513,509,511,,,,,,,...,,,,,,,,,,3
top,,162035.2-231721,16 27 56.71,-24 51 00.1,,,,,,,...,,,,,,,,,,starless
freq,,1,2,2,,,,,,,...,,,,,,,,,,320
mean,257.0,,,,62.05848,0.766368,0.041686,1.769805,1.499758,2.398127,...,68.387719,5.518519,0.330897,2.219298,11.233528,39.366472,25.31384,47.103314,0.590643,
std,148.234611,,,,593.758981,6.470035,0.14602,13.575532,13.409242,16.421507,...,343.624028,26.285607,0.591996,7.896192,8.354752,19.434943,10.588399,53.00515,1.189172,
min,1.0,,,,0.0,-0.349,0.015,-0.88,-4.57,-3.58,...,6.2,0.2,0.03,0.1,1.4,18.0,18.0,-36.0,0.0,
25%,129.0,,,,0.0,-0.00795,0.015,-0.02,-0.0378,-0.0431,...,11.0,0.7,0.11,0.3,5.8,27.0,18.0,-1.0,0.0,
50%,257.0,,,,0.0,0.0131,0.015,0.04,0.106,0.119,...,20.0,1.4,0.18,0.6,9.1,35.0,21.0,41.0,0.0,
75%,385.0,,,,4.9,0.0412,0.021,0.15,0.401,0.831,...,46.4,3.5,0.32,1.6,13.8,46.0,28.0,92.0,0.0,


In [4]:
# Preview the first four rows to see what individual records look like.
dataset.head(4)

Unnamed: 0,Seq,Name,RAJ2000,DEJ2000,Signi070,Sp070,e_Sp070,Sp070/Sbg070,Sconv070,Stot070,...,SigniNH2,NpH2,NpH2/Nbg,NconvH2,NbgH2,FWHMaNH2,FWHMbNH2,PANH2,NSED,Coretype
0,1,162035.2-231721,16 20 35.20,-23 17 21.4,1.6,-0.0181,0.015,-0.59,-0.0656,-0.00282,...,9.7,0.6,0.3,0.2,1.9,21,19,98,0,starless
1,2,162135.3-234146,16 21 35.30,-23 41 46.8,0.0,0.00595,0.015,0.03,0.107,-0.771,...,24.1,1.1,0.24,0.9,4.8,78,48,104,0,starless
2,3,162145.2-234232,16 21 45.21,-23 42 32.6,47.0,0.758,0.017,3.55,0.861,1.48,...,50.8,2.2,0.37,0.6,5.9,23,18,-7,3,protostellar
3,4,162149.9-234306,16 21 49.94,-23 43 06.6,0.4,-0.00339,0.015,-0.01,-0.0338,0.284,...,12.4,0.9,0.17,0.5,5.1,45,29,143,0,starless


In [5]:
# Data types of the feature (x) columns.
# 'Coretype' is excluded because it is used as the target (y).
x_column_types = dataset.drop(columns='Coretype').dtypes

# Display the dtypes; append .keys() to see the column names only.
x_column_types

Seq           int64
Name         object
RAJ2000      object
DEJ2000      object
Signi070    float64
             ...   
NbgH2       float64
FWHMaNH2      int64
FWHMbNH2      int64
PANH2         int64
NSED          int64
Length: 62, dtype: object

In [6]:
# Distinct class labels found in the target column, in order of first appearance.
y_unique_types = pd.unique(dataset['Coretype'])
y_unique_types

array(['starless', 'protostellar', 'prestellar'], dtype=object)