In [13]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # Example model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [14]:
# Load the crossmatched catalogue and let pandas display every column.
df = pd.read_csv('gaia_matched/JOHNSTON_MDwarfs_Crossmatch.csv')
pd.set_option('display.max_columns', None)
print(len(df))

# Columns treated as pure text (identifiers, flags, references); removed first.
text_columns = ['Designation', '2MASS Number', 'STAT', 'PM?', 'M?',
                'source_id', 'DISC PUBL', 'duplicated_source']
strx = df.drop(columns=text_columns)

# Force these columns to numeric; entries that cannot be parsed become NaN.
columns_to_clean = ['+/-', 'H', 'W3']
strx[columns_to_clean] = strx[columns_to_clean].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)

# Remove the columns that came from the original (non-Gaia) table.
catalogue_columns = ['RA', 'DEC', 'DIST', '+/-', 'V', 'r', 'R', 'i', 'I', 'J', 'H', 'Ks', 'K',
                     'W1', 'W2', 'W3', 'Teff K', 'RAD Rj', 'MASS Mj', 'a AU', 'PER yr']
strx = strx.drop(columns=catalogue_columns)

# Remove the columns that are mostly NaN.
sparse_columns = ['dr2_radial_velocity', 'dr2_radial_velocity_error', 'dr2_rv_template_teff',
                  'dr2_rv_template_logg', 'sdssdr13', 'urat1', 'skymapper2', 'panstarrs1',
                  'pseudocolour', 'pseudocolour_error']
NaNx = strx.drop(columns=sparse_columns)

# Per-column NaN tally: show the affected columns, then the worst-case
# fraction of rows that are missing a value in a single column.
nan_count = NaNx.isna().sum()
print(nan_count[nan_count > 0])
print(nan_count.max() / len(NaNx))

# Discard every ROW that still holds at least one NaN (columns are kept),
# and report how many rows survive.
NaNx_clean = NaNx.dropna(axis=0, how='any')
print(len(NaNx_clean))

# # Select columns with string (object) data
# string_columns = strx.select_dtypes(include=['object'])
# print(string_columns)

# string_positions = strx.applymap(lambda x: isinstance(x, str)).stack()
# print(strx.stack()[string_positions])

# # print(strx['W3'][2].dtype)

597
parallax                              42
parallax_error                        42
parallax_over_error                   42
pm                                    42
pmra                                  42
pmra_error                            42
pmdec                                 42
pmdec_error                           42
ruwe                                  42
phot_g_mean_flux                       1
phot_g_mean_flux_error                 1
phot_g_mean_mag                        1
phot_bp_mean_flux                     24
phot_bp_mean_flux_error               24
phot_bp_mean_mag                      24
phot_rp_mean_flux                     20
phot_rp_mean_mag                      20
phot_bp_rp_excess_factor              24
bp_rp                                 24
phot_g_mean_mag_error                  1
phot_bp_mean_mag_error                24
phot_rp_mean_mag_error                20
phot_g_mean_mag_corrected              1
phot_g_mean_mag_error_corrected        1
phot_g_mean_

In [15]:
# The 'SPEC' column is the label; every other remaining column is a feature.
y = NaNx_clean['SPEC']
X = NaNx_clean.drop('SPEC', axis=1)

# Seeded 80/20 hold-out split of the unscaled features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Train-test split on the RAW features first, so the held-out rows never
# contribute to the scaling statistics (fitting the scaler on all of X before
# splitting leaks test-set information into training). The seed and sample
# count are unchanged, so the row partition is the same as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize features (KNN is distance-based, so it is sensitive to scale).
# The mean/std are learned from the training rows only and then re-used,
# unchanged, for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the training-fitted scaling; kept so that any
# later cell that refers to X_scaled keeps working.
X_scaled = scaler.transform(X)

# KNN classifier
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = knn.predict(X_test)