In [1]:
import pandas as pd
import tensorflow as tf
import os

# Load the models

In [2]:
# Load both saved Keras classifiers (paths are relative to the notebook's
# working directory), in the same order as before: balanced, then unbalanced.
rna_balanced_model, rna_unbalanced_model = (
    tf.keras.models.load_model(model_path)
    for model_path in ('rna_balance.keras', 'rna_unbalance.keras')
)

# Suspected Symbiotic Stars v1

## Balanced

In [3]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Feature table for the v1 suspected symbiotic stars; read without a header row.
df_sus_pn = pd.read_csv(
    "../../new/candidate_symbiotic_stars_v1/normalized/Suspected_SY.csv",
    header=None,
)

# Predict the class probabilities, then take the most probable class per row.
y_probs = rna_balanced_model.predict(df_sus_pn)
y_pred = np.argmax(y_probs, axis=-1)

# Every row is given true label 0, so row 0 of the matrix is simply the
# number of predictions that fell into each of the classes 0, 1 and 2.
cm = confusion_matrix([0] * len(df_sus_pn), y_pred, labels=[0, 1, 2])
print(cm)

[[14  1  0]
 [ 0  0  0]
 [ 0  0  0]]


In [6]:
# Calibrated table; only its `source_id` column is used in this cell.
df_sus_sy_normalized = pd.read_csv(
    "../../new/candidate_symbiotic_stars_v1/calibrated_data/Suspected_SY.csv"
)

# One row per prediction: probabilities rounded to 4 decimals + predicted class.
df5 = pd.DataFrame(y_probs).round(4).assign(label=y_pred)
# NOTE(review): source_id is attached by index alignment, which assumes this
# file lists the sources in the same row order as the normalized file - confirm.
df5.insert(0, 'source_id', df_sus_sy_normalized['source_id'])
df5.head()

Unnamed: 0,source_id,0,1,2,label
0,4687286621186701568,0.9898,0.006,0.0042,0
1,4651824725526390016,0.9824,0.0148,0.0029,0
2,3321366590173335424,0.5252,0.3066,0.1682,0
3,5410876219867043072,0.9998,0.0002,0.0,0
4,3575939163051304192,0.9845,0.0154,0.0,0


In [7]:
# Catalogue metadata for the v1 suspected symbiotic stars (joined on 'Gaia DR3' later).
df6 = pd.read_csv(
    '../../new/candidate_symbiotic_stars_v1/built_dataset/suspected_SY_dataset.csv'
)
df6.head()

Unnamed: 0,FIND_NAME,MAIN_ID,OTYPE,SP_TYPE,ID_Gaia,IDS,OTYPES,Gaia DR3
0,RAW 1691,LIN 521,C*,C,Gaia DR2 4687286621186701568,RAW 1691|LIN 521|2MASS J01183570-7242213|OGLE ...,C*|Em*|LP*|LP*|Em*|MIR|NIR|*|C*?|LP?,4687286621186701568
1,[BE74] 583,[BE74] 583,LongPeriodV*,G/Ke:,Gaia DR2 4651824725526390016,2MASS J05265014-7106350|EROS2-star lm058-2n-25...,LP*|Em*|NIR|V*|*,4651824725526390016
2,StHA 55,EM* StHA 55,Mira,,Gaia DR3 3321366590173335424,IRAS 05440+0642|ASAS J054642+0643.7|ASAS J0546...,Mi*|LP*|V*|LP*|SB*|LP*|MIR|V*|Em*|NIR|*|C*?|IR...,3321366590173335424
3,ZZ CMi,V* ZZ CMi,LongPeriodV*,M6I-IIep,Gaia DR3 3155368612444708096,BD+09 1633|AN 306.1934|DO 2156|GCRV 4915|G...,LP*|NIR|V*|*|IR|LP?,3155368612444708096
4,WRAY 16−51,WRAY 16-51,LongPeriodV*,M4,Gaia DR2 5410876219860836224,IRAS 09316-4621|AKARI-IRC-V1 J0933295-463450|D...,LP*|NIR|MIR|Em*|PN|*|IR,5410876219867043072


In [8]:
# Join the catalogue rows with the model output on the Gaia DR3 identifier.
# An inner join already keeps only keys present in both frames, so the former
# `indicator=True` / `_merge == 'both'` filter was a no-op and has been removed.
df_filtered = df6.merge(df5, left_on='Gaia DR3', right_on='source_id', how='inner')
# Positional selection: with the columns shown in the outputs above this keeps
# FIND_NAME, MAIN_ID, OTYPE, Gaia DR3, the three class probabilities and label
# (position 8 is the duplicated `source_id` join key and is skipped).
df_filtered = df_filtered.iloc[:, [0, 1, 2, 7, 9, 10, 11, 12]]
df_filtered.head(5)

Unnamed: 0,FIND_NAME,MAIN_ID,OTYPE,Gaia DR3,0,1,2,label
0,RAW 1691,LIN 521,C*,4687286621186701568,0.9898,0.006,0.0042,0
1,[BE74] 583,[BE74] 583,LongPeriodV*,4651824725526390016,0.9824,0.0148,0.0029,0
2,StHA 55,EM* StHA 55,Mira,3321366590173335424,0.5252,0.3066,0.1682,0
3,WRAY 16−51,WRAY 16-51,LongPeriodV*,5410876219867043072,0.9998,0.0002,0.0,0
4,NSV 05572,V* VX Crv,Mira,3575939163051304192,0.9845,0.0154,0.0,0


In [9]:
# Write the merged predictions of the balanced model to the v1 output folder.
out_name = 'rna_balanced.csv'
out_dir = '../../new/candidate_symbiotic_stars_v1/output'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists, unlike the exists()/mkdir() pair.
os.makedirs(out_dir, exist_ok=True)

fullname = os.path.join(out_dir, out_name)
df_filtered.to_csv(fullname, header=True, index=False)

## Unbalanced

In [10]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Feature table for the v1 suspected symbiotic stars; read without a header row.
df_sus_pn = pd.read_csv(
    "../../new/candidate_symbiotic_stars_v1/normalized/Suspected_SY.csv",
    header=None,
)

# Predict the class probabilities, then take the most probable class per row.
y_probs = rna_unbalanced_model.predict(df_sus_pn)
y_pred = np.argmax(y_probs, axis=-1)

# Every row is given true label 0, so row 0 of the matrix is simply the
# number of predictions that fell into each of the classes 0, 1 and 2.
cm = confusion_matrix([0] * len(df_sus_pn), y_pred, labels=[0, 1, 2])
print(cm)

[[13  1  1]
 [ 0  0  0]
 [ 0  0  0]]


In [11]:
# Calibrated table; only its `source_id` column is used in this cell.
df_sus_sy_normalized = pd.read_csv(
    "../../new/candidate_symbiotic_stars_v1/calibrated_data/Suspected_SY.csv"
)

# One row per prediction: probabilities rounded to 4 decimals + predicted class.
df5 = pd.DataFrame(y_probs).round(4).assign(label=y_pred)
# NOTE(review): source_id is attached by index alignment, which assumes this
# file lists the sources in the same row order as the normalized file - confirm.
df5.insert(0, 'source_id', df_sus_sy_normalized['source_id'])
df5.head()

Unnamed: 0,source_id,0,1,2,label
0,4687286621186701568,0.8839,0.0414,0.0748,0
1,4651824725526390016,0.9334,0.0547,0.0119,0
2,3321366590173335424,0.5468,0.1763,0.2769,0
3,5410876219867043072,0.9829,0.0171,0.0,0
4,3575939163051304192,0.9787,0.0213,0.0,0


In [12]:
# Catalogue metadata for the v1 suspected symbiotic stars (joined on 'Gaia DR3' later).
df6 = pd.read_csv(
    '../../new/candidate_symbiotic_stars_v1/built_dataset/suspected_SY_dataset.csv'
)
df6.head()

Unnamed: 0,FIND_NAME,MAIN_ID,OTYPE,SP_TYPE,ID_Gaia,IDS,OTYPES,Gaia DR3
0,RAW 1691,LIN 521,C*,C,Gaia DR2 4687286621186701568,RAW 1691|LIN 521|2MASS J01183570-7242213|OGLE ...,C*|Em*|LP*|LP*|Em*|MIR|NIR|*|C*?|LP?,4687286621186701568
1,[BE74] 583,[BE74] 583,LongPeriodV*,G/Ke:,Gaia DR2 4651824725526390016,2MASS J05265014-7106350|EROS2-star lm058-2n-25...,LP*|Em*|NIR|V*|*,4651824725526390016
2,StHA 55,EM* StHA 55,Mira,,Gaia DR3 3321366590173335424,IRAS 05440+0642|ASAS J054642+0643.7|ASAS J0546...,Mi*|LP*|V*|LP*|SB*|LP*|MIR|V*|Em*|NIR|*|C*?|IR...,3321366590173335424
3,ZZ CMi,V* ZZ CMi,LongPeriodV*,M6I-IIep,Gaia DR3 3155368612444708096,BD+09 1633|AN 306.1934|DO 2156|GCRV 4915|G...,LP*|NIR|V*|*|IR|LP?,3155368612444708096
4,WRAY 16−51,WRAY 16-51,LongPeriodV*,M4,Gaia DR2 5410876219860836224,IRAS 09316-4621|AKARI-IRC-V1 J0933295-463450|D...,LP*|NIR|MIR|Em*|PN|*|IR,5410876219867043072


In [13]:
# Join the catalogue rows with the model output on the Gaia DR3 identifier.
# An inner join already keeps only keys present in both frames, so the former
# `indicator=True` / `_merge == 'both'` filter was a no-op and has been removed.
df_filtered = df6.merge(df5, left_on='Gaia DR3', right_on='source_id', how='inner')
# Positional selection: with the columns shown in the outputs above this keeps
# FIND_NAME, MAIN_ID, OTYPE, Gaia DR3, the three class probabilities and label
# (position 8 is the duplicated `source_id` join key and is skipped).
df_filtered = df_filtered.iloc[:, [0, 1, 2, 7, 9, 10, 11, 12]]
df_filtered.head(5)

Unnamed: 0,FIND_NAME,MAIN_ID,OTYPE,Gaia DR3,0,1,2,label
0,RAW 1691,LIN 521,C*,4687286621186701568,0.8839,0.0414,0.0748,0
1,[BE74] 583,[BE74] 583,LongPeriodV*,4651824725526390016,0.9334,0.0547,0.0119,0
2,StHA 55,EM* StHA 55,Mira,3321366590173335424,0.5468,0.1763,0.2769,0
3,WRAY 16−51,WRAY 16-51,LongPeriodV*,5410876219867043072,0.9829,0.0171,0.0,0
4,NSV 05572,V* VX Crv,Mira,3575939163051304192,0.9787,0.0213,0.0,0


In [14]:
# Write the merged predictions of the unbalanced model to the v1 output folder.
out_name = 'rna_unbalanced.csv'
out_dir = '../../new/candidate_symbiotic_stars_v1/output'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists, unlike the exists()/mkdir() pair.
os.makedirs(out_dir, exist_ok=True)

fullname = os.path.join(out_dir, out_name)
df_filtered.to_csv(fullname, header=True, index=False)

# Suspected Symbiotic Stars v2

## Balanced

In [15]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Feature table for the v2 suspected symbiotic stars; read without a header row.
df_sus_pn = pd.read_csv(
    "../../new/candidate_symbiotic_stars_v2/normalized/Suspected_SY.csv",
    header=None,
)

# Predict the class probabilities, then take the most probable class per row.
y_probs = rna_balanced_model.predict(df_sus_pn)
y_pred = np.argmax(y_probs, axis=-1)

# Every row is given true label 0, so row 0 of the matrix is simply the
# number of predictions that fell into each of the classes 0, 1 and 2.
cm = confusion_matrix([0] * len(df_sus_pn), y_pred, labels=[0, 1, 2])
print(cm)

[[15  1  1]
 [ 0  0  0]
 [ 0  0  0]]


In [17]:
# Calibrated table; only its `source_id` column is used in this cell.
df_sus_sy_normalized = pd.read_csv(
    "../../new/candidate_symbiotic_stars_v2/calibrated_data/Suspected_SY.csv"
)

# One row per prediction: probabilities rounded to 4 decimals + predicted class.
df5 = pd.DataFrame(y_probs).round(4).assign(label=y_pred)
# NOTE(review): source_id is attached by index alignment, which assumes this
# file lists the sources in the same row order as the normalized file - confirm.
df5.insert(0, 'source_id', df_sus_sy_normalized['source_id'])
df5.head()

Unnamed: 0,source_id,0,1,2,label
0,6204217186929931520,1.0,0.0,0.0,0
1,4061952680197028224,0.9134,0.0799,0.0067,0
2,670455944074475008,0.7153,0.0142,0.2704,0
3,4068755633500598272,0.0642,0.1435,0.7923,2
4,2060829659152816768,0.2938,0.703,0.0032,1


In [19]:
# Catalogue metadata for the v2 suspected symbiotic stars (joined on 'Gaia DR3' later).
df6 = pd.read_csv(
    '../../new/candidate_symbiotic_stars_v2/built_dataset/suspected_SY_dataset.csv'
)
df6.head()

Unnamed: 0,FIND_NAME,MAIN_ID,OTYPE,SP_TYPE,ID_Gaia,IDS,OTYPES,Gaia DR3
0,V748 Cen,V* V748 Cen,EclBin,Ae,Gaia DR3 6204217186929931520,CD-32 10517|ALS 18924|CRTS J145936.6-332503|CS...,EB*|Ro*|NIR|V*|Em*|*,6204217186929931520
1,WRAY 16-294,WRAY 16-294,LongPeriodV*,K5,Gaia DR2 4061952680197028224,2MASS J17391381-2538050|DENIS J173913.8-253805...,LP*|PN|NIR|Em*|*|C*?|ISM|LP?,4061952680197028224
2,DASCH J075731.1+201735,ASAS J075731+2017.6,LongPeriodV*,M0III,Gaia DR2 670455944074475008,2MASS J07573112+2017347|ASAS J075731+2017.6|DA...,SB*|LP*|NIR|V*|*|Opt,670455944074475008
3,ASAS J174600-2321.3,ASAS J174600-2321.3,LongPeriodV*_Candidate,F0I,Gaia DR2 4068755633500598272,2MASS J17460018-2321163|ASAS J174600-2321.3|ER...,NIR|V*|*|LP?,4068755633500598272
4,IPHASJ201550.96+373004.2,IRAS 20140+3720,PlanetaryNeb_Candidate,,Gaia DR2 2060829659152816768,2MASS J20155096+3730042|AKARI-IRC-V1 J2015509+...,NIR|*|C*?|IR|LP?|PN?,2060829659152816768


In [20]:
# Join the catalogue rows with the model output on the Gaia DR3 identifier.
# An inner join already keeps only keys present in both frames, so the former
# `indicator=True` / `_merge == 'both'` filter was a no-op and has been removed.
df_filtered = df6.merge(df5, left_on='Gaia DR3', right_on='source_id', how='inner')
# Positional selection: with the columns shown in the outputs above this keeps
# FIND_NAME, MAIN_ID, OTYPE, Gaia DR3, the three class probabilities and label
# (position 8 is the duplicated `source_id` join key and is skipped).
df_filtered = df_filtered.iloc[:, [0, 1, 2, 7, 9, 10, 11, 12]]
df_filtered.head(5)

Unnamed: 0,FIND_NAME,MAIN_ID,OTYPE,Gaia DR3,0,1,2,label
0,V748 Cen,V* V748 Cen,EclBin,6204217186929931520,1.0,0.0,0.0,0
1,WRAY 16-294,WRAY 16-294,LongPeriodV*,4061952680197028224,0.9134,0.0799,0.0067,0
2,DASCH J075731.1+201735,ASAS J075731+2017.6,LongPeriodV*,670455944074475008,0.7153,0.0142,0.2704,0
3,ASAS J174600-2321.3,ASAS J174600-2321.3,LongPeriodV*_Candidate,4068755633500598272,0.0642,0.1435,0.7923,2
4,IPHASJ201550.96+373004.2,IRAS 20140+3720,PlanetaryNeb_Candidate,2060829659152816768,0.2938,0.703,0.0032,1


In [21]:
# Write the merged predictions of the balanced model to the v2 output folder.
out_name = 'rna_balanced.csv'
out_dir = '../../new/candidate_symbiotic_stars_v2/output'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists, unlike the exists()/mkdir() pair.
os.makedirs(out_dir, exist_ok=True)

fullname = os.path.join(out_dir, out_name)
df_filtered.to_csv(fullname, header=True, index=False)

## Unbalanced

In [22]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Feature table for the v2 suspected symbiotic stars; read without a header row.
df_sus_pn = pd.read_csv(
    "../../new/candidate_symbiotic_stars_v2/normalized/Suspected_SY.csv",
    header=None,
)

# Predict the class probabilities, then take the most probable class per row.
y_probs = rna_unbalanced_model.predict(df_sus_pn)
y_pred = np.argmax(y_probs, axis=-1)

# Every row is given true label 0, so row 0 of the matrix is simply the
# number of predictions that fell into each of the classes 0, 1 and 2.
cm = confusion_matrix([0] * len(df_sus_pn), y_pred, labels=[0, 1, 2])
print(cm)

[[14  2  1]
 [ 0  0  0]
 [ 0  0  0]]


In [23]:
# Calibrated table; only its `source_id` column is used in this cell.
df_sus_sy_normalized = pd.read_csv(
    "../../new/candidate_symbiotic_stars_v2/calibrated_data/Suspected_SY.csv"
)

# One row per prediction: probabilities rounded to 4 decimals + predicted class.
df5 = pd.DataFrame(y_probs).round(4).assign(label=y_pred)
# NOTE(review): source_id is attached by index alignment, which assumes this
# file lists the sources in the same row order as the normalized file - confirm.
df5.insert(0, 'source_id', df_sus_sy_normalized['source_id'])
df5.head()

Unnamed: 0,source_id,0,1,2,label
0,6204217186929931520,0.0222,0.9778,0.0,1
1,4061952680197028224,0.8504,0.1486,0.001,0
2,670455944074475008,0.7673,0.0485,0.1841,0
3,4068755633500598272,0.0091,0.091,0.8999,2
4,2060829659152816768,0.1461,0.8369,0.017,1


In [24]:
# Catalogue metadata for the v2 suspected symbiotic stars (joined on 'Gaia DR3' later).
df6 = pd.read_csv(
    '../../new/candidate_symbiotic_stars_v2/built_dataset/suspected_SY_dataset.csv'
)
df6.head()

Unnamed: 0,FIND_NAME,MAIN_ID,OTYPE,SP_TYPE,ID_Gaia,IDS,OTYPES,Gaia DR3
0,V748 Cen,V* V748 Cen,EclBin,Ae,Gaia DR3 6204217186929931520,CD-32 10517|ALS 18924|CRTS J145936.6-332503|CS...,EB*|Ro*|NIR|V*|Em*|*,6204217186929931520
1,WRAY 16-294,WRAY 16-294,LongPeriodV*,K5,Gaia DR2 4061952680197028224,2MASS J17391381-2538050|DENIS J173913.8-253805...,LP*|PN|NIR|Em*|*|C*?|ISM|LP?,4061952680197028224
2,DASCH J075731.1+201735,ASAS J075731+2017.6,LongPeriodV*,M0III,Gaia DR2 670455944074475008,2MASS J07573112+2017347|ASAS J075731+2017.6|DA...,SB*|LP*|NIR|V*|*|Opt,670455944074475008
3,ASAS J174600-2321.3,ASAS J174600-2321.3,LongPeriodV*_Candidate,F0I,Gaia DR2 4068755633500598272,2MASS J17460018-2321163|ASAS J174600-2321.3|ER...,NIR|V*|*|LP?,4068755633500598272
4,IPHASJ201550.96+373004.2,IRAS 20140+3720,PlanetaryNeb_Candidate,,Gaia DR2 2060829659152816768,2MASS J20155096+3730042|AKARI-IRC-V1 J2015509+...,NIR|*|C*?|IR|LP?|PN?,2060829659152816768


In [25]:
# Join the catalogue rows with the model output on the Gaia DR3 identifier.
# An inner join already keeps only keys present in both frames, so the former
# `indicator=True` / `_merge == 'both'` filter was a no-op and has been removed.
df_filtered = df6.merge(df5, left_on='Gaia DR3', right_on='source_id', how='inner')
# Positional selection: with the columns shown in the outputs above this keeps
# FIND_NAME, MAIN_ID, OTYPE, Gaia DR3, the three class probabilities and label
# (position 8 is the duplicated `source_id` join key and is skipped).
df_filtered = df_filtered.iloc[:, [0, 1, 2, 7, 9, 10, 11, 12]]
df_filtered.head(5)

Unnamed: 0,FIND_NAME,MAIN_ID,OTYPE,Gaia DR3,0,1,2,label
0,V748 Cen,V* V748 Cen,EclBin,6204217186929931520,0.0222,0.9778,0.0,1
1,WRAY 16-294,WRAY 16-294,LongPeriodV*,4061952680197028224,0.8504,0.1486,0.001,0
2,DASCH J075731.1+201735,ASAS J075731+2017.6,LongPeriodV*,670455944074475008,0.7673,0.0485,0.1841,0
3,ASAS J174600-2321.3,ASAS J174600-2321.3,LongPeriodV*_Candidate,4068755633500598272,0.0091,0.091,0.8999,2
4,IPHASJ201550.96+373004.2,IRAS 20140+3720,PlanetaryNeb_Candidate,2060829659152816768,0.1461,0.8369,0.017,1


In [26]:
# Write the merged predictions of the unbalanced model to the v2 output folder.
out_name = 'rna_unbalanced.csv'
out_dir = '../../new/candidate_symbiotic_stars_v2/output'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists, unlike the exists()/mkdir() pair.
os.makedirs(out_dir, exist_ok=True)

fullname = os.path.join(out_dir, out_name)
df_filtered.to_csv(fullname, header=True, index=False)

# Other Stars

## Balanced

In [3]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Feature table for the "other stars" sample; read without a header row.
df_sus_pn = pd.read_csv(
    "../../new/other_stars/normalized/Suspected.csv",
    header=None,
)

# Predict the class probabilities, then take the most probable class per row.
y_probs = rna_balanced_model.predict(df_sus_pn)
y_pred = np.argmax(y_probs, axis=-1)

# Every row is given true label 0, so row 0 of the matrix is simply the
# number of predictions that fell into each of the classes 0, 1 and 2.
cm = confusion_matrix([0] * len(df_sus_pn), y_pred, labels=[0, 1, 2])
print(cm)

[[4 2 1]
 [0 0 0]
 [0 0 0]]


In [4]:
# Calibrated table; only its `source_id` column is used in this cell.
df_sus_sy_normalized = pd.read_csv(
    "../../new/other_stars/calibrated_data/Suspected.csv"
)

# One row per prediction: probabilities rounded to 4 decimals + predicted class.
df5 = pd.DataFrame(y_probs).round(4).assign(label=y_pred)
# NOTE(review): source_id is attached by index alignment, which assumes this
# file lists the sources in the same row order as the normalized file - confirm.
df5.insert(0, 'source_id', df_sus_sy_normalized['source_id'])
df5.head()

Unnamed: 0,source_id,0,1,2,label
0,3321366590173335424,0.5252,0.3066,0.1682,0
1,4557410314849153920,0.743,0.0057,0.2513,0
2,2022052808961769088,0.0131,0.9869,0.0,1
3,4052553745525657600,0.0042,0.0255,0.9703,2
4,4050670827750135040,0.2784,0.7216,0.0,1


In [5]:
# Catalogue metadata for the "other stars" sample (joined on 'Gaia DR3' later).
df6 = pd.read_csv(
    '../../new/other_stars/symbad/suspected.csv'
)
df6.head()

Unnamed: 0,FIND_NAME,MAIN_ID,OTYPE,SP_TYPE,ID_Gaia,IDS,OTYPES,Gaia DR3
0,StHa55,EM* StHA 55,Mira,,Gaia DR3 3321366590173335424,IRAS 05440+0642|ASAS J054642+0643.7|ASAS J0546...,Mi*|LP*|V*|LP*|SB*|LP*|MIR|V*|Em*|NIR|*|C*?|IR...,3321366590173335424
1,GH Gem,V* GH Gem,Symbiotic*,,Gaia DR3 3160625132721733888,SV* SON 3566|AN 241.1943|ATO J106.0532+12.05...,Sy*|V*|NIR|V*|*|Opt,3160625132721733888
2,V503Her,V* V503 Her,Symbiotic*,,Gaia DR3 4557410314849153920,SV* P 4385|AN 170.1936|ASAS J173640+2318.2|A...,LP*|Sy*|V*|Pu*|NIR|V*|*|Opt,4557410314849153920
3,Hen2-442,Hen 2-442,PlanetaryNeb,,Gaia DR2 2022052808961769088,Hen 2-442|AKARI-IRC-V1 J1939433+262933|GSC2 N0...,PN|NIR|MIR|V*|*|IR,2022052808961769088
4,V850 Aql,V* V850 Aql,Symbiotic*,O-rich,Gaia DR2 4263728319553777408,SV* SON 4396|CSV 4646|IRAS 19210+0032|2MASS...,Sy*|Em*|Mas|AB*|LP*|NIR|PN|V*|*|IR|LP?|Mi?,4263728319553777408


In [6]:
# Join the catalogue rows with the model output on the Gaia DR3 identifier.
# An inner join already keeps only keys present in both frames, so the former
# `indicator=True` / `_merge == 'both'` filter was a no-op and has been removed.
df_filtered = df6.merge(df5, left_on='Gaia DR3', right_on='source_id', how='inner')
# Positional selection: with the columns shown in the outputs above this keeps
# FIND_NAME, MAIN_ID, OTYPE, Gaia DR3, the three class probabilities and label
# (position 8 is the duplicated `source_id` join key and is skipped).
df_filtered = df_filtered.iloc[:, [0, 1, 2, 7, 9, 10, 11, 12]]
df_filtered.head(5)

Unnamed: 0,FIND_NAME,MAIN_ID,OTYPE,Gaia DR3,0,1,2,label
0,StHa55,EM* StHA 55,Mira,3321366590173335424,0.5252,0.3066,0.1682,0
1,V503Her,V* V503 Her,Symbiotic*,4557410314849153920,0.743,0.0057,0.2513,0
2,Hen2-442,Hen 2-442,PlanetaryNeb,2022052808961769088,0.0131,0.9869,0.0,1
3,Hen2-379,PN M 1-44,PlanetaryNeb,4052553745525657600,0.0042,0.0255,0.9703,2
4,AS288,PN H 2-43,PlanetaryNeb,4050670827750135040,0.2784,0.7216,0.0,1


In [8]:
# Write the merged predictions of the balanced model to the other_stars output folder.
out_name = 'rna_balanced.csv'
out_dir = '../../new/other_stars/output'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists, unlike the exists()/mkdir() pair.
os.makedirs(out_dir, exist_ok=True)

fullname = os.path.join(out_dir, out_name)
df_filtered.to_csv(fullname, header=True, index=False)

## Unbalanced

In [9]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Feature table for the "other stars" sample; read without a header row.
df_sus_pn = pd.read_csv(
    "../../new/other_stars/normalized/Suspected.csv",
    header=None,
)

# Predict the class probabilities, then take the most probable class per row.
y_probs = rna_unbalanced_model.predict(df_sus_pn)
y_pred = np.argmax(y_probs, axis=-1)

# Every row is given true label 0, so row 0 of the matrix is simply the
# number of predictions that fell into each of the classes 0, 1 and 2.
cm = confusion_matrix([0] * len(df_sus_pn), y_pred, labels=[0, 1, 2])
print(cm)

[[3 2 2]
 [0 0 0]
 [0 0 0]]


In [10]:
# Calibrated table; only its `source_id` column is used in this cell.
df_sus_sy_normalized = pd.read_csv(
    "../../new/other_stars/calibrated_data/Suspected.csv"
)

# One row per prediction: probabilities rounded to 4 decimals + predicted class.
df5 = pd.DataFrame(y_probs).round(4).assign(label=y_pred)
# NOTE(review): source_id is attached by index alignment, which assumes this
# file lists the sources in the same row order as the normalized file - confirm.
df5.insert(0, 'source_id', df_sus_sy_normalized['source_id'])
df5.head()

Unnamed: 0,source_id,0,1,2,label
0,3321366590173335424,0.5468,0.1763,0.2769,0
1,4557410314849153920,0.0515,0.0305,0.918,2
2,2022052808961769088,0.1149,0.8851,0.0,1
3,4052553745525657600,0.0018,0.0174,0.9807,2
4,4050670827750135040,0.1836,0.8164,0.0,1


In [11]:
# Catalogue metadata for the "other stars" sample (joined on 'Gaia DR3' later).
df6 = pd.read_csv(
    '../../new/other_stars/symbad/suspected.csv'
)
df6.head()

Unnamed: 0,FIND_NAME,MAIN_ID,OTYPE,SP_TYPE,ID_Gaia,IDS,OTYPES,Gaia DR3
0,StHa55,EM* StHA 55,Mira,,Gaia DR3 3321366590173335424,IRAS 05440+0642|ASAS J054642+0643.7|ASAS J0546...,Mi*|LP*|V*|LP*|SB*|LP*|MIR|V*|Em*|NIR|*|C*?|IR...,3321366590173335424
1,GH Gem,V* GH Gem,Symbiotic*,,Gaia DR3 3160625132721733888,SV* SON 3566|AN 241.1943|ATO J106.0532+12.05...,Sy*|V*|NIR|V*|*|Opt,3160625132721733888
2,V503Her,V* V503 Her,Symbiotic*,,Gaia DR3 4557410314849153920,SV* P 4385|AN 170.1936|ASAS J173640+2318.2|A...,LP*|Sy*|V*|Pu*|NIR|V*|*|Opt,4557410314849153920
3,Hen2-442,Hen 2-442,PlanetaryNeb,,Gaia DR2 2022052808961769088,Hen 2-442|AKARI-IRC-V1 J1939433+262933|GSC2 N0...,PN|NIR|MIR|V*|*|IR,2022052808961769088
4,V850 Aql,V* V850 Aql,Symbiotic*,O-rich,Gaia DR2 4263728319553777408,SV* SON 4396|CSV 4646|IRAS 19210+0032|2MASS...,Sy*|Em*|Mas|AB*|LP*|NIR|PN|V*|*|IR|LP?|Mi?,4263728319553777408


In [12]:
# Join the catalogue rows with the model output on the Gaia DR3 identifier.
# An inner join already keeps only keys present in both frames, so the former
# `indicator=True` / `_merge == 'both'` filter was a no-op and has been removed.
df_filtered = df6.merge(df5, left_on='Gaia DR3', right_on='source_id', how='inner')
# Positional selection: with the columns shown in the outputs above this keeps
# FIND_NAME, MAIN_ID, OTYPE, Gaia DR3, the three class probabilities and label
# (position 8 is the duplicated `source_id` join key and is skipped).
df_filtered = df_filtered.iloc[:, [0, 1, 2, 7, 9, 10, 11, 12]]
df_filtered.head(5)

Unnamed: 0,FIND_NAME,MAIN_ID,OTYPE,Gaia DR3,0,1,2,label
0,StHa55,EM* StHA 55,Mira,3321366590173335424,0.5468,0.1763,0.2769,0
1,V503Her,V* V503 Her,Symbiotic*,4557410314849153920,0.0515,0.0305,0.918,2
2,Hen2-442,Hen 2-442,PlanetaryNeb,2022052808961769088,0.1149,0.8851,0.0,1
3,Hen2-379,PN M 1-44,PlanetaryNeb,4052553745525657600,0.0018,0.0174,0.9807,2
4,AS288,PN H 2-43,PlanetaryNeb,4050670827750135040,0.1836,0.8164,0.0,1


In [13]:
# Write the merged predictions of the unbalanced model to the other_stars output folder.
out_name = 'rna_unbalanced.csv'
out_dir = '../../new/other_stars/output'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists, unlike the exists()/mkdir() pair.
os.makedirs(out_dir, exist_ok=True)

fullname = os.path.join(out_dir, out_name)
df_filtered.to_csv(fullname, header=True, index=False)

# Suspected PN

In [3]:
# MAIN_ID values of the candidates to export separately further down.
# They are matched with an exact string comparison (`isin`), so the runs of
# spaces inside some identifiers are significant and must be kept as written.
eligible_candidates = [
    'IRAS 02379+5724',
    'UCAC2  46104304',
    'IRAS 05495+2620',
    'IRAS 06549-2330',
    'PN G228.1+00.8',
    'PN G239.3-02.7',
    'PN Y-C   40',
    'EGB  2',
    'IRAS 10348-6320',
    'IRAS 11555-6031',
    'IRAS 11415-6540',
    'PN G292.3+00.5',
    '2MASS J05221214-6949580',
    # '[RP2006] J052216-694127',
    # '[RP2006] J052254-694036',
    '[RP2006] J052438-690413',
    '[RP2006]  312',
]

## Balanced

In [4]:
import numpy as np
from sklearn.metrics import confusion_matrix

# PN-candidate table; this file is read with its header row.
df_sus_pn = pd.read_csv("../../new_v3/normalized/NP_Candidate.csv")

# Drop the first column before predicting
# (presumably the `source_id` identifier used in the next cell - confirm).
df_sus_pn_data = df_sus_pn.drop(columns=df_sus_pn.columns[0])

# Predict the class probabilities, then take the most probable class per row.
y_probs = rna_balanced_model.predict(df_sus_pn_data)
y_pred = np.argmax(y_probs, axis=-1)

# Every row is given true label 1, so row 1 of the matrix is simply the
# number of predictions that fell into each of the classes 0, 1 and 2.
cm = confusion_matrix([1] * len(df_sus_pn_data), y_pred, labels=[0, 1, 2])
print(cm)

[[ 0  0  0]
 [22 85 39]
 [ 0  0  0]]


In [5]:
# One row per prediction: probabilities rounded to 4 decimals + predicted class.
df5 = pd.DataFrame(y_probs).round(4).assign(label=y_pred)
# source_id comes from the same frame the features were taken from.
df5.insert(0, 'source_id', df_sus_pn['source_id'])
df5

Unnamed: 0,source_id,0,1,2,label
0,457887361083098880,0.8607,0.0888,0.0505,0
1,469648081051557376,0.0000,1.0000,0.0000,1
2,203511119746151040,0.9584,0.0372,0.0044,0
3,3430688759283627776,0.0000,0.9937,0.0063,1
4,2884159294765728768,0.0000,0.8965,0.1035,1
...,...,...,...,...,...
141,4229698606444795392,0.0015,0.9985,0.0000,1
142,6055339498610714624,0.0172,0.0483,0.9345,2
143,4043047642896333440,0.0000,0.5746,0.4254,1
144,6079563212927574528,0.0001,0.9999,0.0000,1


In [6]:
# Catalogue metadata for the PN candidates (joined on 'Gaia DR3' later).
df6 = pd.read_csv(
    '../../new_v3/symbad/NP_Candidate.csv'
)
df6

Unnamed: 0,MAIN_ID,OTYPE,SP_TYPE,ID_Gaia,IDS,OTYPES,RA,DEC,Gaia DR3
0,PN HaWe 3,PlanetaryNeb_Candidate,,Gaia DR2 434853485833190528,PN HaWe 3|2MASS J03163403+4653374|PK 147-09...,NIR|MIR|PN|*|PN?,03 16 34.0393,+46 53 37.297,434853485833190528
1,IRAS 02379+5724,PlanetaryNeb_Candidate,,Gaia DR2 457887361083098880,2MASS J02413591+5737379|IPHAS J024135.93+57373...,NIR|*|IR|PN?,02 41 35.9206,+57 37 38.021,457887361083098880
2,HD 237204,PlanetaryNeb_Candidate,B2III:,Gaia DR2 469648081051557376,HD 237204|AG+56 441|ALS 7847|AP J04002328+56...,Pu*|NIR|MIR|*|IR|PN?|PN?|bC?,04 00 23.2815,+56 54 05.757,469648081051557376
3,UCAC2 46104304,PlanetaryNeb_Candidate,,Gaia DR2 203511119746151040,UCAC2 46104304|IPHAS J044338.64+410941.5|IRAS...,Em*|NIR|*|IR|Opt|PN?,04 43 38.6416,+41 09 41.460,203511119746151040
4,IRAS 05345+2657,PlanetaryNeb_Candidate,,Gaia DR2 3441293067898788992,AKARI-IRC-V1 J0537424+265913|AKARI-FIS-V1 J053...,MIR|*|G|IR|PN?|PN?,05 37 42.3456,+26 59 14.047,3441293067898788992
...,...,...,...,...,...,...,...,...,...
366,IRAS 12481-4903,PlanetaryNeb_Candidate,,Gaia DR2 6079563212927574528,TYC 8253-562-1|AKARI-IRC-V1 J1250577-491951|GA...,MIR|NIR|*|*|G|IR|PN?|PN?,12 50 57.7794,-49 19 52.021,6079563212927574528
367,IRAS 17532-3045,PlanetaryNeb_Candidate,,Gaia DR3 4056100078497762176,UCAC4 297-134949|AKARI-IRC-V1 J1756302-304522|...,MIR|NIR|*|IR|PN?,17 56 30.2700,-30 45 22.394,4056100078497762176
368,IRAS 18321-1401,PlanetaryNeb_Candidate,,Gaia DR2 4104509513893397760,UCAC4 381-119111|AKARI-IRC-V1 J1834570-135848|...,V*|MIR|NIR|*|IR|Opt|PN?|PN?,18 34 56.9988,-13 58 48.905,4104509513893397760
369,IRAS 18340-1302,PlanetaryNeb_Candidate,,Gaia DR3 4104727698224345728,UCAC4 386-108329|AKARI-IRC-V1 J1836531-125922|...,NIR|*|IR|LP?|PN?,18 36 53.0785,-12 59 23.299,4104727698224345728


In [7]:
# Join the catalogue rows with the model output on the Gaia DR3 identifier.
# An inner join already keeps only keys present in both frames, so the former
# `indicator=True` / `_merge == 'both'` filter was a no-op and has been removed.
df_filtered = df6.merge(df5, left_on='Gaia DR3', right_on='source_id', how='inner')
# Positional selection: with the columns shown in the outputs above this keeps
# MAIN_ID, OTYPE, SP_TYPE, RA, DEC, Gaia DR3, the three class probabilities and
# label (position 9 is the duplicated `source_id` join key and is skipped).
df_filtered = df_filtered.iloc[:, [0, 1, 2, 6, 7, 8, 10, 11, 12, 13]]
df_filtered

Unnamed: 0,MAIN_ID,OTYPE,SP_TYPE,RA,DEC,Gaia DR3,0,1,2,label
0,IRAS 02379+5724,PlanetaryNeb_Candidate,,02 41 35.9206,+57 37 38.021,457887361083098880,0.8607,0.0888,0.0505,0
1,HD 237204,PlanetaryNeb_Candidate,B2III:,04 00 23.2815,+56 54 05.757,469648081051557376,0.0000,1.0000,0.0000,1
2,UCAC2 46104304,PlanetaryNeb_Candidate,,04 43 38.6416,+41 09 41.460,203511119746151040,0.9584,0.0372,0.0044,0
3,IRAS 05495+2620,PlanetaryNeb_Candidate,,05 52 42.8154,+26 21 16.018,3430688759283627776,0.0000,0.9937,0.0063,1
4,PN K 2-12,PlanetaryNeb_Candidate,,06 02 06.8725,-37 25 25.207,2884159294765728768,0.0000,0.8965,0.1035,1
...,...,...,...,...,...,...,...,...,...,...
141,NAME PN LDu 26,PlanetaryNeb_Candidate,,20 35 22.7732,+00 12 51.090,4229698606444795392,0.0015,0.9985,0.0000,1
142,WRAY 17-57,PlanetaryNeb_Candidate,,12 53 16.3034,-61 45 26.431,6055339498610714624,0.0172,0.0483,0.9345,2
143,IRAS 18020-3207,PlanetaryNeb_Candidate,,18 05 19.1339,-32 06 59.315,4043047642896333440,0.0000,0.5746,0.4254,1
144,IRAS 12481-4903,PlanetaryNeb_Candidate,,12 50 57.7794,-49 19 52.021,6079563212927574528,0.0001,0.9999,0.0000,1


In [8]:
# Write the merged predictions of the balanced model to the new_v3 output folder.
out_name = 'rna_balanced.csv'
out_dir = '../../new_v3/output'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists, unlike the exists()/mkdir() pair.
os.makedirs(out_dir, exist_ok=True)

fullname = os.path.join(out_dir, out_name)
df_filtered.to_csv(fullname, header=True, index=False)

In [9]:
# Keep only the rows whose MAIN_ID appears in `eligible_candidates`
# (exact string match) and renumber them from 0.
eligible_np = (
    df_filtered
    .loc[df_filtered['MAIN_ID'].isin(eligible_candidates)]
    .reset_index(drop=True)
)

out_name = 'rna_balanced_elegible_np_candidates.csv'
out_dir = '../../new_v3/output'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists, unlike the exists()/mkdir() pair.
os.makedirs(out_dir, exist_ok=True)

fullname = os.path.join(out_dir, out_name)
eligible_np.to_csv(fullname, header=True, index=False)

## Unbalanced

In [10]:
import numpy as np
from sklearn.metrics import confusion_matrix

# PN-candidate table; this file is read with its header row.
df_sus_pn = pd.read_csv("../../new_v3/normalized/NP_Candidate.csv")

# Drop the first column before predicting
# (presumably the `source_id` identifier used in the next cell - confirm).
df_sus_pn_data = df_sus_pn.drop(columns=df_sus_pn.columns[0])

# Predict the class probabilities, then take the most probable class per row.
y_probs = rna_unbalanced_model.predict(df_sus_pn_data)
y_pred = np.argmax(y_probs, axis=-1)

# Every row is given true label 1, so row 1 of the matrix is simply the
# number of predictions that fell into each of the classes 0, 1 and 2.
cm = confusion_matrix([1] * len(df_sus_pn_data), y_pred, labels=[0, 1, 2])
print(cm)

[[ 0  0  0]
 [20 74 52]
 [ 0  0  0]]


In [11]:
# One row per prediction: probabilities rounded to 4 decimals + predicted class.
df5 = pd.DataFrame(y_probs).round(4).assign(label=y_pred)
# source_id comes from the same frame the features were taken from.
df5.insert(0, 'source_id', df_sus_pn['source_id'])
df5

Unnamed: 0,source_id,0,1,2,label
0,457887361083098880,0.0204,0.1176,0.8621,2
1,469648081051557376,0.0061,0.9939,0.0000,1
2,203511119746151040,0.5947,0.3960,0.0093,0
3,3430688759283627776,0.0001,0.9979,0.0021,1
4,2884159294765728768,0.0005,0.1286,0.8709,2
...,...,...,...,...,...
141,4229698606444795392,0.0039,0.9961,0.0000,1
142,6055339498610714624,0.0064,0.0353,0.9583,2
143,4043047642896333440,0.0009,0.2185,0.7806,2
144,6079563212927574528,0.0142,0.9858,0.0000,1


In [12]:
# Catalogue metadata for the PN candidates (joined on 'Gaia DR3' later).
df6 = pd.read_csv(
    '../../new_v3/symbad/NP_Candidate.csv'
)
df6

Unnamed: 0,MAIN_ID,OTYPE,SP_TYPE,ID_Gaia,IDS,OTYPES,RA,DEC,Gaia DR3
0,PN HaWe 3,PlanetaryNeb_Candidate,,Gaia DR2 434853485833190528,PN HaWe 3|2MASS J03163403+4653374|PK 147-09...,NIR|MIR|PN|*|PN?,03 16 34.0393,+46 53 37.297,434853485833190528
1,IRAS 02379+5724,PlanetaryNeb_Candidate,,Gaia DR2 457887361083098880,2MASS J02413591+5737379|IPHAS J024135.93+57373...,NIR|*|IR|PN?,02 41 35.9206,+57 37 38.021,457887361083098880
2,HD 237204,PlanetaryNeb_Candidate,B2III:,Gaia DR2 469648081051557376,HD 237204|AG+56 441|ALS 7847|AP J04002328+56...,Pu*|NIR|MIR|*|IR|PN?|PN?|bC?,04 00 23.2815,+56 54 05.757,469648081051557376
3,UCAC2 46104304,PlanetaryNeb_Candidate,,Gaia DR2 203511119746151040,UCAC2 46104304|IPHAS J044338.64+410941.5|IRAS...,Em*|NIR|*|IR|Opt|PN?,04 43 38.6416,+41 09 41.460,203511119746151040
4,IRAS 05345+2657,PlanetaryNeb_Candidate,,Gaia DR2 3441293067898788992,AKARI-IRC-V1 J0537424+265913|AKARI-FIS-V1 J053...,MIR|*|G|IR|PN?|PN?,05 37 42.3456,+26 59 14.047,3441293067898788992
...,...,...,...,...,...,...,...,...,...
366,IRAS 12481-4903,PlanetaryNeb_Candidate,,Gaia DR2 6079563212927574528,TYC 8253-562-1|AKARI-IRC-V1 J1250577-491951|GA...,MIR|NIR|*|*|G|IR|PN?|PN?,12 50 57.7794,-49 19 52.021,6079563212927574528
367,IRAS 17532-3045,PlanetaryNeb_Candidate,,Gaia DR3 4056100078497762176,UCAC4 297-134949|AKARI-IRC-V1 J1756302-304522|...,MIR|NIR|*|IR|PN?,17 56 30.2700,-30 45 22.394,4056100078497762176
368,IRAS 18321-1401,PlanetaryNeb_Candidate,,Gaia DR2 4104509513893397760,UCAC4 381-119111|AKARI-IRC-V1 J1834570-135848|...,V*|MIR|NIR|*|IR|Opt|PN?|PN?,18 34 56.9988,-13 58 48.905,4104509513893397760
369,IRAS 18340-1302,PlanetaryNeb_Candidate,,Gaia DR3 4104727698224345728,UCAC4 386-108329|AKARI-IRC-V1 J1836531-125922|...,NIR|*|IR|LP?|PN?,18 36 53.0785,-12 59 23.299,4104727698224345728


In [13]:
# Join the catalogue rows with the model output on the Gaia DR3 identifier.
# An inner join already keeps only keys present in both frames, so the former
# `indicator=True` / `_merge == 'both'` filter was a no-op and has been removed.
df_filtered = df6.merge(df5, left_on='Gaia DR3', right_on='source_id', how='inner')
# Positional selection: with the columns shown in the outputs above this keeps
# MAIN_ID, OTYPE, SP_TYPE, RA, DEC, Gaia DR3, the three class probabilities and
# label (position 9 is the duplicated `source_id` join key and is skipped).
df_filtered = df_filtered.iloc[:, [0, 1, 2, 6, 7, 8, 10, 11, 12, 13]]
df_filtered

Unnamed: 0,MAIN_ID,OTYPE,SP_TYPE,RA,DEC,Gaia DR3,0,1,2,label
0,IRAS 02379+5724,PlanetaryNeb_Candidate,,02 41 35.9206,+57 37 38.021,457887361083098880,0.0204,0.1176,0.8621,2
1,HD 237204,PlanetaryNeb_Candidate,B2III:,04 00 23.2815,+56 54 05.757,469648081051557376,0.0061,0.9939,0.0000,1
2,UCAC2 46104304,PlanetaryNeb_Candidate,,04 43 38.6416,+41 09 41.460,203511119746151040,0.5947,0.3960,0.0093,0
3,IRAS 05495+2620,PlanetaryNeb_Candidate,,05 52 42.8154,+26 21 16.018,3430688759283627776,0.0001,0.9979,0.0021,1
4,PN K 2-12,PlanetaryNeb_Candidate,,06 02 06.8725,-37 25 25.207,2884159294765728768,0.0005,0.1286,0.8709,2
...,...,...,...,...,...,...,...,...,...,...
141,NAME PN LDu 26,PlanetaryNeb_Candidate,,20 35 22.7732,+00 12 51.090,4229698606444795392,0.0039,0.9961,0.0000,1
142,WRAY 17-57,PlanetaryNeb_Candidate,,12 53 16.3034,-61 45 26.431,6055339498610714624,0.0064,0.0353,0.9583,2
143,IRAS 18020-3207,PlanetaryNeb_Candidate,,18 05 19.1339,-32 06 59.315,4043047642896333440,0.0009,0.2185,0.7806,2
144,IRAS 12481-4903,PlanetaryNeb_Candidate,,12 50 57.7794,-49 19 52.021,6079563212927574528,0.0142,0.9858,0.0000,1


In [14]:
# Write the merged predictions of the unbalanced model to the new_v3 output folder.
out_name = 'rna_unbalanced.csv'
out_dir = '../../new_v3/output'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists, unlike the exists()/mkdir() pair.
os.makedirs(out_dir, exist_ok=True)

fullname = os.path.join(out_dir, out_name)
df_filtered.to_csv(fullname, header=True, index=False)

In [15]:
# Keep only the rows whose MAIN_ID appears in `eligible_candidates`
# (exact string match) and renumber them from 0.
eligible_np = (
    df_filtered
    .loc[df_filtered['MAIN_ID'].isin(eligible_candidates)]
    .reset_index(drop=True)
)

out_name = 'rna_unbalanced_elegible_np_candidates.csv'
out_dir = '../../new_v3/output'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory already exists, unlike the exists()/mkdir() pair.
os.makedirs(out_dir, exist_ok=True)

fullname = os.path.join(out_dir, out_name)
eligible_np.to_csv(fullname, header=True, index=False)