In [6]:
#| default_exp tycho2_ingest

# From Tycho2 source to parquet file
> converting Tycho2 source material to a smaller parquet file

In [7]:
#| hide
from nbdev.showdoc import *

##  getting the data

- Column format description: http://tdc-www.harvard.edu/catalogs/tycho2.format.html
- Catalogue files at CDS (I/259): https://cdsarc.cds.unistra.fr/cgi-bin/myqcat3?I/259/

## which epoch?

J2000

## Import Tycho2 dat file

Import the dependencies.

In [8]:
import polars as pl

In [9]:
#| export
def read_tycho2(filename):
    """Load the pipe-separated Tycho-2 catalogue file `filename` into a polars DataFrame.

    The file has no header row, so the 32 columns are named from `labels`
    below. The mean-position columns (`RAmdeg`, `DEmdeg`) and the BT/VT
    photometry columns are read as Float32 and `HIPCCDM` as a string; all
    other columns keep whatever dtype polars infers.

    Returns the resulting DataFrame, one row per line of the file.
    """
    # Imported inside the function so the exported module does not depend on
    # the notebook's separate import cell, which is not marked for export.
    import polars as pl

    labels = [
        "TYC123", "pflag", "RAmdeg", "DEmdeg", "pmRA", "pmDE",
        "e_RAmdeg", "e_DEmdeg", "e_pmRA", "e_pmDE", "EpRAm", "EpDEm",
        "Num", "q_RAmdeg", "q_DEmdeg", "q_pmRA", "q_pmDE", "BTmag",
        "e_BTmag", "VTmag", "e_VTmag", "prox", "TYC", "HIPCCDM",
        "RAdeg", "DEdeg", "EpRA-1990", "EpDE-1990", "e_RAdeg", "e_DEdeg",
        "posflg", "corr",
    ]

    # Explicit dtypes for the columns that are kept downstream.
    float32_columns = ("RAmdeg", "DEmdeg", "BTmag", "e_BTmag", "VTmag", "e_VTmag")
    dtypes = {name: pl.Float32 for name in float32_columns}
    dtypes["HIPCCDM"] = pl.Utf8

    return pl.read_csv(
        filename,
        separator="|",
        has_header=False,
        new_columns=labels,
        dtypes=dtypes,
    )

call the read function

In [11]:
# Load the raw catalogue, then summarise every column (dtype, null count, range).
df = read_tycho2("../support/tyc2.dat")
df.describe()

describe,TYC123,pflag,RAmdeg,DEmdeg,pmRA,pmDE,e_RAmdeg,e_DEmdeg,e_pmRA,e_pmDE,EpRAm,EpDEm,Num,q_RAmdeg,q_DEmdeg,q_pmRA,q_pmDE,BTmag,e_BTmag,VTmag,e_VTmag,prox,TYC,HIPCCDM,RAdeg,DEdeg,EpRA-1990,EpDE-1990,e_RAdeg,e_DEdeg,posflg,corr
str,str,str,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,str,f64
"""count""","""2539913""","""2539913""",2539913.0,2539913.0,"""2539913""","""2539913""","""2539913""","""2539913""","""2539913""","""2539913""","""2539913""","""2539913""","""2539913""","""2539913""","""2539913""","""2539913""","""2539913""",2539913.0,2539913.0,2539913.0,2539913.0,2539913.0,"""2539913""","""2539913""",2539913.0,2539913.0,2539913.0,2539913.0,2539913.0,2539913.0,"""2539913""",2539913.0
"""null_count""","""0""","""0""",109445.0,109445.0,"""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""",91.0,91.0,20.0,20.0,0.0,"""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,"""0""",0.0
"""mean""",,,188.530502,-3.479685,,,,,,,,,,,,,,12.015463,0.166654,11.177034,0.110939,921.230184,,,189.154943,-3.187942,1.691832,1.647907,63.566338,70.912997,,-0.00463
"""std""",,,100.340485,41.43108,,,,,,,,,,,,,,1.187064,0.12153,1.095533,0.070206,193.389428,,,100.321089,41.735132,0.111881,0.150998,36.611703,41.654617,,0.147292
"""min""","""0001 00008 1""",""" """,0.000339,-89.889664,""" """,""" """,""" """,""" """,""" """,""" """,""" """,""" """,""" """,""" """,""" """,""" """,""" """,2.183,0.014,1.905,0.009,3.0,""" """,""" """,0.000286,-89.889639,0.81,0.72,3.1,3.1,""" """,-0.9
"""25%""",,,105.471535,-39.00515,,,,,,,,,,,,,,11.487,0.067,10.672,0.053,999.0,,,105.74392,-39.209088,1.62,1.58,35.1,38.2,,-0.1
"""50%""",,,186.052124,-4.889748,,,,,,,,,,,,,,12.183,0.143,11.411,0.1,999.0,,,188.037015,-4.073136,1.69,1.67,61.3,67.0,,0.0
"""75%""",,,280.987732,31.3225,,,,,,,,,,,,,,12.788,0.243,11.924,0.163,999.0,,,281.426236,32.231936,1.77,1.74,87.0,98.2,,0.1
"""max""","""9537 00387 1""","""X""",359.999878,89.832336,"""-4418.0""","""10277.3""","""183""","""184""","""11.5""","""10.3""","""1992.53""","""1992.01""","""36""","""9.9""","""9.9""","""9.9""","""9.9""",16.580999,1.977,15.193,1.468,999.0,"""T""","""120404 """,359.999908,89.832322,2.13,2.36,200.0,200.0,"""P""",0.8


## Drop unneeded columns

In [12]:
def drop_columns(df):
    """Return `df` without the catalogue columns that are not needed downstream.

    For a frame produced by `read_tycho2` this leaves the mean position
    (`RAmdeg`, `DEmdeg`), the BT/VT magnitudes with their errors, and
    `HIPCCDM`.
    """
    unneeded = [
        # identifier and mean-position bookkeeping
        "TYC123", "pflag", "pmRA", "pmDE",
        "e_RAmdeg", "e_DEmdeg", "e_pmRA", "e_pmDE", "EpRAm", "EpDEm",
        "Num", "q_RAmdeg", "q_DEmdeg", "q_pmRA", "q_pmDE", "prox", "TYC",
        # observed-position columns and flags
        "RAdeg", "DEdeg", "EpRA-1990", "EpDE-1990", "e_RAdeg", "e_DEdeg",
        "posflg", "corr",
    ]
    return df.drop(columns=unneeded)
# Trim the frame to the columns kept for the parquet file and show the result.
df = drop_columns(df)
print(df)

shape: (2_539_913, 7)
┌────────────┬────────────┬────────┬─────────┬────────┬─────────┬───────────┐
│ RAmdeg     ┆ DEmdeg     ┆ BTmag  ┆ e_BTmag ┆ VTmag  ┆ e_VTmag ┆ HIPCCDM   │
│ ---        ┆ ---        ┆ ---    ┆ ---     ┆ ---    ┆ ---     ┆ ---       │
│ f32        ┆ f32        ┆ f32    ┆ f32     ┆ f32    ┆ f32     ┆ str       │
╞════════════╪════════════╪════════╪═════════╪════════╪═════════╪═══════════╡
│ 2.317505   ┆ 2.231843   ┆ 12.146 ┆ 0.158   ┆ 12.146 ┆ 0.223   ┆           │
│ 1.125582   ┆ 2.267394   ┆ 10.488 ┆ 0.038   ┆ 8.67   ┆ 0.015   ┆           │
│ 1.056865   ┆ 1.897829   ┆ 12.921 ┆ 0.335   ┆ 12.1   ┆ 0.243   ┆           │
│ 0.050598   ┆ 1.771443   ┆ 11.318 ┆ 0.07    ┆ 10.521 ┆ 0.051   ┆           │
│ …          ┆ …          ┆ …      ┆ …       ┆ …      ┆ …       ┆ …         │
│ 345.76767  ┆ -88.284042 ┆ 13.108 ┆ 0.271   ┆ 12.48  ┆ 0.196   ┆           │
│ 341.197632 ┆ -88.538872 ┆ 13.147 ┆ 0.287   ┆ 12.158 ┆ 0.177   ┆           │
│ 337.975433 ┆ -88.762932 ┆ 10.99  ┆ 0.048

## Clean the data

In [13]:
# Report the missing values per column, then show the rows lacking each magnitude.
print("Null count before cleaning:\n", df.null_count())
for mag_col in ("BTmag", "VTmag"):
    print(df.filter(pl.col(mag_col).is_null()))

# A row with only one of the two magnitudes borrows the value of the other:
# first BTmag from VTmag, then VTmag from the (already filled) BTmag.
df = df.with_columns(pl.col("BTmag").fill_null(pl.col("VTmag")))
df = df.with_columns(pl.col("VTmag").fill_null(pl.col("BTmag")))

# Keep only rows that have both mean-position coordinates.
df = df.drop_nulls(subset=["RAmdeg", "DEmdeg"])

print(df)
print("Null count after cleaning:\n", df.null_count())

Null count before cleaning:
 shape: (1, 7)
┌────────┬────────┬───────┬─────────┬───────┬─────────┬─────────┐
│ RAmdeg ┆ DEmdeg ┆ BTmag ┆ e_BTmag ┆ VTmag ┆ e_VTmag ┆ HIPCCDM │
│ ---    ┆ ---    ┆ ---   ┆ ---     ┆ ---   ┆ ---     ┆ ---     │
│ u32    ┆ u32    ┆ u32   ┆ u32     ┆ u32   ┆ u32     ┆ u32     │
╞════════╪════════╪═══════╪═════════╪═══════╪═════════╪═════════╡
│ 109445 ┆ 109445 ┆ 91    ┆ 91      ┆ 20    ┆ 20      ┆ 0       │
└────────┴────────┴───────┴─────────┴───────┴─────────┴─────────┘
shape: (91, 7)
┌────────────┬────────────┬───────┬─────────┬────────┬─────────┬───────────┐
│ RAmdeg     ┆ DEmdeg     ┆ BTmag ┆ e_BTmag ┆ VTmag  ┆ e_VTmag ┆ HIPCCDM   │
│ ---        ┆ ---        ┆ ---   ┆ ---     ┆ ---    ┆ ---     ┆ ---       │
│ f32        ┆ f32        ┆ f32   ┆ f32     ┆ f32    ┆ f32     ┆ str       │
╞════════════╪════════════╪═══════╪═════════╪════════╪═════════╪═══════════╡
│ 63.942772  ┆ 3.26324    ┆ null  ┆ null    ┆ 11.189 ┆ 0.131   ┆           │
│ null       ┆ nul

## Calculate real Visual magnitude

### V = VT - 0.090 * (BT - VT)
This is the approximate conversion from the Tycho BT/VT magnitudes to Johnson V given in the Tycho-2 documentation.

In [46]:
# Derive the visual magnitude column: Vmag = VT - 0.090 * (BT - VT).
vt_mag = df["VTmag"]
bt_minus_vt = df["BTmag"] - vt_mag
df = df.with_columns((vt_mag - 0.090 * bt_minus_vt).alias("Vmag"))

print(df)

shape: (2_430_468, 8)
┌────────────┬────────────┬────────┬─────────┬────────┬─────────┬───────────┬───────────┐
│ RAmdeg     ┆ DEmdeg     ┆ BTmag  ┆ e_BTmag ┆ VTmag  ┆ e_VTmag ┆ HIPCCDM   ┆ Vmag      │
│ ---        ┆ ---        ┆ ---    ┆ ---     ┆ ---    ┆ ---     ┆ ---       ┆ ---       │
│ f32        ┆ f32        ┆ f32    ┆ f32     ┆ f32    ┆ f32     ┆ str       ┆ f32       │
╞════════════╪════════════╪════════╪═════════╪════════╪═════════╪═══════════╪═══════════╡
│ 2.317505   ┆ 2.231843   ┆ 12.146 ┆ 0.158   ┆ 12.146 ┆ 0.223   ┆           ┆ 12.146    │
│ 1.125582   ┆ 2.267394   ┆ 10.488 ┆ 0.038   ┆ 8.67   ┆ 0.015   ┆           ┆ 8.50638   │
│ 1.056865   ┆ 1.897829   ┆ 12.921 ┆ 0.335   ┆ 12.1   ┆ 0.243   ┆           ┆ 12.026111 │
│ 0.050598   ┆ 1.771443   ┆ 11.318 ┆ 0.07    ┆ 10.521 ┆ 0.051   ┆           ┆ 10.44927  │
│ …          ┆ …          ┆ …      ┆ …       ┆ …      ┆ …       ┆ …         ┆ …         │
│ 345.76767  ┆ -88.284042 ┆ 13.108 ┆ 0.271   ┆ 12.48  ┆ 0.196   ┆           ┆ 

## Write parquet file

In [47]:
# Persist the cleaned frame, including the derived Vmag column, as parquet.
# NOTE(review): the raw catalogue is read from '../support/' while this writes
# to './support/' — confirm that both directories are intended.
df.write_parquet("./support/tyc2.parquet")

## Read parquet file to test

In [48]:
# Round trip: reload the parquet file that was just written and show it.
df = pl.read_parquet("./support/tyc2.parquet")
print(df)

shape: (2_430_468, 8)
┌────────────┬────────────┬────────┬─────────┬────────┬─────────┬───────────┬───────────┐
│ RAmdeg     ┆ DEmdeg     ┆ BTmag  ┆ e_BTmag ┆ VTmag  ┆ e_VTmag ┆ HIPCCDM   ┆ Vmag      │
│ ---        ┆ ---        ┆ ---    ┆ ---     ┆ ---    ┆ ---     ┆ ---       ┆ ---       │
│ f32        ┆ f32        ┆ f32    ┆ f32     ┆ f32    ┆ f32     ┆ str       ┆ f32       │
╞════════════╪════════════╪════════╪═════════╪════════╪═════════╪═══════════╪═══════════╡
│ 2.317505   ┆ 2.231843   ┆ 12.146 ┆ 0.158   ┆ 12.146 ┆ 0.223   ┆           ┆ 12.146    │
│ 1.125582   ┆ 2.267394   ┆ 10.488 ┆ 0.038   ┆ 8.67   ┆ 0.015   ┆           ┆ 8.50638   │
│ 1.056865   ┆ 1.897829   ┆ 12.921 ┆ 0.335   ┆ 12.1   ┆ 0.243   ┆           ┆ 12.026111 │
│ 0.050598   ┆ 1.771443   ┆ 11.318 ┆ 0.07    ┆ 10.521 ┆ 0.051   ┆           ┆ 10.44927  │
│ …          ┆ …          ┆ …      ┆ …       ┆ …      ┆ …       ┆ …         ┆ …         │
│ 345.76767  ┆ -88.284042 ┆ 13.108 ┆ 0.271   ┆ 12.48  ┆ 0.196   ┆           ┆ 

In [1]:
#| hide
# Write the cells marked `#| export` out to the library module.
import nbdev
nbdev.nbdev_export()