In [1]:
# Import requisite libraries
import dotenv
dotenv.load_dotenv()
import json
import numpy as np
import os
import pandas as pd
from uuid import uuid4

In [2]:
# Location of the source CSV, taken from the DATA_PATH environment variable
# (evaluates to None when the variable is not set)
DATA_PATH = os.getenv("DATA_PATH")

In [3]:
# Read the CSV once and keep the untouched frame in df_raw;
# the cleaning steps operate on an independent copy (copy() is deep by default)
df_raw = pd.read_csv(DATA_PATH)
df = df_raw.copy()
df.head()

Unnamed: 0,id,name,absolute_magnitude_h,estimated_diameter_min_km,estimated_diameter_max_km,is_potentially_hazardous_asteroid,orbit_id,orbit_determination_date,first_observation_date,last_observation_date,semi_major_axis,inclination,orbit_class_type
0,id,name,absolute_magnitude_h,estimated_diameter_min_km,estimated_diameter_max_km,is_potentially_hazardous_asteroid,orbit_id,orbit_determination_date,first_observation_date,last_observation_date,semi_major_axis,inclination,orbit_class_type
1,2000433,433 Eros (A898 PA),10.41,22.0067027115,49.2084832235,False,659,2021-05-24 17:55:05,1893-10-29,2021-05-13,1.45815896084448,10.82830761253864,AMO
2,2000719,719 Albert (A911 TB),15.59,2.0256060086,4.529392731,False,270,2025-02-28 05:16:36,1911-10-04,2025-02-28,2.636355360832282,11.57399177654622,AMO
3,2000887,887 Alinda (A918 AA),13.8,4.6190746028,10.328564805,False,608,2025-03-01 05:16:49,1918-02-09,2025-03-01,2.473267631334386,9.399886845047668,AMO
4,2001036,1036 Ganymed (A924 UB),9.18,38.7752830381,86.70416872,False,1361,2025-03-01 05:16:50,1924-10-23,2025-02-28,2.665800727730635,26.68214802212515,AMO


In [4]:
# Remove first row that repeats column names.
# The cleaned frame is derived from df_raw without in-place mutation so that
# re-running this cell always gives the same result; an in-place drop of
# label 0 raises KeyError the second time it is executed.
# NOTE(review): because of that extra header row the columns are presumably
# all read as strings — confirm before relying on numeric dtypes downstream.
df = df_raw.drop(index=0)
df.head()

Unnamed: 0,id,name,absolute_magnitude_h,estimated_diameter_min_km,estimated_diameter_max_km,is_potentially_hazardous_asteroid,orbit_id,orbit_determination_date,first_observation_date,last_observation_date,semi_major_axis,inclination,orbit_class_type
1,2000433,433 Eros (A898 PA),10.41,22.0067027115,49.2084832235,False,659,2021-05-24 17:55:05,1893-10-29,2021-05-13,1.45815896084448,10.82830761253864,AMO
2,2000719,719 Albert (A911 TB),15.59,2.0256060086,4.529392731,False,270,2025-02-28 05:16:36,1911-10-04,2025-02-28,2.636355360832282,11.57399177654622,AMO
3,2000887,887 Alinda (A918 AA),13.8,4.6190746028,10.328564805,False,608,2025-03-01 05:16:49,1918-02-09,2025-03-01,2.473267631334386,9.399886845047668,AMO
4,2001036,1036 Ganymed (A924 UB),9.18,38.7752830381,86.70416872,False,1361,2025-03-01 05:16:50,1924-10-23,2025-02-28,2.665800727730635,26.68214802212515,AMO
5,2001221,1221 Amor (1932 EA1),17.37,0.8923905787,1.9954459964,False,143,2025-02-20 05:51:07,1932-03-12,2025-02-19,1.919760465125785,11.86862228590597,AMO


In [5]:
# Tally of missing values per column, computed on the frame as loaded
null_counts = df_raw.isnull().sum()
null_counts

id                                   0
name                                 0
absolute_magnitude_h                 0
estimated_diameter_min_km            0
estimated_diameter_max_km            0
is_potentially_hazardous_asteroid    0
orbit_id                             0
orbit_determination_date             0
first_observation_date               0
last_observation_date                0
semi_major_axis                      0
inclination                          0
orbit_class_type                     0
dtype: int64

In [6]:
# Show every column name on a single comma-separated line
print(", ".join(df.columns))

id, name, absolute_magnitude_h, estimated_diameter_min_km, estimated_diameter_max_km, is_potentially_hazardous_asteroid, orbit_id, orbit_determination_date, first_observation_date, last_observation_date, semi_major_axis, inclination, orbit_class_type


In [7]:
# Shorten a few verbose column names (old name -> new name)
column_renames = {
    "estimated_diameter_min_km": "estimated_diameter_min",
    "estimated_diameter_max_km": "estimated_diameter_max",
    "is_potentially_hazardous_asteroid": "is_potentially_hazardous",
}
df = df.rename(columns=column_renames)
df

Unnamed: 0,id,name,absolute_magnitude_h,estimated_diameter_min,estimated_diameter_max,is_potentially_hazardous,orbit_id,orbit_determination_date,first_observation_date,last_observation_date,semi_major_axis,inclination,orbit_class_type
1,2000433,433 Eros (A898 PA),10.41,22.0067027115,49.2084832235,False,659,2021-05-24 17:55:05,1893-10-29,2021-05-13,1.45815896084448,10.82830761253864,AMO
2,2000719,719 Albert (A911 TB),15.59,2.0256060086,4.529392731,False,270,2025-02-28 05:16:36,1911-10-04,2025-02-28,2.636355360832282,11.57399177654622,AMO
3,2000887,887 Alinda (A918 AA),13.8,4.6190746028,10.328564805,False,608,2025-03-01 05:16:49,1918-02-09,2025-03-01,2.473267631334386,9.399886845047668,AMO
4,2001036,1036 Ganymed (A924 UB),9.18,38.7752830381,86.70416872,False,1361,2025-03-01 05:16:50,1924-10-23,2025-02-28,2.665800727730635,26.68214802212515,AMO
5,2001221,1221 Amor (1932 EA1),17.37,0.8923905787,1.9954459964,False,143,2025-02-20 05:51:07,1932-03-12,2025-02-19,1.919760465125785,11.86862228590597,AMO
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7096,3484928,(2010 AH3),26.3,0.0146067964,0.0326617897,False,9,2021-04-15 03:21:30,2010-01-08,2010-01-14,1.661462328032299,2.001210420281934,APO
7097,3484929,(2010 AJ3),24.5,0.0334622374,0.0748238376,False,11,2021-04-15 03:21:31,2010-01-08,2010-01-17,1.225501486737203,16.93316333002113,APO
7098,3485259,(2010 AE30),23.6,0.0506471459,0.1132504611,False,21,2021-04-15 03:22:01,2009-12-18,2020-01-13,1.267623303629749,5.966991496469769,APO
7099,3485260,(2010 AF30),21.73,0.1198270801,0.2679414966,True,27,2024-11-11 05:24:13,2010-01-10,2024-11-11,1.325203892099259,3.058187130506377,APO


In [8]:
# Reformat dates to include time
def add_time_to_date(a):
    """Return ``a`` rendered as text with a midnight time component appended."""
    return "{} 00:00:00".format(a)
    
# Give both observation-date columns a midnight time component.
# Note: running this cell again appends the suffix a second time.
for date_column in ("first_observation_date", "last_observation_date"):
    df[date_column] = df[date_column].map(add_time_to_date)
df

Unnamed: 0,id,name,absolute_magnitude_h,estimated_diameter_min,estimated_diameter_max,is_potentially_hazardous,orbit_id,orbit_determination_date,first_observation_date,last_observation_date,semi_major_axis,inclination,orbit_class_type
1,2000433,433 Eros (A898 PA),10.41,22.0067027115,49.2084832235,False,659,2021-05-24 17:55:05,1893-10-29 00:00:00,2021-05-13 00:00:00,1.45815896084448,10.82830761253864,AMO
2,2000719,719 Albert (A911 TB),15.59,2.0256060086,4.529392731,False,270,2025-02-28 05:16:36,1911-10-04 00:00:00,2025-02-28 00:00:00,2.636355360832282,11.57399177654622,AMO
3,2000887,887 Alinda (A918 AA),13.8,4.6190746028,10.328564805,False,608,2025-03-01 05:16:49,1918-02-09 00:00:00,2025-03-01 00:00:00,2.473267631334386,9.399886845047668,AMO
4,2001036,1036 Ganymed (A924 UB),9.18,38.7752830381,86.70416872,False,1361,2025-03-01 05:16:50,1924-10-23 00:00:00,2025-02-28 00:00:00,2.665800727730635,26.68214802212515,AMO
5,2001221,1221 Amor (1932 EA1),17.37,0.8923905787,1.9954459964,False,143,2025-02-20 05:51:07,1932-03-12 00:00:00,2025-02-19 00:00:00,1.919760465125785,11.86862228590597,AMO
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7096,3484928,(2010 AH3),26.3,0.0146067964,0.0326617897,False,9,2021-04-15 03:21:30,2010-01-08 00:00:00,2010-01-14 00:00:00,1.661462328032299,2.001210420281934,APO
7097,3484929,(2010 AJ3),24.5,0.0334622374,0.0748238376,False,11,2021-04-15 03:21:31,2010-01-08 00:00:00,2010-01-17 00:00:00,1.225501486737203,16.93316333002113,APO
7098,3485259,(2010 AE30),23.6,0.0506471459,0.1132504611,False,21,2021-04-15 03:22:01,2009-12-18 00:00:00,2020-01-13 00:00:00,1.267623303629749,5.966991496469769,APO
7099,3485260,(2010 AF30),21.73,0.1198270801,0.2679414966,True,27,2024-11-11 05:24:13,2010-01-10 00:00:00,2024-11-11 00:00:00,1.325203892099259,3.058187130506377,APO


In [9]:
# Alphabetically ordered list of the distinct orbit classes present in the data
orbit_class_types = sorted(df["orbit_class_type"].drop_duplicates())
orbit_class_types

['AMO', 'APO', 'ATE', 'IEO']

In [None]:
# Define a dataframe for orbit class types.
# Metadata is keyed by abbreviation so that every class receives its own name,
# description and colour regardless of how many classes the data contains or
# the order in which they are listed.
orbit_class_info = {
    "AMO": {
        "name": "Amor-class Asteroid",
        "description": "These asteroids have orbits that fall strictly outside the Earth's orbit.",
        "colour": "#4287F5",  # BLUE
    },
    "APO": {
        "name": "Apollo-class Asteroid",
        "description": "These asteroids have orbits that cross the Earth's orbit and have a semi-major axis greater than 1 AU.",
        "colour": "#F54242",  # RED
    },
    "ATE": {
        "name": "Aten-class Asteroid",
        "description": "These asteroids have orbits that cross the Earth's orbit and have a semi-major axis less than 1 AU.",
        "colour": "#48F542",  # GREEN
    },
    "IEO": {
        "name": "Inner Earth Object (Atiras)",
        "description": "These asteroids have orbits that fall strictly within the Earth's orbit.",
        "colour": "#EFF542",  # YELLOW
    },
}

# Fail with a clear message if the data contains a class without metadata,
# instead of mislabelling rows or hitting an opaque length-mismatch error
unknown_classes = [abbr for abbr in orbit_class_types if abbr not in orbit_class_info]
if unknown_classes:
    raise ValueError(f"No metadata defined for orbit class type(s): {unknown_classes}")

ids = [str(uuid4()) for _ in orbit_class_types]  # one fresh random id per class
names = [orbit_class_info[abbr]["name"] for abbr in orbit_class_types]
descriptions = [orbit_class_info[abbr]["description"] for abbr in orbit_class_types]
colours = [orbit_class_info[abbr]["colour"] for abbr in orbit_class_types]
data = {
    "_id": ids, # We use _id instead of id as mongodb databases use this key as the unique identifier for a document
    "abbreviation": orbit_class_types,
    "name": names,
    "description": descriptions,
    "colour": colours
}
orbit_class_type_df = pd.DataFrame(data=data)
orbit_class_type_df

Unnamed: 0,_id,abbreviation,name,description,colour
0,c0855c39-2ab1-4615-a430-7f8160a196c7,AMO,Amor-class Asteroid,These asteroids have orbits that fall strictly...,#4287F5
1,d9773d30-86d0-41b8-9a6f-40913b182d34,APO,Appollo-class Asteroid,These asteroids have orbits that cross the Ear...,#F54242
2,dd85c49c-49f8-4366-9f0d-4939400bdd8c,ATE,Aten-class Asteroid,These asteroids have orbits that cross the Ear...,#F54242
3,60f0d32c-cffc-4909-9057-6db3b7e241b0,IEO,Inner Earth Object (Atiras),These asteroids have orbits that fall strictly...,#EFF542


In [11]:
# Swap each asteroid's orbit class abbreviation for the id of the matching
# orbit class record
orbit_class_lookup = orbit_class_type_df[["_id", "abbreviation"]]
df = (
    df.merge(orbit_class_lookup, how="inner", left_on="orbit_class_type", right_on="abbreviation")
    .drop(columns=["orbit_class_type", "abbreviation"])
    .rename(columns={
        "_id": "orbit_class_type",
        "id": "_id",  # We use _id instead of id as mongodb databases use this key as the unique identifier for a document
    })
)
df.head()

Unnamed: 0,_id,name,absolute_magnitude_h,estimated_diameter_min,estimated_diameter_max,is_potentially_hazardous,orbit_id,orbit_determination_date,first_observation_date,last_observation_date,semi_major_axis,inclination,orbit_class_type
0,2000433,433 Eros (A898 PA),10.41,22.0067027115,49.2084832235,False,659,2021-05-24 17:55:05,1893-10-29 00:00:00,2021-05-13 00:00:00,1.45815896084448,10.82830761253864,c0855c39-2ab1-4615-a430-7f8160a196c7
1,2000719,719 Albert (A911 TB),15.59,2.0256060086,4.529392731,False,270,2025-02-28 05:16:36,1911-10-04 00:00:00,2025-02-28 00:00:00,2.636355360832282,11.57399177654622,c0855c39-2ab1-4615-a430-7f8160a196c7
2,2000887,887 Alinda (A918 AA),13.8,4.6190746028,10.328564805,False,608,2025-03-01 05:16:49,1918-02-09 00:00:00,2025-03-01 00:00:00,2.473267631334386,9.399886845047668,c0855c39-2ab1-4615-a430-7f8160a196c7
3,2001036,1036 Ganymed (A924 UB),9.18,38.7752830381,86.70416872,False,1361,2025-03-01 05:16:50,1924-10-23 00:00:00,2025-02-28 00:00:00,2.665800727730635,26.68214802212515,c0855c39-2ab1-4615-a430-7f8160a196c7
4,2001221,1221 Amor (1932 EA1),17.37,0.8923905787,1.9954459964,False,143,2025-02-20 05:51:07,1932-03-12 00:00:00,2025-02-19 00:00:00,1.919760465125785,11.86862228590597,c0855c39-2ab1-4615-a430-7f8160a196c7


In [12]:
# Convert dataframes to json

def _write_json_lines(frame, path):
    """Write ``frame`` to ``path`` as one JSON object per row, separated by newlines."""
    # Serialise everything first so a failure here does not leave a truncated file
    text = "\n".join(row.to_json() for _, row in frame.iterrows())
    # The context manager closes the file even if the write raises
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)


# Create data directory.
# The directory name is defined once and reused for every output path:
# creating "./data" while writing to "./Data" fails on case-sensitive filesystems.
output_dir = "./Data"
os.makedirs(output_dir, exist_ok=True)


# Convert orbit class types dataframe to JSON
orbit_class_type_json_path = f"{output_dir}/orbit_class_types.json"
_write_json_lines(orbit_class_type_df, orbit_class_type_json_path)


# Convert asteroid dataframe to JSON
asteroid_json_path = f"{output_dir}/asteroids.json"
_write_json_lines(df, asteroid_json_path)

You should now have a complete dataset of the relevant known asteroids, exported as JSON files.