In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Step 01: Loading the UCS Satellite Dataset

In [10]:
# Relative path to the tab-delimited UCS satellite text file.
file_path = "../data/text-data/ucs_satellites.txt"

# Parse the file as tab-separated text, decoding bytes as latin1.
# NOTE(review): latin1 is presumably needed for non-ASCII names in the file — confirm.
df = pd.read_csv(file_path, sep="\t", encoding="latin1")

# Quick structural overview of the raw frame: size, header, sample rows, dtypes.
for summary_label, summary_value in (
    ("Shape of the dataset: ", df.shape),
    ("\nColumn names: ", df.columns),
    ("\nFirst 5 rows: ", df.head()),
    ("\nData types: ", df.dtypes),
):
    print(summary_label, summary_value)


Shape of the dataset:  (7562, 67)

Column names:  Index(['Current Official Name of Satellite', 'Country/Org of UN Registry',
       'Country of Operator/Owner', 'Operator/Owner', 'Users', 'Purpose',
       'Detailed Purpose', 'Class of Orbit', 'Type of Orbit',
       'Longitude of GEO (degrees)', 'Perigee (km)', 'Apogee (km)',
       'Eccentricity', 'Inclination (degrees)', 'Period (minutes)',
       'Launch Mass (kg.)', ' Dry Mass (kg.) ', 'Power (watts)',
       'Date of Launch', 'Expected Lifetime (yrs.)', 'Contractor',
       'Country of Contractor', 'Launch Site', 'Launch Vehicle',
       'COSPAR Number', 'NORAD Number', 'Comments', 'Unnamed: 27',
       'Source Used for Orbital Data', 'Source', 'Source.1', 'Source.2',
       'Source.3', 'Source.4', 'Source.5', 'Source.6', 'Unnamed: 36',
       'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40',
       'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44',
       'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 

# Step 02: Cleaning Dataset
- Removing empty or irrelevant columns (like "Unnamed: XX").
- Standardizing column names (strip surrounding whitespace, lowercase, replace spaces with underscores, then drop any remaining punctuation such as `/`, `.`, and parentheses).

In [11]:
# Keep only the columns whose header does not begin with "Unnamed".
unnamed_mask = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~unnamed_mask]

# Normalise the remaining headers step by step:
#   1. trim surrounding whitespace and lowercase,
#   2. turn spaces into underscores,
#   3. delete every character that is not a word character (letters, digits, underscore).
tidy_names = df.columns.str.strip().str.lower()
tidy_names = tidy_names.str.replace(" ", "_")
tidy_names = tidy_names.str.replace(r'[^\w]', '', regex=True)
df.columns = tidy_names

# Report the new shape and preview the first 20 cleaned column names.
print("Shape after cleaning: ", df.shape)
print("\nColumn names after cleaning: \n", df.columns.tolist()[:20])


Shape after cleaning:  (7562, 35)

Column names after cleaning: 
 ['current_official_name_of_satellite', 'countryorg_of_un_registry', 'country_of_operatorowner', 'operatorowner', 'users', 'purpose', 'detailed_purpose', 'class_of_orbit', 'type_of_orbit', 'longitude_of_geo_degrees', 'perigee_km', 'apogee_km', 'eccentricity', 'inclination_degrees', 'period_minutes', 'launch_mass_kg', 'dry_mass_kg', 'power_watts', 'date_of_launch', 'expected_lifetime_yrs']


# Step 03: Handling Missing Values and Selecting Core Columns
- Identifying and handling missing values (e.g., filling with default values, removing rows/columns).
- Selecting only the core columns that are essential for analysis (e.g., "satellite_name", "launch_date", "country", "payload_mass_kg", "orbit_type").