# IBM Applied Data Science Capstone
## Part 1: Data Collection

**Objective:** Collect and load automotive sales data for analysis

**Author:** Son Nguyen

---


## 1. Import Required Libraries

We'll use pandas for data manipulation, requests to fetch data from SpaceX API, and other libraries for data processing.


In [1]:
import pandas as pd
import numpy as np
import requests
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")
print("=" * 60)


Libraries imported successfully!


## 2. Fetch SpaceX Launch Data from API

We'll collect real SpaceX launch data from the public SpaceX API. This includes:
- Launch history with success/failure outcomes
- Rocket specifications (Falcon 1, Falcon 9, Falcon Heavy)
- Launchpad locations (Cape Canaveral, Vandenberg)
- Payload information
- First stage landing outcomes


In [2]:
# Fetch SpaceX launches data
print("Fetching SpaceX launch data from API...")
print("-" * 60)

try:
    url = "https://api.spacexdata.com/v4/launches"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    launches = response.json()
    print(f"✓ Successfully fetched {len(launches)} launch records from SpaceX API")
except Exception as e:
    print(f"✗ API fetch failed: {e}")
    print("Will use fallback dataset...")
    launches = []

# Fetch rocket data for additional information
try:
    rocket_url = "https://api.spacexdata.com/v4/rockets"
    rockets_response = requests.get(rocket_url, timeout=30)
    rockets_data = rockets_response.json()
    rocket_lookup = {rocket['id']: rocket for rocket in rockets_data}
    print(f"✓ Fetched {len(rocket_lookup)} rocket specifications")
except Exception as e:
    print(f"✗ Rocket data fetch failed: {e}")
    rocket_lookup = {}

# Fetch launchpad data
try:
    launchpad_url = "https://api.spacexdata.com/v4/launchpads"
    launchpads_response = requests.get(launchpad_url, timeout=30)
    launchpads_data = launchpads_response.json()
    launchpad_lookup = {lp['id']: lp for lp in launchpads_data}
    print(f"✓ Fetched {len(launchpad_lookup)} launchpad locations")
except Exception as e:
    print(f"✗ Launchpad data fetch failed: {e}")
    launchpad_lookup = {}


Fetching SpaceX launch data from API...
------------------------------------------------------------


✓ Successfully fetched 205 launch records from SpaceX API


✓ Fetched 4 rocket specifications


✓ Fetched 6 launchpad locations


## 3. Process and Structure Data

Now we'll process the API data to create a structured dataset suitable for analysis.


In [3]:
# Process SpaceX launch data into structured format
print("\nProcessing SpaceX launch data...")
print("-" * 60)

data = []

for launch in launches:
    # Skip upcoming launches
    if launch.get('upcoming', False):
        continue
    
    # Basic info
    flight_number = launch.get('flight_number', 0)
    name = launch.get('name', '')
    date_utc = launch.get('date_utc', '')
    success = launch.get('success', None)
    
    # Date parsing
    try:
        date_obj = pd.to_datetime(date_utc)
        year = date_obj.year
        month = date_obj.month
        quarter = (month - 1) // 3 + 1
    except:
        year = None
        month = None
        quarter = None
    
    # Rocket info
    rocket_id = launch.get('rockets', [None])[0] if launch.get('rockets') else None
    rocket_name = 'Unknown'
    rocket_type = 'Unknown'
    cost_per_launch = 0
    
    if rocket_id and rocket_id in rocket_lookup:
        rocket = rocket_lookup[rocket_id]
        rocket_name = rocket.get('name', 'Unknown')
        rocket_type = rocket.get('type', 'Unknown')
        cost_per_launch = rocket.get('cost_per_launch', 0) if rocket.get('cost_per_launch') else 0
    
    # Launchpad info
    launchpad_id = launch.get('launchpad', None)
    launchpad_name = 'Unknown'
    location_name = 'Unknown'
    latitude = None
    longitude = None
    region = 'Unknown'
    
    if launchpad_id and launchpad_id in launchpad_lookup:
        lp = launchpad_lookup[launchpad_id]
        launchpad_name = lp.get('name', 'Unknown')
        location_name = lp.get('locality', 'Unknown')
        latitude = lp.get('latitude', None)
        longitude = lp.get('longitude', None)
        region = lp.get('region', 'Unknown')
    
    # Core landing info (First Stage Landing)
    cores = launch.get('cores', [])
    core_landing = 'Unknown'
    core_reused = False
    
    if cores:
        core = cores[0]
        landing_attempt = core.get('landing_attempt', False)
        landing_success = core.get('landing_success', None)
        core_reused = core.get('reused', False)
        
        if landing_attempt:
            if landing_success:
                core_landing = 'Success'
            else:
                core_landing = 'Failed'
        else:
            core_landing = 'No Attempt'
    
    # Success rate
    success_rate = 1.0 if success is True else (0.0 if success is False else None)
    
    # Payload info
    payloads = launch.get('payloads', [])
    payload_count = len(payloads)
    total_payload_mass = 0
    payload_type = 'Unknown'
    
    # Try to get payload details (limited to avoid too many API calls)
    if payloads and len(payloads) > 0:
        try:
            payload_id = payloads[0]
            payload_url = f"https://api.spacexdata.com/v4/payloads/{payload_id}"
            payload_response = requests.get(payload_url, timeout=10)
            if payload_response.status_code == 200:
                payload_data = payload_response.json()
                mass_kg = payload_data.get('mass_kg', 0)
                if mass_kg:
                    total_payload_mass = mass_kg
                payload_type = payload_data.get('type', 'Unknown')
        except:
            pass
    
    data.append({
        'Flight_Number': flight_number,
        'Launch_Name': name,
        'Date_UTC': date_utc,
        'Year': year,
        'Month': month,
        'Quarter': quarter,
        'Success': success,
        'Success_Rate': success_rate,
        'Rocket_Name': rocket_name,
        'Rocket_Type': rocket_type,
        'Cost_Per_Launch': cost_per_launch,
        'Launchpad_Name': launchpad_name,
        'Location': location_name,
        'Region': region,
        'Latitude': latitude,
        'Longitude': longitude,
        'Payload_Count': payload_count,
        'Payload_Mass_kg': total_payload_mass,
        'Payload_Type': payload_type,
        'Core_Landing': core_landing,  # First Stage Landing Outcome
        'Core_Reused': core_reused,
    })

df = pd.DataFrame(data)

# Fill missing values
df['Year'] = df['Year'].fillna(df['Year'].median())
df['Month'] = df['Month'].fillna(df['Month'].median())
df['Success_Rate'] = df['Success_Rate'].fillna(df['Success_Rate'].mean())

# Add derived features
df['Year_Category'] = pd.cut(
    df['Year'],
    bins=[2000, 2010, 2015, 2020, 2025],
    labels=['Early', 'Mid', 'Recent', 'Current']
)

print(f"✓ Processed {len(df)} launch records")
print(f"✓ Dataset shape: {df.shape}")
print(f"\nFirst few rows:")
print(df.head())



Processing SpaceX launch data...
------------------------------------------------------------


✓ Processed 187 launch records
✓ Dataset shape: (187, 22)

First few rows:
   Flight_Number  Launch_Name                  Date_UTC  Year  Month  Quarter  \
0              1    FalconSat  2006-03-24T22:30:00.000Z  2006      3        1   
1              2      DemoSat  2007-03-21T01:10:00.000Z  2007      3        1   
2              3  Trailblazer  2008-08-03T03:34:00.000Z  2008      8        3   
3              4       RatSat  2008-09-28T23:15:00.000Z  2008      9        3   
4              5     RazakSat  2009-07-13T03:35:00.000Z  2009      7        3   

  Success  Success_Rate Rocket_Name Rocket_Type  ...       Location  \
0   False           0.0     Unknown     Unknown  ...  Omelek Island   
1   False           0.0     Unknown     Unknown  ...  Omelek Island   
2   False           0.0     Unknown     Unknown  ...  Omelek Island   
3    True           1.0     Unknown     Unknown  ...  Omelek Island   
4    True           1.0     Unknown     Unknown  ...  Omelek Island   

           

In [4]:
# Data quality check
print("\n" + "=" * 60)
print("DATA QUALITY CHECK")
print("=" * 60)

print(f"\nDataset Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nData Types:")
print(df.dtypes)
print(f"\nMissing Values:")
missing = df.isnull().sum()
print(missing[missing > 0] if missing.sum() > 0 else "✓ No missing values!")
print(f"\nBasic Statistics:")
print(df.describe())
print(f"\nUnique Values:")
print(f"Rocket Names: {df['Rocket_Name'].unique()}")
print(f"Regions: {df['Region'].unique()}")
print(f"Core Landing Outcomes: {df['Core_Landing'].unique()}")



DATA QUALITY CHECK

Dataset Shape: (187, 22)
Columns: ['Flight_Number', 'Launch_Name', 'Date_UTC', 'Year', 'Month', 'Quarter', 'Success', 'Success_Rate', 'Rocket_Name', 'Rocket_Type', 'Cost_Per_Launch', 'Launchpad_Name', 'Location', 'Region', 'Latitude', 'Longitude', 'Payload_Count', 'Payload_Mass_kg', 'Payload_Type', 'Core_Landing', 'Core_Reused', 'Year_Category']

Data Types:
Flight_Number         int64
Launch_Name          object
Date_UTC             object
Year                  int64
Month                 int64
Quarter               int64
Success              object
Success_Rate        float64
Rocket_Name          object
Rocket_Type          object
Cost_Per_Launch       int64
Launchpad_Name       object
Location             object
Region               object
Latitude            float64
Longitude           float64
Payload_Count         int64
Payload_Mass_kg     float64
Payload_Type         object
Core_Landing         object
Core_Reused            bool
Year_Category      category
dt

In [5]:
# Save dataset
output_path = '../data/spacex_launches.csv'
df.to_csv(output_path, index=False)

print(f"\n✓ Dataset saved successfully!")
print(f"✓ Location: {output_path}")
print(f"✓ Total records: {len(df)}")
print(f"✓ Time period: {df['Year'].min()} - {df['Year'].max()}")
print(f"\nDataset is ready for exploratory data analysis!")



✓ Dataset saved successfully!
✓ Location: ../data/spacex_launches.csv
✓ Total records: 187
✓ Time period: 2006 - 2022

Dataset is ready for exploratory data analysis!
