In [None]:
# Install required packages
!pip install pandas numpy requests beautifulsoup4 kaggle

# Import necessary libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
import json
import time
from datetime import datetime
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset
# The CSV location can be overridden with the SPACE_MISSIONS_CSV environment
# variable so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original local path is used unchanged.
missions_csv_path = os.environ.get(
    'SPACE_MISSIONS_CSV',
    '/Users/naelamacbookair/desktop backup/self projects/space_mission_analysis/data/Space_Corrected.csv',
)
missions_df = pd.read_csv(missions_csv_path)

# Display basic information about the dataset
print(f"Dataset shape: {missions_df.shape}")
missions_df.info()

# Display first few rows (kept as the last expression so the notebook renders it)
missions_df.head()

In [None]:
# Count nulls once per column, then report only the columns that have any.
missing_values = missions_df.isnull().sum()
print("Missing values per column:")
total_rows = len(missions_df)
for col, nulls in missing_values.items():
    if nulls <= 0:
        continue
    # Share of rows in which this column is null, as a percentage.
    pct = nulls / total_rows * 100
    print(f"{col}: {nulls} ({pct:.2f}%)")

In [None]:
# Using NASA's API to get additional information
# Note: This requires a NASA API key - get one at https://api.nasa.gov/
# The key is read from the NASA_API_KEY environment variable so a real key
# never has to be saved inside the notebook. The placeholder is only used
# when the variable is unset and must be replaced before calling the API.
NASA_API_KEY = os.environ.get("NASA_API_KEY", "YOUR_NASA_KEY_HERE")

def get_nasa_mission_info(mission_name):
    """Fetch the current Astronomy Picture of the Day record from NASA's API.

    This is an example call only: the request goes to the APOD endpoint and
    ``mission_name`` is not used to build it. NASA has other endpoints for
    mission data.

    Args:
        mission_name: Name of the mission of interest (currently unused).

    Returns:
        The decoded JSON response on HTTP 200, otherwise ``None``. Failures
        (non-200 status, network error, undecodable body) are reported with
        ``print`` rather than raised.
    """
    base_url = "https://api.nasa.gov/planetary/apod"
    # No "date" parameter is sent: APOD expects dates as YYYY-MM-DD and
    # rejects the literal string "today". Leaving it out lets the API fall
    # back to its default, which is the current day's entry.
    params = {"api_key": NASA_API_KEY}

    try:
        # A timeout keeps the notebook from hanging on a stalled connection.
        response = requests.get(base_url, params=params, timeout=10)
    except requests.RequestException as e:
        print(f"Exception occurred: {e}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as e:
        # Raised when the body is not valid JSON.
        print(f"Exception occurred: {e}")
        return None

# Example usage
# mission_info = get_nasa_mission_info("Apollo 11")
# print(mission_info)

In [None]:
def scrape_launch_sites():
    """Scrape information about major launch sites from Wikipedia.

    Downloads the "Spaceport" article and collects the first three cells of
    every non-header row of each ``wikitable`` on the page.

    Returns:
        A DataFrame with columns ``Name``, ``Location`` and ``Coordinates``
        (empty, but with those columns, when no suitable rows are found), or
        ``None`` if the page could not be retrieved.
    """
    url = "https://en.wikipedia.org/wiki/Spaceport"
    columns = ['Name', 'Location', 'Coordinates']

    try:
        # A timeout keeps the notebook from hanging on a stalled connection.
        response = requests.get(url, timeout=15)
    except requests.RequestException as e:
        # Network failures take the same "report and return None" path as a
        # bad status code, so callers only have one failure value to check.
        print(f"Failed to retrieve data: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    launch_sites = []

    for table in soup.find_all('table', {'class': 'wikitable'}):
        # Skip the first row of each table, which is treated as the header.
        for row in table.find_all('tr')[1:]:
            cells = row.find_all(['td', 'th'])
            if len(cells) < 3:
                # Not enough cells to fill all three fields.
                continue
            # NOTE(review): assumes the first three cells of every table are
            # name, location and coordinates - confirm against the page, as
            # the tables there may not all share that layout.
            launch_sites.append({
                'Name': cells[0].text.strip(),
                'Location': cells[1].text.strip(),
                'Coordinates': cells[2].text.strip(),
            })

    # Passing the columns explicitly keeps the schema stable even when no
    # rows were collected.
    return pd.DataFrame(launch_sites, columns=columns)

# Launch sites information
# launch_sites_df = scrape_launch_sites()
# launch_sites_df.head()

In [None]:
def get_space_track_data(username, password):
    """
    Get data from Space-Track.org API
    Requires registration at https://www.space-track.org

    Logs in with the given credentials, then downloads the ``satcat`` class
    as JSON using the same session.

    Args:
        username: Space-Track account identity.
        password: Space-Track account password.

    Returns:
        A DataFrame built from the JSON response, or ``None`` when the login
        or the query fails. Failures are reported with ``print`` rather than
        raised.
    """
    login_url = 'https://www.space-track.org/ajaxauth/login'
    query_url = 'https://www.space-track.org/basicspacedata/query/class/satcat/format/json'

    payload = {
        'identity': username,
        'password': password
    }

    try:
        with requests.Session() as session:
            # Login (the session keeps the auth cookie for the query below)
            resp = session.post(login_url, data=payload, timeout=30)

            if resp.status_code != 200:
                print(f"Login failed: {resp.status_code}")
                return None

            # NOTE(review): Space-Track appears to answer rejected credentials
            # with HTTP 200 and a JSON body of {"Login": "Failed"} - confirm
            # against the API documentation. The check is harmless otherwise.
            try:
                login_body = resp.json()
            except ValueError:
                login_body = None
            if isinstance(login_body, dict) and login_body.get('Login') == 'Failed':
                print("Login failed: credentials rejected")
                return None

            # Query data; the longer timeout allows for a large response body.
            resp = session.get(query_url, timeout=120)

            if resp.status_code != 200:
                print(f"Failed to retrieve data: {resp.status_code}")
                return None

            return pd.DataFrame(resp.json())
    except (requests.RequestException, ValueError) as e:
        # Network errors and undecodable JSON both end in the same
        # "report and return None" path as the status-code failures above.
        print(f"Failed to retrieve data: {e}")
        return None

# Example usage
# space_track_df = get_space_track_data("your_username", "your_password")
# if space_track_df is not None:
#     space_track_df.head()

In [None]:
# Example of how we might merge datasets - just a placeholder as we don't have all data yet
# combined_df = pd.merge(missions_df, launch_sites_df, left_on='Launch Site', right_on='Name', how='left')
# combined_df.head()

In [None]:
# Create a directory for processed data if it doesn't exist.
# os.makedirs is used instead of the shell's `mkdir -p` so the cell does not
# depend on a Unix-like shell; exist_ok=True makes re-running it a no-op.
os.makedirs('./data/processed', exist_ok=True)

# Save the main dataset. No cell shown in this notebook modifies the frame
# after loading, so it is written out as loaded.
missions_df.to_csv('./data/processed/missions_cleaned.csv', index=False)

# Example: Save additional datasets if they were collected
# launch_sites_df.to_csv('./data/processed/launch_sites.csv', index=False)
# if space_track_df is not None:
#     space_track_df.to_csv('./data/processed/satellite_data.csv', index=False)

In [None]:
# Recap of what was collected in this notebook.
summary_lines = [
    "Data Collection Summary:",
    f"- Main space missions dataset: {len(missions_df)} records",
]
# Optional sources - enable once the corresponding collection cells are run:
# summary_lines.append(f"- Launch sites information: {len(launch_sites_df)} sites")
# if space_track_df is not None:
#     summary_lines.append(f"- Space-Track satellite data: {len(space_track_df)} satellites")
# else:
#     summary_lines.append("- Space-Track satellite data: Not collected")
for summary_line in summary_lines:
    print(summary_line)

# Planned follow-up work, printed after a blank separator line.
print("\nNext steps:")
next_steps = (
    "1. Clean and preprocess the collected data",
    "2. Perform exploratory data analysis",
    "3. Develop predictive models",
)
for step in next_steps:
    print(step)