# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

import os
import warnings
warnings.filterwarnings('ignore')

# Load Data

In [2]:
file_path = "../data/raw/Space_Corrected.csv"

# Load the raw CSV and drop the two unnamed columns present in the file.
df = pd.read_csv(file_path).drop(columns=["Unnamed: 0", "Unnamed: 0.1"])

# Random sample of rows for a first look at the values.
display(df.sample(5))
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
df.info()
display(df.describe())

Unnamed: 0,Company Name,Location,Datum,Detail,Status Rocket,Rocket,Status Mission
3689,RVSN USSR,"Site 41/1, Plesetsk Cosmodrome, Russia","Thu Mar 14, 1968 09:34 UTC",Vostok-2M | Cosmos 206,StatusRetired,,Success
1030,Lockheed,"SLC-40, Cape Canaveral AFS, Florida, USA","Tue Sep 09, 2003 04:29 UTC",Titan IV(401)B | NROL-19 (Mentor),StatusRetired,,Success
1473,Lockheed,"SLC-36A, Cape Canaveral AFS, Florida, USA","Fri Dec 15, 1995 00:23 UTC",Atlas IIA | Galaxy 3R,StatusRetired,,Success
2787,RVSN USSR,"Site 133/3, Plesetsk Cosmodrome, Russia","Thu Aug 26, 1976 11:00 UTC",Cosmos-2I (63SM) | Cosmos 850,StatusRetired,,Success
3530,RVSN USSR,"Site 31/6, Baikonur Cosmodrome, Kazakhstan","Thu Sep 18, 1969 08:40 UTC",Voskhod | Cosmos 299,StatusRetired,,Success


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4324 entries, 0 to 4323
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Company Name    4324 non-null   object
 1   Location        4324 non-null   object
 2   Datum           4324 non-null   object
 3   Detail          4324 non-null   object
 4   Status Rocket   4324 non-null   object
 5    Rocket         964 non-null    object
 6   Status Mission  4324 non-null   object
dtypes: object(7)
memory usage: 236.6+ KB
None


Unnamed: 0,Company Name,Location,Datum,Detail,Status Rocket,Rocket,Status Mission
count,4324,4324,4324,4324,4324,964.0,4324
unique,56,137,4319,4278,2,56.0,4
top,RVSN USSR,"Site 31/6, Baikonur Cosmodrome, Kazakhstan","Wed Nov 05, 2008 00:15 UTC",Cosmos-3MRB (65MRB) | BOR-5 Shuttle,StatusRetired,450.0,Success
freq,1777,235,2,6,3534,136.0,3879


# Inspect Data

## Different time formats in Datum column

In [3]:
# Print rows 106 and 1597 one after the other, separated by a 100-character rule.
print(df.iloc[106], "-" * 100, df.iloc[1597], sep="\n")

Company Name                                                    ISA
Location          Imam Khomeini Spaceport, Semnan Space Center, ...
Datum                                              Thu Aug 29, 2019
Detail                                          Safir-1B+ | Nahid-1
Status Rocket                                          StatusActive
 Rocket                                                         NaN
Status Mission                                    Prelaunch Failure
Name: 106, dtype: object
----------------------------------------------------------------------------------------------------
Company Name                                         Boeing
Location          SLC-17A, Cape Canaveral AFS, Florida, USA
Datum                            Wed Dec 08, 1993 00:48 UTC
Detail                              Delta II 7925 | NATO 4B
Status Rocket                                 StatusRetired
 Rocket                                                 NaN
Status Mission                        

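The Datum column contains two different formats: a full timestamp with time and time zone (e.g. `Wed Dec 08, 1993 00:48 UTC`) and a date only (e.g. `Thu Aug 29, 2019`). To convert the column to a datetime type, we detect which format each value uses and parse it accordingly.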

In [4]:
def parse_mixed_date(x):
    """Parse a launch date that is written in one of two known formats.

    Parameters
    ----------
    x : object
        Raw date value, e.g. ``"Wed Dec 08, 1993 00:48 UTC"`` (with time and
        time zone) or ``"Thu Aug 29, 2019"`` (date only). Non-missing values
        are converted with ``str`` before parsing.

    Returns
    -------
    pandas.Timestamp or NaT
        A timezone-naive timestamp. Date-only inputs get a time of 00:00:00.
        ``NaT`` is returned for missing input and for text that matches
        neither format.
    """
    if pd.isna(x):
        return pd.NaT
    x = str(x)
    try:
        # Format with time and time zone; the tz is dropped so both formats
        # produce timezone-naive timestamps.
        return pd.to_datetime(x, format="%a %b %d, %Y %H:%M %Z", errors="raise").tz_localize(None)
    except (ValueError, TypeError):
        # Only parsing failures fall through to the second format; a bare
        # ``except`` would also swallow KeyboardInterrupt and SystemExit.
        # Format without time -> 00:00:00; unparseable text is coerced to NaT.
        return pd.to_datetime(x, format="%a %b %d, %Y", errors="coerce")

# Parse every Datum value with parse_mixed_date and store the result back in the column.
parsed_datum = df["Datum"].apply(parse_mixed_date)
df["Datum"] = parsed_datum


In [5]:
# Print rows 106 and 1597 again, separated by a 100-character rule.
print("\n".join([str(df.iloc[106]), "-" * 100, str(df.iloc[1597])]))

Company Name                                                    ISA
Location          Imam Khomeini Spaceport, Semnan Space Center, ...
Datum                                           2019-08-29 00:00:00
Detail                                          Safir-1B+ | Nahid-1
Status Rocket                                          StatusActive
 Rocket                                                         NaN
Status Mission                                    Prelaunch Failure
Name: 106, dtype: object
----------------------------------------------------------------------------------------------------
Company Name                                         Boeing
Location          SLC-17A, Cape Canaveral AFS, Florida, USA
Datum                                   1993-12-08 00:48:00
Detail                              Delta II 7925 | NATO 4B
Status Rocket                                 StatusRetired
 Rocket                                                 NaN
Status Mission                        

## 'Status Mission' class distribution

In [6]:
# Number of launches for each distinct 'Status Mission' value.
outcome_counts = df['Status Mission'].value_counts()
outcome_counts

Status Mission
Success              3879
Failure               339
Partial Failure       102
Prelaunch Failure       4
Name: count, dtype: int64

The classes are heavily imbalanced (3,879 successes versus 445 failures of all kinds), so techniques for imbalanced data will be needed in this project.

## Handling missing values

In [7]:
# Clean Rocket column.
# The column is loaded as text, so strip surrounding whitespace and thousands
# separators before converting: with errors='coerce' a value such as "1,160.0"
# (if present in the raw data) would otherwise silently become NaN.
# astype(str) keeps this cell re-runnable once the column is already numeric
# (missing values become the text "nan", which converts back to NaN).
rocket_text = df[' Rocket'].astype(str).str.strip().str.replace(',', '', regex=False)
df[' Rocket'] = pd.to_numeric(rocket_text, errors='coerce')

# Drop missing values for better visualization
df_plot = df.dropna(subset=[' Rocket'])

# Plot histogram
fig = px.histogram(
    df_plot, 
    x=' Rocket', 
    nbins=50, 
    title='Distribution of Rocket Costs (Skewed)',
    # The key must match the column name exactly, including its leading space,
    # otherwise the axis label is not applied.
    labels={' Rocket': 'Rocket Cost ($ million)'},
    opacity=0.8
)

# Add mean and median lines
mean_val = df_plot[' Rocket'].mean()
median_val = df_plot[' Rocket'].median()

fig.add_vline(x=mean_val, line_dash="dash", line_color="red", annotation_text="Mean", annotation_position="top left")
fig.add_vline(x=median_val, line_dash="dash", line_color="green", annotation_text="Median", annotation_position="top right")

fig.show()


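The histogram shows a right-skewed distribution: most rocket costs are low, with a few very expensive launches. Because the median is robust to such outliers, median imputation is a simple and justifiable way to handle the missing values. Note that only 964 of the 4,324 rows have a recorded cost, so most values in this column will be imputed.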

In [8]:
# Fill missing rocket costs with the column median.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# df[' Rocket']: that chained in-place form operates on an intermediate object
# and leaves df unchanged under pandas Copy-on-Write.
rocket_median = df[' Rocket'].median()
df[' Rocket'] = df[' Rocket'].fillna(rocket_median)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4324 entries, 0 to 4323
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Company Name    4324 non-null   object        
 1   Location        4324 non-null   object        
 2   Datum           4324 non-null   datetime64[ns]
 3   Detail          4324 non-null   object        
 4   Status Rocket   4324 non-null   object        
 5    Rocket         4324 non-null   float64       
 6   Status Mission  4324 non-null   object        
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 236.6+ KB


# Exploratory Data Analysis

## Mission outcomes distribution

In [9]:
# Histogram of 'Status Mission', coloured by the same column, with counts shown on the bars.
outcome_hist_kwargs = dict(
    x='Status Mission',
    color='Status Mission',
    text_auto=True,
    title='Mission Outcomes Distribution',
)
fig = px.histogram(df, **outcome_hist_kwargs)
fig.show()


## Number of Missions per Decade

In [10]:
# Derive the launch year and its decade (year floored to a multiple of 10) from Datum.
df['Datum'] = pd.to_datetime(df['Datum'], errors='coerce')
launch_year = df['Datum'].dt.year
df['Year'] = launch_year
df['Decade'] = launch_year.floordiv(10).mul(10)

# Histogram of launches per decade, coloured by mission outcome.
decade_hist_kwargs = dict(
    x='Decade',
    color='Status Mission',
    labels={'Decade': 'Launch Decade'},
    title='Number of Missions per Decade',
)
fig = px.histogram(df, **decade_hist_kwargs)
fig.show()


## Rocket Cost by Mission Outcome

In [11]:
# Box plot of the ' Rocket' column per mission outcome, drawn on a logarithmic y-axis.
cost_box_kwargs = dict(
    x='Status Mission',
    y=' Rocket',
    log_y=True,
    title='Rocket Cost by Mission Outcome',
)
fig = px.box(df, **cost_box_kwargs)
fig.show()


Due to the numerous outliers in each category, we cannot clearly determine if there are significant cost differences between successful and failed missions.

## Top launch locations

In [12]:
# Count launches per location, name the two columns, and sort ascending by count.
location_counts = (
    df['Location']
    .value_counts()
    .reset_index()
    .set_axis(['Location', 'Count'], axis=1)
    .sort_values(by='Count', ascending=True)
)

# Horizontal bar chart with the count printed on each bar.
fig = px.bar(
    location_counts,
    y='Location',
    x='Count',
    text='Count',
    title='Top Launch Locations',
)
fig.show()

# Export cleaned data

In [14]:
# Write the current DataFrame to the processed-data folder, creating the folder if needed.
folder_path = "../data/processed/"
output_file = os.path.join(folder_path, 'dataset_space_mission.csv')

os.makedirs(folder_path, exist_ok=True)
df.to_csv(output_file, index=False)
