In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import seaborn as sns


In [2]:
# Read the raw exoplanet table from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='expoplanet.csv')

In [3]:
# Quick structural overview of the freshly loaded frame.
print(df.shape)           # (rows, columns)
print(df.isnull().sum())  # number of missing values per column
print(df.columns)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()


(5250, 13)
name                   0
distance              17
stellar_magnitude    161
planet_type            0
discovery_year         0
mass_multiplier       23
mass_wrt              23
radius_multiplier     17
radius_wrt            17
orbital_radius       289
orbital_period         0
eccentricity           0
detection_method       0
dtype: int64
Index(['name', 'distance', 'stellar_magnitude', 'planet_type',
       'discovery_year', 'mass_multiplier', 'mass_wrt', 'radius_multiplier',
       'radius_wrt', 'orbital_radius', 'orbital_period', 'eccentricity',
       'detection_method'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5250 entries, 0 to 5249
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   name               5250 non-null   object 
 1   distance           5233 non-null   float64
 2   stellar_magnitude  5089 non-null   float64
 3   planet_type        5250 non-null   o

In [4]:
# Remove the two '*_wrt' columns. errors='ignore' skips any that are already
# absent, so this cell can be re-run without raising.
# NOTE(review): presumably these hold the reference body for the matching
# '*_multiplier' columns -- confirm they really are constant before discarding.
df = df.drop(['mass_wrt', 'radius_wrt'], axis=1, errors='ignore')

In [5]:
# List only the columns that still contain at least one missing value.
print("Missing values before filling:")
print(df.isnull().sum().loc[lambda counts: counts > 0])


Missing values before filling:
distance              17
stellar_magnitude    161
mass_multiplier       23
radius_multiplier     17
orbital_radius       289
dtype: int64


In [6]:
# Impute every float64/int64 column with that column's own median.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].pipe(
    lambda numeric_part: numeric_part.fillna(numeric_part.median())
)

In [7]:
# Preparation for categorical encoding: collect the names of all object-dtype
# columns and create an empty registry that will hold one encoder per column.
categorical_cols = df.select_dtypes(include='object').columns
label_encoders = dict()

In [8]:
# Integer-encode every categorical column, keeping one fitted LabelEncoder per
# column in `label_encoders` so the codes can be mapped back later.
for col in categorical_cols:
    # Guard against re-running this cell on its own: a column that is already
    # numeric has been encoded, and re-fitting on it would replace the saved
    # encoder with one fitted on the integer codes, losing the original classes.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    le = LabelEncoder()
    df[col] = df[col].astype(str)  # Ensure it's string
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoders for inverse transform if needed

In [9]:

# Sanity check: total number of NaN cells left anywhere in the frame.
missing_count = df.isna().sum(axis=0).sum()
print(f"Missing values after cleaning: {missing_count}")

Missing values after cleaning: 0


In [10]:
# Show the first five cleaned rows, then persist the frame (without the index
# column) next to the notebook.
print(df.head(5))
df.to_csv(path_or_buf='Cleaned_Exoplanet_Data.csv', index=False)

   name  distance  stellar_magnitude  planet_type  discovery_year  \
0     0     304.0            4.72307            0            2007   
1     1     409.0            5.01300            0            2009   
2     2     246.0            5.23133            0            2008   
3     3      58.0            6.61935            0            2002   
4     4      69.0            6.21500            0            1996   

   mass_multiplier  radius_multiplier  orbital_radius  orbital_period  \
0         19.40000               1.08        1.290000        0.892539   
1         14.74000               1.09        1.530000        1.400000   
2          4.80000               1.15        0.830000        0.508693   
3          8.13881               1.12        2.773069        4.800000   
4          1.78000               1.20        1.660000        2.200000   

   eccentricity  detection_method  
0          0.23                 8  
1          0.08                 8  
2          0.00                 8  
3 