In [None]:
import pandas as pd

In [None]:
# Location of the CSV export analysed in this notebook.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook anywhere else.
csv_path = r"F:\Debye\Planetary-system-analysis\Data\PS_2025.10.22_09.50.39.csv"

# comment="#" tells the parser not to read anything following a '#' character.
data = pd.read_csv(csv_path, comment="#")

# Preview the first rows of the loaded table.
data.head()

In [None]:
# Summary of dataset (info() writes its report straight to the output)
data.info()

# Check missing values.
# Only the final expression of a cell is rendered automatically, so this
# intermediate result must be printed explicitly or it is silently discarded.
print(data.isnull().sum())

# Quick statistics (last expression of the cell, so the notebook displays it)
data.describe()


In [None]:
# Columns to carry forward into the rest of the analysis.
columns = [
    'pl_name',
    'hostname',
    'pl_rade',
    'pl_bmasse',
    'pl_orbper',
    'st_teff',
    'st_rad',
    'discoverymethod',
    'sy_dist',
]

# Keep only the requested columns that the loaded table actually contains,
# in the order listed above; any that are absent are skipped without error.
present = [name for name in columns if name in data.columns]
data = data[present]

# Preview the narrowed table.
data.head()

In [None]:
# Drop a row only when radius AND mass are both missing (how='all');
# rows that have at least one of the two values are kept.
data = data.dropna(how='all', subset=['pl_rade', 'pl_bmasse'])

# Replace missing st_teff values with the column mean, when the column exists.
if 'st_teff' in data.columns:
    mean_teff = data['st_teff'].mean()
    data = data.assign(st_teff=data['st_teff'].fillna(mean_teff))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of planet radius against planet mass.
# seaborn returns the Axes it drew on, so labels are set on that object directly.
scatter_ax = sns.scatterplot(data=data, x='pl_rade', y='pl_bmasse')
scatter_ax.set_title('Planet Radius vs Planet Mass')
scatter_ax.set_xlabel('Radius (Earth Radii)')
scatter_ax.set_ylabel('Mass (Earth Masses)')
plt.show()

# Horizontal bar chart: number of rows per discovery method.
count_ax = sns.countplot(data=data, y='discoverymethod')
count_ax.set_title('Number of Planets by Discovery Method')
plt.show()


In [None]:
# Number of rows attributed to each discovery method
method_counts = data['discoverymethod'].value_counts()
print(method_counts)

# Correlation matrix between radius and mass (corr() with its default settings)
radius_and_mass = data[['pl_rade', 'pl_bmasse']]
print(radius_and_mass.corr())


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Thresholds for the example "Earth-like" label: radius in Earth radii and
# mass in Earth masses (the units used on the plot axes of this notebook).
EARTH_LIKE_MAX_RADIUS = 1.5
EARTH_LIKE_MAX_MASS = 5

# Example label: 1 when both radius and mass are below their thresholds, else 0.
# Comparisons against NaN evaluate to False, so rows missing either value get 0.
data['earth_like'] = (
    (data['pl_rade'] < EARTH_LIKE_MAX_RADIUS)
    & (data['pl_bmasse'] < EARTH_LIKE_MAX_MASS)
).astype(int)

# Select features and labels; rows with any missing feature are dropped and the
# labels are aligned to the surviving rows through the index.
# NOTE(review): the label is computed directly from pl_rade and pl_bmasse, which
# are also features, so the tree can recover the labelling rule from its inputs.
# The accuracy below is therefore a sanity check, not evidence of generalisation.
X = data[['pl_rade', 'pl_bmasse', 'st_teff']].dropna()
y = data.loc[X.index, 'earth_like']

# Split dataset, keeping the class proportions equal in train and test.
# Stratification needs at least two samples per class; otherwise fall back to
# the plain random split so the cell still runs.
class_sizes = y.value_counts()
stratify_labels = y if class_sizes.min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Train decision tree (fixed random_state for reproducible results)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Accuracy on the held-out 20 %
print("Accuracy:", clf.score(X_test, y_test))
