# Necessary imports

In [1]:
from IPython.display import clear_output

In [3]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

# Scrapping tables from wiki

In [6]:
earth_analogs = pd.read_html('https://en.wikipedia.org/wiki/Earth_analog')
earth_analogs = earth_analogs[0]
print('Size:', earth_analogs.size)
earth_analogs = earth_analogs.rename(
    columns = {
      "Name": "Object",
      "Earth masses (ME)": "Mass (M⊕)",
      "Earth radii (R🜨)": "Radius (R⊕)"
      })
earth_analogs.head()

Size: 40


Unnamed: 0,Object,Mass (M⊕),Radius (R⊕),Note
0,Kepler-69c,≙2.14,1.7,Originally thought to be in the circumstellar ...
1,Kepler-9d,>1.5[27],1.64,Extremely hot.
2,CoRoT-7b,<9,1.58,Extremely hot.
3,Kepler-20f,< 14.3[25],1.03[25],"Slightly larger and likely more massive, far t..."
4,Tau Ceti g,>1.75,,Extremely hot. Not known to transit.


In [7]:
import pandas as pd
import numpy as np

potentially = pd.read_html('https://en.wikipedia.org/wiki/List_of_potentially_habitable_exoplanets')
potentially = potentially[1]
print('Size:', potentially.size)
potentially = potentially.rename(
    columns = {
      "Refs/Notes": "Note"
      })
potentially.head()

Size: 726


Unnamed: 0,Object,Star,Star type,Mass (M⊕),Radius (R⊕),Density (g/cm3),Flux (F⊕),Teq (K),Period (days),Distance (ly),Note
0,Earth (reported for reference),Sun,G2V,1.00,1.00,5.514,1.00,255.0,365.25,0,Only planet confirmed to support life.
1,Venus (reported for reference),Sun,G2V,0.815,0.950,5.243,1.911,244.261,224.7,0.0000042,[5]
2,Mars (reported for reference),Sun,G2V,0.107,0.533,3.934,0.431,209.8,686.98,0.0000058 - 0.000042,[6]
3,Gliese 12 b,Gliese 12,M4V,0.88+0.39 −0.26,1.03±0.11,4.44,1.6±0.2,315.0,12.7,40,[7]
4,Gliese 163 c,Gliese 163,M3V,≥6.80,—,—,1.25,277.0,25.6,49,[1]


# Scrapping table from NASA

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import requests

def load_data(nasa_url, local_path):
    """Load data taking in the TAP protocol URL for NASA and the local path
    """
    request_csv = requests.get(nasa_url)
    with open(local_path, 'w') as f:
        f.write(request_csv.text)
    data = pd.read_csv(local_path)
    return data


def parsec_to_ly(data):
    """Rename columns and convert distance from parsec to light years
    """
    data['Distance (ly)'] = data['sy_dist'] * 3.26
    data = data.drop(columns = 'sy_dist')
    return data

In [9]:
# Some variables
nasa_url = "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query=select+pl_name,sy_dist,pl_rade,pl_masse,disc_year,discoverymethod+from+ps&format=csv"
local_path = "/content/exoplanet_data.csv"
n_clusters = 4
n_init = 40
kmeans_columns = ['Radius (R⊕)', 'Mass (M⊕)']

In [10]:
exoplanet_data = load_data(nasa_url, local_path)

In [11]:
exoplanet_data = exoplanet_data.rename(
    columns = {
      "pl_name": "Object",
      "pl_rade": "Radius (R⊕)",
      "pl_masse": "Mass (M⊕)",
      "disc_year": "Discovery Year",
      "discoverymethod": "Discovery Method"
      })

exoplanet_data = parsec_to_ly(exoplanet_data)
exoplanet_data = exoplanet_data.drop_duplicates(subset=['Object'])

In [12]:
print('Size:', exoplanet_data.size)

exoplanet_data.head()

Size: 35094


Unnamed: 0,Object,Radius (R⊕),Mass (M⊕),Discovery Year,Discovery Method,Distance (ly)
0,Kepler-6 b,13.38,,2009,Transit,1913.74714
18,Kepler-491 b,10.0,,2016,Transit,2056.3591
29,Kepler-257 b,2.74,,2014,Transit,2543.63456
40,Kepler-216 b,2.12,,2014,Transit,3871.1522
52,Kepler-32 c,2.37,,2011,Transit,1055.74122


In [13]:
exoplanet_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5849 entries, 0 to 38129
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Object            5849 non-null   object 
 1   Radius (R⊕)       3497 non-null   float64
 2   Mass (M⊕)         1032 non-null   float64
 3   Discovery Year    5849 non-null   int64  
 4   Discovery Method  5849 non-null   object 
 5   Distance (ly)     5722 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 319.9+ KB


In [14]:
print(list(exoplanet_data.columns))
print(list(earth_analogs.columns))
print(list(potentially.columns))

['Object', 'Radius (R⊕)', 'Mass (M⊕)', 'Discovery Year', 'Discovery Method', 'Distance (ly)']
['Object', 'Mass (M⊕)', 'Radius (R⊕)', 'Note']
['Object', 'Star', 'Star type', 'Mass (M⊕)', 'Radius (R⊕)', 'Density (g/cm3)', 'Flux (F⊕)', 'Teq (K)', 'Period (days)', 'Distance (ly)', 'Note']


In [15]:
exoplanet_data.isnull().sum()

Unnamed: 0,0
Object,0
Radius (R⊕),2352
Mass (M⊕),4817
Discovery Year,0
Discovery Method,0
Distance (ly),127


In [16]:
# Deleting data with unknown mass
exoplanet_data = exoplanet_data.dropna().reset_index()
exoplanet_data.size

3857

In [17]:
frames = [potentially, earth_analogs, exoplanet_data]

all_planets = pd.concat(frames)
print(list(all_planets.columns))
print('Size:', all_planets.size)
all_planets = all_planets.drop_duplicates(subset=['Object'])
all_planets.head()
# columns = ['Object', 'Star', 'Star type', 'Mass (M⊕)',	'Radius (R⊕)',	'Density (g/cm3)',	'Flux (F⊕)',	'Temperature (K)',	'Period (days)',	'Distance (ly)', "Discovery Year",  "Discovery Method"]


['Object', 'Star', 'Star type', 'Mass (M⊕)', 'Radius (R⊕)', 'Density (g/cm3)', 'Flux (F⊕)', 'Teq (K)', 'Period (days)', 'Distance (ly)', 'Note', 'index', 'Discovery Year', 'Discovery Method']
Size: 8778


Unnamed: 0,Object,Star,Star type,Mass (M⊕),Radius (R⊕),Density (g/cm3),Flux (F⊕),Teq (K),Period (days),Distance (ly),Note,index,Discovery Year,Discovery Method
0,Earth (reported for reference),Sun,G2V,1.00,1.00,5.514,1.00,255.0,365.25,0,Only planet confirmed to support life.,,,
1,Venus (reported for reference),Sun,G2V,0.815,0.950,5.243,1.911,244.261,224.7,0.0000042,[5],,,
2,Mars (reported for reference),Sun,G2V,0.107,0.533,3.934,0.431,209.8,686.98,0.0000058 - 0.000042,[6],,,
3,Gliese 12 b,Gliese 12,M4V,0.88+0.39 −0.26,1.03±0.11,4.44,1.6±0.2,315.0,12.7,40,[7],,,
4,Gliese 163 c,Gliese 163,M3V,≥6.80,—,—,1.25,277.0,25.6,49,[1],,,


# Preprocess

In [63]:
columns = ['Object', 'Star', 'Star type', 'Mass (M⊕)', 'Radius (R⊕)', 'Density (g/cm3)', 'Flux (F⊕)', 'Teq (K)', 'Period (days)', 'Distance (ly)', 'Note', 'Discovery Year', 'Discovery Method']
format_of_columns = ['str', 'str', 'str', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'str', 'str', 'str']

bad_chars=['+', '-', '±', '—', '≥', '~', '[', ']']
all_planets.head(5)

Unnamed: 0,Object,Star,Star type,Mass (M⊕),Radius (R⊕),Density (g/cm3),Flux (F⊕),Teq (K),Period (days),Distance (ly),Note,index,Discovery Year,Discovery Method
0,Earth (reported for reference),Sun,G2V,1.00,1.00,5.514,1.00,255,365.25,0,Only planet confirmed to support life.,,,
1,Venus (reported for reference),Sun,G2V,0.815,0.950,5.243,1.911,244.261,224.7,0.0000042,[5],,,
2,Mars (reported for reference),Sun,G2V,0.107,0.533,3.934,0.431,209.8,686.98,0.0000058 - 0.000042,[6],,,
3,Gliese 12 b,Gliese 12,M4V,0.88+0.39 −0.26,1.03±0.11,4.44,1.6±0.2,315,12.7,40,[7],,,
4,Gliese 163 c,Gliese 163,M3V,≥6.80,—,—,1.25,277,25.6,49,[1],,,
5,Gliese 180 d,Gliese 180,M2V,≥7.56,—,—,0.26,,106.3,39,[1],,,
6,Gliese 357 d,Gliese 357,M2V,≥6.10,—,—,0.38,200,55.7,31,,,,
7,Gliese 433 d,Gliese 433,M2V,≥5.22,—,—,1.06,,36.1,29.6,[1],,,
8,Gliese 514 b,Gliese 514,M1V,≥5.20,—,—,0.28+0.51 −0.166,202,140.4,25,Highly eccentric[8],,,
9,Gliese 555 b,Gliese 555,M4V,≥5.46,—,—,0.5,214 [9],36.2,20.4,Probably a mini-Neptune or an ocean planet[10],,,


In [97]:
import re

def preprocess_float(a):
  a=str(a)
  if a.find('+')!=-1:
    a=a[:a.find('+')]
  if a.find('-')!=-1:
    a=a[:a.find('-')]
  if a.find('±')!=-1:
    a=a[:a.find('±')]
  if a.find('≥')!=-1:
    a=a[a.find('≥')+1:]
  if a.find('>')!=-1:
    a=a[a.find('>')+1:]
  if a.find('<')!=-1:
    a=a[a.find('<')+1:]
  if a.find('≙')!=-1:
    a=a[a.find('≙')+1:]
  if a.find('~')!=-1:
    a=a[a.find('~')+1:]
  if a.find('—')!=-1:
    a='nan'
  if a.find('[')!=-1:
    a=a[:a.find('[')]
  cleaned_value = re.sub(r"[^0-9.-]", "", a)
  if cleaned_value =='':
    cleaned_value = 'nan'
  return round(float(cleaned_value.strip()), 2)

def preprocess_int(a):
  a=str(a)
  if a == 'NaN' or a=='nan' or a=='None':
    a='0'
  cleaned_value = re.sub(r"[^0-9.-]", "", a)
  return int(float(cleaned_value))

In [81]:
columns_float=['Mass (M⊕)', 'Radius (R⊕)', 'Density (g/cm3)', 'Flux (F⊕)', 'Teq (K)', 'Period (days)', 'Distance (ly)']
column_int=['Discovery Year']

In [98]:
for i in columns_float:
  all_planets[i] = all_planets[i].apply(lambda x: preprocess_float(x))

for i in column_int:
  all_planets[i] = all_planets[i].apply(lambda x: preprocess_int(x))
all_planets.head()

Unnamed: 0,Object,Star,Star type,Mass (M⊕),Radius (R⊕),Density (g/cm3),Flux (F⊕),Teq (K),Period (days),Distance (ly),Note,index,Discovery Year,Discovery Method
0,Earth (reported for reference),Sun,G2V,1.0,1.0,5.51,1.0,255.0,365.25,0.0,Only planet confirmed to support life.,,0,
1,Venus (reported for reference),Sun,G2V,0.81,0.95,5.24,1.91,244.26,224.7,4.2,[5],,0,
2,Mars (reported for reference),Sun,G2V,0.11,0.53,3.93,0.43,209.8,686.98,5.8,[6],,0,
3,Gliese 12 b,Gliese 12,M4V,0.88,1.03,4.44,1.6,315.0,12.7,40.0,[7],,0,
4,Gliese 163 c,Gliese 163,M3V,6.8,,,1.25,277.0,25.6,49.0,[1],,0,


In [100]:
all_planets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 622 entries, 0 to 550
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Object            622 non-null    object 
 1   Star              66 non-null     object 
 2   Star type         66 non-null     object 
 3   Mass (M⊕)         603 non-null    float64
 4   Radius (R⊕)       608 non-null    float64
 5   Density (g/cm3)   9 non-null      float64
 6   Flux (F⊕)         63 non-null     float64
 7   Teq (K)           55 non-null     float64
 8   Period (days)     66 non-null     float64
 9   Distance (ly)     614 non-null    float64
 10  Note              65 non-null     object 
 11  index             548 non-null    float64
 12  Discovery Year    622 non-null    int64  
 13  Discovery Method  548 non-null    object 
dtypes: float64(8), int64(1), object(5)
memory usage: 89.1+ KB


In [101]:
all_planets.to_csv('DVW_project.csv', index=False)