In [2]:
import re

from pandas import *
from typing import Callable, Dict

df = read_csv("meteorites.csv")

def isCC(type: str):
  return bool(re.findall("^[C][I|M|R|O|V|K][0-9|~]+", type))

def isOC(type: str):
  isLOrLLGroup = bool(re.findall("^[L][L]?[0-9|~]*", type))
  isHGroup = bool(re.findall("^[H][0-9|~]*", type))
  return isLOrLLGroup or isHGroup

def isEC(type: str):
  isEHGroup = bool(re.findall("^[E][H]?[0-9]+", type))
  isELGroup = bool(re.findall("^[E][L]?[0-9]+", type))
  return isEHGroup or isELGroup

def isKC(type: str):
  return "K3" in type

def isPA(type: str):
  minerals = [
    "Acapulcoite",
    "Brachinite",
    "Winonaite"
  ]
  return bool([mineral for mineral in minerals if mineral in type])

def isAC(type: str):
  minerals = [
    "Ureilite",
    "Diogenite",
    "Eucrite",
    "Martian",
    "Aubrite",
    "Angrite"
  ]
  return bool([mineral for mineral in minerals if mineral in type])

def isSI(type: str):
  minerals = [
    "Mesosiderite",
    "Stone-uncl",
    "Pallasite"
  ]
  return bool([mineral for mineral in minerals if mineral in type])

def isIron(type: str):
  return "Iron, " in type 

In [3]:

predicates: Dict[str, Callable[[str], bool]] = {
  "Carbonaceous chondrite": isCC,
  "Enstatite chondrite": isEC,
  "Ordinary chondrite": isOC,
  "Kakangari chondrite": isKC,
  "Primitive achondrite": isPA,
  "Achondrite": isAC,
  "Stony-iron": isSI,
  "Iron": isIron,
  "Unknown": lambda _: True,
}

df_c = df.copy()
df_c['category'] = df_c.apply(lambda row: [category_name for (category_name, category_predicate) in predicates.items() if category_predicate(row['class']) or category_name in row['class']][0], axis=1)

for k,v in df_c['category'].value_counts().to_dict().items():
  print(k, v)

Ordinary chondrite 40203
Achondrite 1547
Carbonaceous chondrite 1477
Iron 1069
Enstatite chondrite 509
Unknown 457
Stony-iron 331
Primitive achondrite 121
Kakangari chondrite 2


In [4]:
uncategorised = df[df_c['category'] == 'FUCK KNOWS']['class']
uncategorised.value_counts()

Series([], Name: class, dtype: int64)

In [5]:
df[df_c['category'] == 'Iron']

Unnamed: 0,name,id,name_type,class,mass,fall,year,lat,long,geolocation
16,Akyumak,433,Valid,"Iron, IVA",50000.0,Fell,1981.0,39.91667,42.81667,"(39.91667, 42.81667)"
61,Avce,4906,Valid,"Iron, IIAB",1230.0,Fell,1908.0,46.00000,13.50000,"(46.0, 13.5)"
66,Bahjoi,4922,Valid,"Iron, IAB-sLL",10322.0,Fell,1934.0,28.48333,78.50000,"(28.48333, 78.5)"
70,Ban Rong Du,4934,Valid,"Iron, ungrouped",16700.0,Fell,1993.0,16.66667,101.18333,"(16.66667, 101.18333)"
129,Bogou,5097,Valid,"Iron, IAB-MG",8800.0,Fell,1962.0,12.50000,0.70000,"(12.5, 0.7)"
...,...,...,...,...,...,...,...,...,...,...
45703,Zenda,30400,Valid,"Iron, IAB complex",3700.0,Found,1955.0,42.51333,-88.48944,"(42.51333, -88.48944)"
45705,Zerhamra,30403,Valid,"Iron, IIIAB-an",630000.0,Found,1967.0,29.85861,-2.64500,"(29.85861, -2.645)"
45707,Zhaoping,54609,Valid,"Iron, IAB complex",2000000.0,Found,1983.0,24.23333,111.18333,"(24.23333, 111.18333)"
45708,Zhigansk,30405,Valid,"Iron, IIIAB",900000.0,Found,1966.0,68.00000,128.30000,"(68.0, 128.3)"


In [6]:
(len(uncategorised) / len(df)) * 100

0.0

In [7]:
df['class'].value_counts()

L6          8339
H5          7164
L5          4817
H6          4529
H4          4222
            ... 
CH/CBb         1
H/L~4          1
LL3.7-6        1
H3.7/3.8       1
L/LL           1
Name: class, Length: 455, dtype: int64

In [14]:
df_c = df_c.dropna()

In [15]:
df_c = df_c.astype({"year": int})

In [17]:
df_c.to_csv('meteorites-clean.csv', index=False)

In [16]:
df_c.dtypes

name            object
id               int64
name_type       object
class           object
mass           float64
fall            object
year             int32
lat            float64
long           float64
geolocation     object
category        object
dtype: object