In [None]:
import os
import pandas as pd
import csv
import re
import numpy as np

In [None]:
# List the entries of the MB_pages directory and store their paths in a sorted list.
# Paths are ordered by length first (so e.g. a name ending in "2" sorts before one
# ending in "10") and alphabetically within the same length, because os.listdir
# returns entries in arbitrary order and sorting by length alone would leave
# equal-length paths in that arbitrary order.
directory = r'MB_pages'
MB_pages = os.listdir(directory)
MB_path_pages = sorted(
    (os.path.join(directory, page_name) for page_name in MB_pages),
    key=lambda page_path: (len(page_path), page_path),
)

In [None]:
def read_MB(page_path):
    """Load one '|'-separated MB page and return it with rearranged columns.

    The 'Antarctic' column is removed when present, 'Mass' is re-added at the
    end under the name 'Mass (g)', and four empty-string columns ('Country',
    'Long', 'Lat', 'GeoLocation') are appended. The resulting eleven columns
    are then reordered purely by position.
    """
    page = pd.read_csv(page_path, sep='|')

    # This column is not needed downstream.
    if 'Antarctic' in page.columns:
        page = page.drop(columns='Antarctic')

    # Move the mass values to the end of the frame under their new name.
    mass_values = page.pop('Mass')
    page['Mass (g)'] = mass_values

    # Placeholder columns, appended in this exact order.
    for placeholder in ('Country', 'Long', 'Lat', 'GeoLocation'):
        page[placeholder] = ''

    # Positional reorder: relies on the frame having exactly eleven columns here.
    column_order = [0, 1, 2, 3, 8, 4, 7, 5, 6, 9, 10]
    return page.iloc[:, column_order]



In [None]:
# Convert every page of the MB database to a dataframe and concatenate them.
# All pages are read first and concatenated once: calling pd.concat inside the
# loop would copy the growing frame on every iteration. With an empty page list
# pd.concat raises a ValueError.
page_frames = [read_MB(page_path) for page_path in MB_path_pages]
df = pd.concat(page_frames, ignore_index=True, sort=False)



In [None]:
# Redefine mass and year for every row.
# The values are parsed by two small helpers and written back column-wise, so the
# frame is never modified while it is being iterated and no single cells are
# assigned into a column whose dtype may not accept them.
_MASS_NUMBER_RE = re.compile(r"[-+]?(?:\d*\.*\d+)")


def _mass_to_grams(raw_mass):
    """Return the mass in grams as a float, or NaN when it cannot be determined.

    A value that float() accepts directly is taken as grams. Otherwise the first
    number found in the text is scaled by the unit given by the last two
    characters: 'kg', ' g', 'mg' or ' t'. Any other ending, text without a
    number, and a mass of zero all yield NaN.
    """
    text = str(raw_mass)
    try:
        grams = float(text)
    except ValueError:
        numbers = _MASS_NUMBER_RE.findall(text)
        if not numbers:
            return np.nan
        try:
            value = float(numbers[0])
        except ValueError:
            # The pattern tolerates repeated dots (e.g. "1..5"), which float() rejects.
            return np.nan
        unit = text[-2:]
        if unit == 'kg':
            grams = value * 1000
        elif unit == ' g':
            grams = value
        elif unit == 'mg':
            grams = value / 1000
        elif unit == ' t':
            grams = value * 1000000
        else:
            return np.nan
    # A mass of zero is treated as missing.
    return np.nan if grams == 0.0 else grams


def _year_to_int(raw_year):
    """Return the year as an int, or None when the text contains no digits.

    If int() rejects the text, the first run of digits in it is used instead.
    """
    text = str(raw_year)
    try:
        return int(text)
    except ValueError:
        found = re.search(r'\d+', text)
        return int(found.group()) if found is not None else None


df['Mass (g)'] = [_mass_to_grams(value) for value in df['Mass (g)']]

parsed_years = [_year_to_int(value) for value in df['Year']]
has_year = np.array([year is not None for year in parsed_years], dtype=bool)
# Stored as Python ints in an object column; rows without a year are removed below.
df['Year'] = pd.Series(parsed_years, index=df.index, dtype=object)
df = df[has_year].reset_index(drop=True)

In [None]:
# Redefine and filter fall, (lat, long), type and country for every row.
# Each column is derived by a small helper and written back column-wise; rows
# whose place ends in "Mars" or "Moon" are removed afterwards instead of being
# dropped while the frame is iterated.
_FELL_CODES = ('Y', 'Yc', 'Yp')
_COORDS_RE = re.compile(r'\(.*?\)')
_COUNTRY_ALIASES = {
    "(Sahara)": "Sahara",
    "(Nothwest Africa)": "Northwest Africa",
    "(Northwest Africa)": "Northwest Africa",
    "(Northeast Africa)": "Northeast Africa",
    "unknown": "",
    "(Unknown)": "",
    "Unknown": "",
    "(unknown)": "",
    "Morocco?": "Morocco",
    "Morocco (Erfoud)": "Morocco",
    "United States?": "USA",
    "United States": "USA",
}
_EXCLUDED_PLACES = ("Mars", "Moon")


def _parse_coords(raw_coords):
    """Return (geolocation_text, lat, long) for one '(Lat,Long)' cell.

    The text inside the first pair of parentheses is split on ', ' and its first
    two parts are converted to floats. Cells that are not strings or contain no
    parenthesised part yield ('', NaN, NaN). A parenthesised part that does not
    hold two float-convertible values raises, so malformed data is not hidden.
    """
    if isinstance(raw_coords, str):
        found = _COORDS_RE.search(raw_coords)
        if found is not None:
            inner = found.group()[1:-1]
            parts = inner.split(', ')
            return inner, float(parts[0]), float(parts[1])
    return '', np.nan, np.nan


def _strip_type_markers(raw_type):
    """Cut a type string at its '&sect;', '#' or '&para;' marker.

    For the two entity markers the cut is made at the first '&' of the string.
    Values that are not strings (e.g. NaN) are returned unchanged.
    """
    if not isinstance(raw_type, str):
        return raw_type
    cleaned = raw_type
    if '&sect;' in cleaned:
        cleaned = cleaned[:cleaned.find('&')]
    if '#' in cleaned:
        cleaned = cleaned[:cleaned.find('#')]
    if '&para;' in cleaned:
        cleaned = cleaned[:cleaned.find('&')]
    return cleaned


def _country_from_place(raw_place):
    """Return the country for a 'Place' cell, or None when the row must be dropped.

    The country is the last ', '-separated part of the place. Known spellings
    are mapped through _COUNTRY_ALIASES, a remaining '?' cuts the name at that
    character, and a last part equal to "Mars" or "Moon" yields None. Missing or
    blank places yield ''.
    """
    place = str(raw_place)
    # All three exclusions must hold at once; joining them with `or` is always true.
    if place.strip() in ('', 'nan'):
        return ''
    last_part = place.split(', ')[-1]
    if last_part in _COUNTRY_ALIASES:
        return _COUNTRY_ALIASES[last_part]
    if '?' in last_part:
        return last_part.split('?')[0]
    if last_part in _EXCLUDED_PLACES:
        return None
    return last_part


df['Fall'] = ['Fell' if code in _FELL_CODES else 'Found' for code in df['Fall']]

parsed_coords = [_parse_coords(cell) for cell in df['(Lat,Long)']]
df['GeoLocation'] = [geo for geo, _, _ in parsed_coords]
df['Lat'] = [lat for _, lat, _ in parsed_coords]
df['Long'] = [long for _, _, long in parsed_coords]

df['Type'] = [_strip_type_markers(cell) for cell in df['Type']]

countries = [_country_from_place(cell) for cell in df['Place']]
keep_row = np.array([country is not None for country in countries], dtype=bool)
df['Country'] = ['' if country is None else country for country in countries]

df = df[keep_row].drop('(Lat,Long)', axis=1)
df = df.reset_index(drop=True)

In [None]:
# Convert columns to the right datatype
for numeric_column in ("Mass (g)", "Year", "Lat", "Long"):
    df[numeric_column] = pd.to_numeric(df[numeric_column])
df = df.fillna(np.nan)
# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (that would add a stray "None" line to the output).
df.info()
df

In [None]:
# Save the dataframe as a '|'-separated csv file; every non-numeric field is quoted
df.to_csv(
    'MB_meteorite_data.csv',
    index=False,
    sep='|',
    quoting=csv.QUOTE_NONNUMERIC,
)