# Hypothesis 2-mass on each class in MB data

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap # pip install basemap
import matplotlib as mpl
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp

In [None]:
def theoretical_lognorm(mu, sigma, x):
    """Evaluate a Gaussian density with centre ``mu`` and spread ``sigma`` at ``x``.

    No logarithm is taken inside this function: to model a log-normal
    quantity, callers pass values that are already in log space.

    Args:
        mu: Centre (mean) of the distribution.
        sigma: Standard deviation; it is used as a divisor.
        x: Point(s) at which to evaluate the density (scalar or NumPy array).

    Returns:
        The density value(s) at ``x``.
    """
    normalisation = 1 / (sigma * np.sqrt(2 * np.pi))
    exponent = -(x - mu)**2 / (2 * sigma**2)
    return normalisation * np.exp(exponent)

# Load both datasets. The paths are relative to the current working directory.
# Forward slashes are used as separators because they are accepted on Windows
# as well as on Linux/macOS, whereas a backslash is only a separator on Windows.
# The MB file is read with '|' as the field separator, the NASA file with ','.
dataset = pd.read_csv("Meteoritical Bulletin Database/MB_meteorite_data.csv", sep='|')
dataset_nasa = pd.read_csv("NASA database/Meteorite_Landings.csv", sep=',')

# print(dataset.head())
# print(dataset_nasa.head())

In [None]:
# Keep only rows whose coordinates lie strictly inside the valid ranges:
# longitude in (-180, 180) and latitude in (-90, 90). A NaN coordinate
# compares False against every bound, so such rows are removed as well.
mb_long, mb_lat = dataset["Long"], dataset["Lat"]
mb_valid = (mb_long < 180) & (mb_long > -180) & (mb_lat < 90) & (mb_lat > -90)
dataset = dataset[mb_valid]

nasa_long, nasa_lat = dataset_nasa['reclong'], dataset_nasa['reclat']
nasa_valid = (nasa_long < 180) & (nasa_long > -180) & (nasa_lat < 90) & (nasa_lat > -90)
dataset_nasa = dataset_nasa[nasa_valid]


# Fallen meteorites come from the NASA table, found meteorites from the MB
# table. dropna() discards every row that has a missing value in any column.
is_fell = dataset_nasa["fall"] == "Fell"
is_found = dataset["Fall"] == "Found"
fallen = dataset_nasa[is_fell].dropna()
found = dataset[is_found].dropna()

# The 15 most frequent classes among the fallen meteorites, with their counts.
classes = fallen['recclass'].value_counts().head(15)
print(classes)

In [None]:
# For every selected class, compare the log10 mass distribution of the fallen
# meteorites with that of the found meteorites:
#   1. histograms with a fitted normal curve (in log10 space),
#   2. bootstrap of the found-sample mean and KS tests against the fallen sample,
#   3. CDF comparison and KS tests of each sample against its own fitted normal.
for class_ in classes.index:
    fallen_class = fallen[fallen["recclass"] == class_]
    found_class = found[found["Type"] == class_]

    # Work with log10(mass). A mass of 0 maps to -inf and a negative mass to
    # NaN; both fail the "> -10" test and are dropped. The filter is applied
    # to BOTH samples: a single -inf would otherwise leave plt.hist without a
    # finite bin range and turn the mean/std into -inf/NaN.
    log_mass_fallen = np.log10(fallen_class["mass (g)"])
    log_mass_fallen = log_mass_fallen[log_mass_fallen > -10]
    log_mass_found = np.log10(found_class["Mass (g)"])
    log_mass_found = log_mass_found[log_mass_found > -10]

    # np.random.choice raises on an empty population, so a class without
    # usable masses in either table cannot be analysed.
    if log_mass_fallen.empty or log_mass_found.empty:
        print(f"Skipping class {class_}: no usable mass data in one of the datasets")
        continue

    # Plot the histograms and build the empirical CDFs from the bin densities.
    # plt.hist returns (bin values, bin edges, patches); the edges are kept so
    # that the CDFs can later be drawn against mass rather than bin index.
    fallen_density, fallen_edges, _ = plt.hist(
        log_mass_fallen, bins=100, density=True, alpha=0.5, label='Fallen')
    fallen_cdf = np.cumsum(fallen_density) / np.sum(fallen_density)

    found_density, found_edges, _ = plt.hist(
        log_mass_found, bins=100, density=True, alpha=0.5, label='Found')
    found_cdf = np.cumsum(found_density) / np.sum(found_density)
    plt.xlabel('Mass (log10(g))')
    plt.ylabel('Probability density')
    plt.title(f'Mass distribution of fallen and found meteorites of class: {class_}')

    # Determine the mean and standard deviation of the mass distributions. Add a vertical line for the means.
    # Fit a normal distribution to the mass distributions and plot the fitted distributions.
    fallen_mean_mass = np.mean(log_mass_fallen)
    fallen_std_mass = np.std(log_mass_fallen)
    found_mean_mass = np.mean(log_mass_found)
    found_std_mass = np.std(log_mass_found)
    space = np.linspace(-2, 10, 1000)
    plt.axvline(x=fallen_mean_mass, color='b', linestyle='--')
    plt.axvline(x=found_mean_mass, color='r', linestyle='--')

    fit_fallen = theoretical_lognorm(fallen_mean_mass, fallen_std_mass, space)
    fit_found = theoretical_lognorm(found_mean_mass, found_std_mass, space)

    # Plot fitted normal distribution
    plt.plot(space, fit_found, color='r', label='(Log10) Normal distribution fitted to found meteorites')
    plt.plot(space, fit_fallen, color='b', label='(Log10) Normal distribution fitted to fallen meteorites')
    plt.legend()
    plt.show()
    plt.clf()

    # Different bootstrap sample sizes; the last one equals the full found sample.
    bootstrap_sizes = [5, 25, 100, 200, 500, len(log_mass_found)]
    bootstrap_means = []
    pval_means = []

    for size in bootstrap_sizes:
        bootstrap_means_for_size = []
        pvals = []
        # Calculate bootstrap means and p-values for each bootstrap sample size
        for _ in range(100):
            bootstrap_sample = np.random.choice(log_mass_found, size=size, replace=True)
            bootstrap_means_for_size.append(np.mean(bootstrap_sample))
            _, p_value = ks_2samp(log_mass_fallen, bootstrap_sample)
            pvals.append(p_value)
        pval_means.append(np.mean(pvals))
        bootstrap_means.append(bootstrap_means_for_size)

    # For demonstration, plot the histograms of the bootstrap means and the 95% confidence interval for the largest sample size.
    plt.hist(bootstrap_means[-1], bins=20, density=True, alpha=0.7, label='Means of bootstrap samples')
    plt.axvline(x=np.percentile(bootstrap_means[-1], 2.5), color='r', linestyle='--', label='95% confidence interval')
    plt.axvline(x=np.percentile(bootstrap_means[-1], 97.5), color='r', linestyle='--')
    # Blue, so the fallen mean is distinguishable from the red interval bounds.
    plt.axvline(x=fallen_mean_mass, color='b', label=f'Mean of fallen meteorites of class: {class_}')
    plt.title('Confidence interval test using the bootstrapped means of the mass distribution of found meteorites')
    plt.legend()
    plt.show()
    plt.clf()

    # Plot the average p-values for the different bootstrap sample sizes
    plt.plot([str(size) for size in bootstrap_sizes], pval_means)
    plt.title('p-value of KS test for different bootstrap sample sizes')
    plt.xlabel('Bootstrap sample size')
    plt.ylabel('p-value')
    plt.show()
    plt.clf()

    # Create histograms and cdfs of the fitted distributions
    fit_fallen_dist = np.random.normal(fallen_mean_mass, fallen_std_mass, 5000)
    fit_found_dist = np.random.normal(found_mean_mass, found_std_mass, 5000)

    fit_fallen_hist, fit_fallen_edges = np.histogram(fit_fallen_dist, bins=100, density=True)
    fit_fallen_cdf = np.cumsum(fit_fallen_hist) / np.sum(fit_fallen_hist)

    fit_found_hist, fit_found_edges = np.histogram(fit_found_dist, bins=100, density=True)
    fit_found_cdf = np.cumsum(fit_found_hist) / np.sum(fit_found_hist)

    # Plot the cdfs of the fitted distributions and the original distributions.
    # Each CDF value is drawn at the upper edge of its bin: the four histograms
    # all have different bin edges, so plotting against the bin index would put
    # the fitted and the empirical curve on incompatible x-axes.
    plt.figure(figsize=[10, 6])
    plt.subplot(1, 2, 1)
    plt.title('CDFs of fitted and original fallen distribution')
    plt.plot(fit_fallen_edges[1:], fit_fallen_cdf, label='Fitted CDF')
    plt.plot(fallen_edges[1:], fallen_cdf, label='CDF')
    plt.xlabel('Mass (log10(g))')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.title('CDFs of fitted and original found distribution')
    plt.plot(fit_found_edges[1:], fit_found_cdf, label='Fitted CDF')
    plt.plot(found_edges[1:], found_cdf, label='CDF')
    plt.xlabel('Mass (log10(g))')
    plt.legend()
    plt.tight_layout()
    plt.show()
    # Release the figure created above so figures do not pile up over the loop.
    plt.close()

    # Run KS test on the fitted distributions, comparing them to the real distributions
    pvals_fallen = []
    pvals_found = []

    for _ in range(200):
        fit_fallen_dist = np.random.normal(fallen_mean_mass, fallen_std_mass, 5000)
        fit_found_dist = np.random.normal(found_mean_mass, found_std_mass, 5000)

        pvals_fallen.append(ks_2samp(fit_fallen_dist, log_mass_fallen)[1])
        pvals_found.append(ks_2samp(fit_found_dist, log_mass_found)[1])

    print("p-value of KS test for fallen distribution: ", np.mean(pvals_fallen))
    print("p-value of KS test for found distribution: ", np.mean(pvals_found))

    # NOTE(review): this statement is printed regardless of the p-values above;
    # read it together with them before drawing a conclusion.
    print("This means the mass of the fallen meteorites follows the distribution with mean", fallen_mean_mass, "and standard deviation", fallen_std_mass)


# Hypothesis 2-location on each class in MB data

In [None]:


# Keep only rows with coordinates strictly inside the valid ranges
# (longitude in (-180, 180), latitude in (-90, 90)) and drop the rows whose
# GeoLocation matches the excluded placeholder: "" for the MB table and
# "(0.0, 0.0)" for the NASA table.
mb_long, mb_lat = dataset["Long"], dataset["Lat"]
mb_keep = (
    (mb_long < 180) & (mb_long > -180)
    & (mb_lat < 90) & (mb_lat > -90)
    & (dataset['GeoLocation'] != "")
)
dataset = dataset[mb_keep]

nasa_long, nasa_lat = dataset_nasa['reclong'], dataset_nasa['reclat']
nasa_keep = (
    (nasa_long < 180) & (nasa_long > -180)
    & (nasa_lat < 90) & (nasa_lat > -90)
    & (dataset_nasa['GeoLocation'] != "(0.0, 0.0)")
)
dataset_nasa = dataset_nasa[nasa_keep]


# Fallen meteorites come from the NASA table, found meteorites from the MB table.
is_fell = dataset_nasa['fall'] == 'Fell'
is_found = dataset['Fall'] == 'Found'
fallen = dataset_nasa[is_fell]
found = dataset[is_found]

In [None]:
# The 15 most frequent classes among the fallen meteorites.
classes = fallen['recclass'].value_counts().head(15)

# For every selected class, compare where fallen and found meteorites are
# located: scatter plot and 2D histograms, mean positions on a world map,
# bootstrap of the found-sample mean position and KS tests on latitude and
# longitude separately.
for class_ in classes.index:
    fallen_class = fallen[fallen["recclass"] == class_]
    found_class = found[found["Type"] == class_]

    # The bootstrap below draws row indices from found_class, and
    # np.random.choice raises on an empty population, so a class without any
    # found meteorites cannot be analysed.
    if found_class.empty:
        print(f"Skipping class {class_}: no found meteorites of this class")
        continue

    plt.plot(fallen_class['reclong'], fallen_class['reclat'], 'ro', markersize=1)
    plt.show()
    plt.clf()

    # Plot a histogram of the locations of fallen meteorites.
    # Latitude is negated so that the first histogram row holds the
    # northernmost band, which imshow (default origin='upper') draws at the top.
    fallhist = np.histogram2d(-fallen_class['reclat'], fallen_class['reclong'], bins=50, range=[[-90, 90], [-180, 180]])
    plt.imshow(fallhist[0], extent=[-180, 180, -90, 90], norm=mpl.colors.LogNorm(), cmap='Greens')
    plt.colorbar()
    plt.title(f'Location of fallen meteorites of class: {class_}')
    plt.show()
    plt.clf()

    # Plot a histogram of the locations of found meteorites.
    foundhist = np.histogram2d(-found_class['Lat'], found_class['Long'], bins=50, range=[[-90, 90], [-180, 180]])
    plt.imshow(foundhist[0], extent=[-180, 180, -90, 90], norm=mpl.colors.LogNorm(), cmap='Greens')
    plt.colorbar()
    plt.title(f'Location of found meteorites of class: {class_}')
    plt.show()
    plt.clf()

    # Determine the mean latitude and longitude of the fallen and found meteorites. Plot the means on a map.
    fallen_lat_mean = fallen_class['reclat'].mean()
    fallen_long_mean = fallen_class['reclong'].mean()
    found_lat_mean = found_class['Lat'].mean()
    found_long_mean = found_class['Long'].mean()

    # Prepare map (named world_map so the builtin map() is not shadowed)
    world_map = Basemap(projection='cyl')
    world_map.drawmapboundary(fill_color='w')
    world_map.drawcoastlines()

    plt.axhline(y=fallen_lat_mean, color='b', linestyle='-', label='Mean location of fallen meteorites')
    plt.axvline(x=fallen_long_mean, color='b', linestyle='-')
    plt.axhline(y=found_lat_mean, color='r', linestyle='-', label='Mean location of found meteorites')
    plt.axvline(x=found_long_mean, color='r', linestyle='-')
    plt.legend()
    plt.show()
    plt.clf()

    # Perform a Kolmogorov-Smirnov test to determine whether the latitude and longitude
    # of the fallen and found meteorites come from the same distribution.
    bootstrap_sizes = [5, 25, 100, 200, 500, 1000]
    bootstrap_lat_means = []
    bootstrap_long_means = []
    pvals_lat_means = []
    pvals_long_means = []

    for size in bootstrap_sizes:
        bootstrap_lat_means_for_size = []
        bootstrap_long_means_for_size = []
        pvals_lat = []
        pvals_long = []
        for _ in range(100):
            # Resample whole rows so each latitude stays paired with its longitude.
            bootstrap_indices = np.random.choice(len(found_class), size=size, replace=True)
            bootstrap_sample = found_class.iloc[bootstrap_indices]
            bootstrap_lat_means_for_size.append(np.mean(bootstrap_sample['Lat']))
            bootstrap_long_means_for_size.append(np.mean(bootstrap_sample['Long']))
            _, p_value_lat = ks_2samp(fallen_class['reclat'], bootstrap_sample['Lat'])
            _, p_value_long = ks_2samp(fallen_class['reclong'], bootstrap_sample['Long'])
            pvals_lat.append(p_value_lat)
            pvals_long.append(p_value_long)
        pvals_lat_means.append(np.mean(pvals_lat))
        pvals_long_means.append(np.mean(pvals_long))
        bootstrap_lat_means.append(bootstrap_lat_means_for_size)
        bootstrap_long_means.append(bootstrap_long_means_for_size)

    # Plot the histogram of the bootstrap means and the mean locations of the fallen meteorites.
    # Only the largest bootstrap size (the last entry) is shown.
    hist = np.histogram2d([-lat for lat in bootstrap_lat_means[-1]], bootstrap_long_means[-1], bins=50, range=[[-90, 90], [-180, 180]])
    plt.imshow(hist[0], extent=[-180, 180, -90, 90], norm=mpl.colors.LogNorm(), cmap='Greens')
    plt.axvline(x=np.percentile(bootstrap_long_means[-1], 2.5), color='r', linestyle='--', label='95% confidence interval of found meteorite means')
    plt.axvline(x=np.percentile(bootstrap_long_means[-1], 97.5), color='r', linestyle='--')
    plt.axhline(y=np.percentile(bootstrap_lat_means[-1], 2.5), color='r', linestyle='--')
    plt.axhline(y=np.percentile(bootstrap_lat_means[-1], 97.5), color='r', linestyle='--')

    plt.axhline(y=fallen_lat_mean, color='b', linestyle='-', label=f'Mean location of fallen meteorites of class: {class_}')
    plt.axvline(x=fallen_long_mean, color='b', linestyle='-')
    plt.legend()
    plt.show()
    plt.clf()

    # Plot the average p-values for the different bootstrap sample sizes for latitude and longitude.
    size_labels = [str(size) for size in bootstrap_sizes]
    plt.plot(size_labels, pvals_lat_means, label='p-value for latitude for bootstrap sample size')
    plt.plot(size_labels, pvals_long_means, label='p-value for longitude for bootstrap sample size')
    plt.title('p-values for latitude and longitude for different bootstrap sample sizes')
    plt.legend()
    plt.show()
    plt.clf()