In [26]:
# Notebook introduction: both sentences are emitted by a single print call, one per line.
print('This notebook is essentially a pared down and re-organized version of all the (scratch) work found in milky_way_stuff.ipynb.',
      'Here you will find the data to recreate my project, as well as some functions to make the data preparation smoother.',
      sep='\n')

This notebook is essentially a pared down and re-organized version of all the (scratch) work found in milky_way_stuff.ipynb.
Here you will find the data to recreate my project, as well as some functions to make the data preparation smoother.


In [15]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sb
from wordcloud import WordCloud
import itertools
import astropy.coordinates as coord
from astropy.io import ascii
import astropy.units as u

%matplotlib inline
plt.style.use('ggplot')

##### *Download data*

##### Visit https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=PSCompPars to collect data



In [16]:
# Read in data
# Loads the CSV export into the dataframe `exos`. The path is relative, so the notebook
# must be run from the directory that contains the `data/` folder.
# NOTE(review): the file name suggests a PSCompPars export taken on 2021-07-12 from the
# archive linked above - confirm when refreshing the data.

exos = pd.read_csv('data/PSCompPars_2021.07.12_12.50.05.csv')

In [17]:
# Functions!

In [18]:
def relevant_df_maker(df):
    '''Interactively builds a dataframe containing only the columns the user asks for.

    The function first prints every column name of the original dataframe (one per line), followed by the
    column count, for your convenience. It then asks how many columns to keep and prompts for each column
    name in turn. You must use the exact spelling and punctuation of the desired column name(s); leading and
    trailing whitespace is ignored. A name that is not a column of df is rejected with a message and asked
    for again, so a single typo does not throw away everything typed so far.

    # Inputs:
    df - a pandas dataframe

    # Outputs:
    new_df - a pandas dataframe holding the requested columns, in the order they were entered

    # Raises:
    ValueError - if the answer to the "how many fields" prompt is not an integer, or if columns are
                 requested from a dataframe that has none'''

    col_list = list(df)
    # Column labels are not guaranteed to be strings, so convert before joining.
    print("\n".join(map(str, col_list)))
    print('There are ' + str(len(col_list)) +' columns in your original dataframe.')

    num_fields = int(input('How many fields would you like in your new dataframe? '))

    # Without this guard the validation loop below could never be satisfied.
    if num_fields > 0 and not col_list:
        raise ValueError('The original dataframe has no columns to choose from.')

    # Typed input is always text, so look labels up by their text form; this also lets
    # non-string labels (e.g. integer column names) be selected.
    label_by_text = {str(col): col for col in col_list}

    fields = []

    while len(fields) < num_fields:
        field = input('Please type in the TABLE NAME of a column you would like to include in the new dataframe: ').strip()
        if field not in label_by_text:
            # Re-prompt instead of failing later with a KeyError on df[fields].
            print(f'"{field}" is not a column in your original dataframe. Please check the spelling and try again.')
            continue
        fields.append(label_by_text[field])

    new_df = df[fields]

    return new_df

In [19]:
def metric_scatterplot_maker(data):
    '''Draws a scatterplot of two user-chosen columns, using a user-chosen marker color.

    The user is prompted, in order, for the x-axis column name, the y-axis column name and the marker
    color (the color text is lower-cased before use). The plot is drawn on the current matplotlib
    figure, which is resized to 20 x 8 inches.

    # Inputs:
    data - a pandas dataframe

    # Outputs:
     None'''

    x_col = input('Please enter the column name of the x-axis data:')
    y_col = input('Please enter the column name of the y-axis data:')
    marker_color = input('Please specify the color of the marker:').lower()

    # resize whichever figure is current before drawing on it
    plt.gcf().set_size_inches(20, 8)

    scatter_kwargs = dict(x=data[x_col], y=data[y_col], data=data, color=marker_color)
    axes = sb.scatterplot(**scatter_kwargs)

    # label the axes with the column names and switch the grid off
    axis_font = 20
    axes.set_xlabel(xlabel=x_col, fontsize=axis_font)
    axes.set_ylabel(ylabel=y_col, fontsize=axis_font)
    axes.set_title(label=f'Terrestrial Exoplanet {x_col} vs. {y_col}', fontsize=26)
    axes.grid(False)

In [20]:
def discmethod_dfcreator(df, discmethod):
    '''Builds the running (cumulative) per-year count of rows found by one discovery method.

    Rows whose "discoverymethod" equals discmethod are kept, grouped by "disc_year" and counted; the
    yearly counts are then summed cumulatively from year to year. The result is indexed by "disc_year"
    and every remaining column holds the cumulative count of its non-null entries.

    # Inputs:
    df - a pandas dataframe with "discoverymethod" and "disc_year" columns
    discmethod - the name of a discovery method as it appears in the "discoverymethod" field of the original dataframe

    # Outputs:
    a pandas dataframe of cumulative counts, one row per discovery year'''

    uses_method = df['discoverymethod'] == discmethod
    yearly_counts = df[uses_method].groupby('disc_year').count()
    return yearly_counts.cumsum()

In [21]:
def bootstrap_means(data, n_samples=10000):
    '''A simple function to bootstrap data, if so desired.

    Each bootstrap sample is drawn from data with replacement and has the same length as data; the
    mean of every sample is collected.

    # Inputs:
    data - one-dimensional numeric data, e.g. a single dataframe column (data['column_name']), a list or a numpy array
    n_samples - int specifying the number of bootstrapped samples to create. Default is 10,000 bootstrap samples.

    # Outputs:
    bootstrap_samples_mean - list of length n_samples containing the mean of each bootstrapped sample'''

    # Convert once so that every resample draws from the same plain array.
    values = np.asarray(data)
    sample_size = len(values)

    bootstrap_samples_mean = []

    for _ in range(n_samples):
        bootstrap = np.random.choice(values, size=sample_size, replace=True)
        # The original referenced an undefined name here (`boot`), which raised NameError on every call.
        bootstrap_samples_mean.append(np.mean(bootstrap))

    return bootstrap_samples_mean

In [22]:
def metric_mean_distribution(data):
    '''Plots the normal distribution of a metric's mean, with crimson lines marking the left and right bounds of a user-specified confidence interval.

    The curve is a normal distribution centred on the mean of data whose scale is the standard error
    (np.std(data) / sqrt(len(data))). The user must input a name for the metric being analyzed (to be used
    in the title of the chart), a confidence interval as a decimal, and the display color of the distribution.
    If the relevant value for the Earth is known, the commented-out lines referencing the variable "earth"
    may be un-commented to add in the additional functionality of comparing exoplanet data to that of the Earth.

    # Inputs:
    data - the values of the metric you wish to examine, i.e. a single column. Use the format data['column_name']

    # Outputs:
    None returned, however the function will print the numeric values for mean, standard error, lower bound of the CI, and upper bound of the CI

    # Raises:
    ValueError - if the confidence interval entered is not a number strictly between 0 and 1'''

    #establish the metric
    metric = input('What metric are you seeking the mean of?').capitalize()
    ci = float(input('What confidence interval do you seek? Please enter a percentage between 0 and 1.'))
    #earth = float(input("What is Earth's mean value for this statistic?"))
    colour = input('What color shall this plot be?')

    # Outside (0, 1) the tail probabilities below are invalid and the bounds come out as NaN,
    # which used to produce an empty plot without any explanation.
    if not 0 < ci < 1:
        raise ValueError(f'The confidence interval must be strictly between 0 and 1, got {ci}.')

    #tail probabilities on either side of the interval
    l_bound = (1-ci)/2
    u_bound = 1-l_bound

    metric_mean = np.mean(data)
    metric_std = np.std(data)
    metric_std_err = metric_std/np.sqrt(len(data))

    #confirm the metric and display the mean and standard error
    print(f'For {metric}, the mean is {metric_mean} and the standard error is {metric_std_err}.')

    #create distribution and determine confidence interval
    normal_metric = stats.norm(metric_mean,metric_std_err)
    l_bound_ci = normal_metric.ppf(l_bound)
    u_bound_ci = normal_metric.ppf(u_bound)

    print('The lower bound of the ' + str(ci*100) + f'% confidence interval is {l_bound_ci} and the upper bound is {u_bound_ci}')

    #plot it
    _, ax = plt.subplots(1,figsize=(20,12))

    # Pad the plotting window by 1/8 of each bound's magnitude. Using abs() keeps the padding pointing
    # outwards when a bound is negative; without it the window shrank (or even inverted) for negative
    # metrics and the CI lines fell outside the plotted curve. Positive data is plotted exactly as before.
    x_min = l_bound_ci - abs(l_bound_ci)/8
    x_max = u_bound_ci + abs(u_bound_ci)/8
    x = np.linspace(x_min,x_max,4500)
    density = normal_metric.pdf(x)

    ax.plot(x, density, color=colour)
    ax.axvline(l_bound_ci, ymin=0.045, color='crimson')
    ax.axvline(u_bound_ci, ymin=0.045, color='crimson')
    #shade the area under the curve that lies inside the confidence interval
    ax.fill_between(x,density, where=((x>=l_bound_ci)&(x<=u_bound_ci)), color=colour, alpha=0.5)
    #ax.axvline(earth, color='teal')
    ax.set_title(label=(f'Average Exoplanet {metric} w/ '+str(ci*100)+'% CI'), fontsize=20)

In [25]:
# Extras
# Optional snippets. The one-line snippets are commented out and the multi-line snippets are wrapped
# in triple-quoted strings, so nothing in this cell runs except the final print. Copy a snippet into
# its own cell and fill in the placeholder names to use it.

# Select the rows whose value in a specific column passes a comparison (less than, equal to, greater than, etc.) against a specific value.
# Note: because of the trailing `.index`, this yields the index labels of the matching rows; drop `.index` to get the filtered dataframe itself.
#your_data_limited_by_metric = output_of_relevant_df_maker[(output_of_relevant_df_maker['LIMITING_METRIC_COLUMN_NAME'] > int_here)].index



# Using Astropy to map the right ascension and declination of exoplanet systems:
# (reads the 'ra' and 'dec' columns as degrees, wraps right ascension at 180 degrees, draws the points on a Mollweide projection and saves the figure)
'''
stars = ascii.read('data/PSCompPars_2021.07.12_12.50.05.csv')

ra = coord.Angle(stars['ra']*u.degree)
ra = ra.wrap_at(180*u.degree)
dec = coord.Angle(stars['dec']*u.degree)

fig = plt.gcf()
fig.set_size_inches(30,12)

ax = fig.add_subplot(111, projection="mollweide")
ax.scatter(ra.radian, dec.radian, s=5, color='brown')
ax.set_xlabel(xlabel='Right Ascension (Degrees)', fontsize = 20)
ax.set_ylabel(ylabel='Declination (Degrees)',fontsize = 20)
ax.set_title(label='Exoplanets As Seen From Earth', fontsize=26)
ax.grid(False)

plt.savefig('YOUR_PATH_HERE.png')
'''



# Count the number of occurrences of each value in a column:
#df['column_name'].value_counts()



# Build a two-column dataframe (NAME, COUNT) holding the n most frequent values of a name column and how often each occurs,
# e.g. the top n places by number of events recorded there:
'''
n = integer
prolific_n_counts = your_data['NAME_COLUMN'].value_counts()[:n].tolist()
prolific_n_names = your_data['NAME_COLUMN'].value_counts()[:n].index.tolist()

prolific_dict = dict(zip(prolific_n_names,prolific_n_counts))

prolific_df = pd.DataFrame(prolific_dict.items(), columns=['NAME', 'COUNT'])
'''

# Sign-off message shown when the cell is run.
print('Thanks for starting here! Live long and prosper :)')

Thanks for starting here! Live long and prosper :)
