In [None]:
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import scipy.optimize as opt
import scipy.stats as stats

In [None]:
# Read the table into a structured array: field names are taken from the
# file itself (names=True) and each column's type is inferred (dtype=None).
data = np.genfromtxt(
    "metals.dat",
    dtype=None,
    names=True,
)

# show the structured dtype, i.e. every column name we just got for free
data.dtype

In [None]:
# Start with a quick look at the raw data: 'FE_H' plotted against 'R_GC'.

fig, ax = plt.subplots(figsize=(15, 10))
ax.scatter(
    data['R_GC'],
    data['FE_H'],
    s=20,
    c='black',
)

# Cool! The points fall roughly along a straight line, so we can assume the function we are fitting *is* linear

In [None]:
# Next, run a linear least-squares regression on the data using
# linregress from scipy's stats module.
# Nicely, there is no model function to write: hand over the x and y values and go!

fit = stats.linregress(data['R_GC'], data['FE_H'])

# report slope, intercept and correlation coefficient, one value per line
print(fit.slope, fit.intercept, fit.rvalue, sep="\n")

In [None]:
# evenly spaced sample points at which the fitted lines get evaluated for plotting
x = np.linspace(5, 20, num=1000)


# first, a straight line written as an ordinary named function
def line(x, m, b):
    """Return the straight line m*x + b evaluated at x."""
    return m * x + b


# and the very same straight line again, this time as an anonymous (lambda) function
line_l = lambda x, m, b: m * x + b

In [None]:


# Overplot the regression line from `fit` on top of the data points.
fig = plt.figure(figsize=(15,10))
ax = fig.add_subplot(111)
ax.scatter(data['R_GC'],data['FE_H'],s=20,c='black')
# evaluate the fitted straight line with the `line` helper defined earlier in the notebook
ax.plot(x,line(x,fit.slope,fit.intercept))

# Exercise: set limits of the plot which make sense and play with the alpha values.
# Label the axes (x = radius and y = metals).
# Make sure your axis labels and tick marks are the right sizes.

In [None]:
# We seem to have an outlier, so let's look at the residuals to confirm.
# residual = observed value minus the value predicted by the fitted line
# (`line` is the straight-line helper defined earlier in the notebook)
residual = data['FE_H']-line(data['R_GC'],fit.slope,fit.intercept)
x = np.linspace(5,20,1000)

fig = plt.figure(figsize=(15,10))
ax = fig.add_subplot(111)
ax.scatter(data['R_GC'],residual,s=15,c='black')
# horizontal reference line at zero residual
ax.plot(x,x*0.0,color='purple')
ax.set_xlim(6,16)

# flag every point whose residual magnitude exceeds the 0.2 cutoff and circle it
outlier = np.absolute(residual)>0.2
ax.scatter(data['R_GC'][outlier],residual[outlier],s=60,edgecolor='cyan',facecolor='none')

# Yep! that's an outlier!

In [None]:
# Exercise: Mask the data to exclude the outlier and rerun the linear regression.
# Plot both fits on the same plot to compare them.

# ~outlier selects only the points that were not flagged as outliers
fit_new = stats.linregress(data['R_GC'][~outlier],data['FE_H'][~outlier])
# report the parameters of the new fit (not those of the original `fit`)
print (fit_new.slope)
print (fit_new.intercept)

fig = plt.figure(figsize=(15,10))
ax = fig.add_subplot(111)
ax.scatter(data['R_GC'],data['FE_H'],s=20,c='black')
# label each line so the two fits can be told apart in the legend
ax.plot(x,line(x,fit.slope,fit.intercept),label='fit to all points')
ax.plot(x,line(x,fit_new.slope,fit_new.intercept),label='fit without outlier')
ax.legend()

# check the correlation coefficient for the data set without the outlier


In [None]:
# Play around with the cutoff for the outliers and compare the fitted functions and the correlation coefficients.

In [None]:
# The data also come with error bars, which are a good idea to plot.
fig, ax = plt.subplots(figsize=(15, 10))
# zorder puts the points (2) on top of the error bars (0)
ax.scatter(
    data['R_GC'], data['FE_H'],
    s=20, c='black', zorder=2,
)
ax.errorbar(
    data['R_GC'], data['FE_H'],
    yerr=data['FE_H_ERR'],
    fmt='k.', ecolor='blue', zorder=0,
)

# overlay both fits: first `fit`, then `fit_new`
ax.plot(x, line(x, fit.slope, fit.intercept))
ax.plot(x, line(x, fit_new.slope, fit_new.intercept))



In [None]:
# Exercise is to do the same thing for the oxygen data

In [None]:
# Challenge exercise to do the same thing (including removal of the outliers) using curve_fit