# QASSIS: Quality Algorithm to Search Stuff in Spectra

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.signal as sig
from lmfit import minimize, Parameters, fit_report, Model, Parameter
import corner as corner
#%matplotlib inline
#from pylab import rcParams
#rcParams['figure.figsize'] = 15, 10
%matplotlib qt5

In [52]:
# Get spectrum file
spectrum_file = "181_WSW_FTS200_3mm_average_data_Tmb.dat"
spectrum = np.loadtxt(spectrum_file)

frequencies = spectrum[:,0]
intensities = spectrum[:,1]

# Get molecular catalog file
molecule_name = "CH3OH"
molecule_file = "CH3OH.txt"
molecule_catalog = pd.read_csv(molecule_file, delimiter="\t", header=0, index_col=False)

molecule_frequencies = molecule_catalog["Frequency(Ghz)"]
molecule_intensities = molecule_catalog["Intensity(K)"]
molecule_eup = molecule_catalog["Eup(K)"]
molecule_aij = molecule_catalog["Aij"]
molecule_transition = molecule_catalog["Transition"]

### OPTIONAL: Redshift spectrum by $v_{lsr}$ and convert to velocity units

In [53]:
reference_frequency = 93750.7299
vlsr = 7.1e+3 # m/s
c = 2.998*1e8 # m/s 

def redshift(v):
    return np.sqrt( (1 + (v/c)) / (1 - (v/c)) ) - 1 # = z

frequencies_shifted = np.array(frequencies)/(1+redshift(-6.8e+3))
frequencies_shifted = np.array(frequencies_shifted)/(1+redshift(vlsr))

def freq_to_vel(freqs):
    return np.array((c*reference_frequency - c*freqs) / reference_frequency)
   
velocities = freq_to_vel(frequencies_shifted)/1000 # km/s
plt.step(velocities[intensities>-100], intensities[intensities>-100], color="black", linewidth=0.5)

plt.xlabel("Velocity [km/s]", fontsize=15)
plt.ylabel("Intensity [K]", fontsize=15)


frequencies = freq_to_vel(frequencies_shifted)/1000
molecule_frequencies = freq_to_vel(molecule_catalog["Frequency(Ghz)"])/1000

plt.vlines(molecule_frequencies, 0,1, color="red")
plt.show()


In [54]:
# Find possible emission lines
width = 1

peaks = sig.find_peaks(intensities)[0]
peak_intensities = intensities[peaks]
peak_frequencies = frequencies[peaks]

noise = 2*5e-3
rms = [5*5e-3 for p in peak_frequencies]

emission_line_frequencies = []
emission_line_intensities = []
emission_line_eup = []
emission_line_aij = []
emission_line_transition = []

for i,f1 in enumerate(peak_frequencies):
    for j,f2 in enumerate(molecule_frequencies):
        if abs(f1-f2) < width and peak_intensities[i]>rms[i]:
            emission_line_frequencies.append(f1)
            emission_line_intensities.append(peak_intensities[i])
            emission_line_eup.append(molecule_eup[j])
            emission_line_aij.append(molecule_aij[j])
            emission_line_transition.append(molecule_transition[j])
            
print(emission_line_frequencies)

[29514.733809426158, -9557.194615663433, -9563.440633957558, -9573.434199271323, -48425.506745728555]


# Fit Gaussians to Emission Lines


In [56]:
# Define a window for each line

window_size = 15 # default 5

window_frequencies = []
window_intensities = []

for f in emission_line_frequencies:
    window_f = frequencies[abs(frequencies-f)<window_size]
    window_i = intensities[abs(frequencies-f)<window_size]
    window_frequencies.append(window_f)
    window_intensities.append(window_i)
    
# For each window, define what the BASELINE is

baseline_frequencies = []
baseline_intensities = []

for i,window in enumerate(window_frequencies):
    baseline_window_frequencies = []
    baseline_window_intensities = []
    for j,freq in enumerate(window):
        if window_intensities[i][j] <= abs(noise) and abs(freq-emission_line_frequencies[i]) > 2*width:
            baseline_window_frequencies.append(freq)
            baseline_window_intensities.append(window_intensities[i][j])
    baseline_frequencies.append(baseline_window_frequencies)
    baseline_intensities.append(baseline_window_intensities)
    
# Fit a polynomial to the baseline
degree = 2

baseline_polynomials = []

for i,baseline_window_frequencies in enumerate(baseline_frequencies):
    baseline_fit = np.polyfit(baseline_window_frequencies, baseline_intensities[i], degree)
    baseline = np.poly1d(baseline_fit)
    baseline_polynomials.append(baseline)
    
# Subtract baseline polynomial from window intensities

window_intensities_corrected = []

for i,window in enumerate(window_intensities):
    window_intensities_corrected.append(window_intensities[i]-baseline_polynomials[i](window_frequencies[i]))
    
# For each window, define what the LINE is

line_frequencies = []
line_intensities = []

for i,window in enumerate(window_frequencies):
    line_window_frequencies = []
    line_window_intensities = []
    for j,freq in enumerate(window):
        if abs(freq-emission_line_frequencies[i]) <= 4*width and window_intensities_corrected[i][j]>=0:
            line_window_frequencies.append(freq)
            line_window_intensities.append(window_intensities_corrected[i][j])
    line_frequencies.append(line_window_frequencies)
    line_intensities.append(line_window_intensities)
    
# Fit a Gaussian to the line

gaussian_fits = []

def gaussian(x, amp, cen, wid):
    """1-d gaussian: gaussian(x, amp, cen, wid)"""
    return (amp / (np.sqrt(2*np.pi) * wid)) * np.exp(-(x-cen)**2 / (2*wid**2))

for i,line_f in enumerate(line_frequencies):
    x = line_frequencies[i]
    y = line_intensities[i]
    
    # Do least squares first
    gmodel = Model(gaussian)
    params = gmodel.make_params()
    params["amp"] = Parameter(name='amp', value=0.01)
    params["cen"] = Parameter(name='cen', value=emission_line_frequencies[i], min=emission_line_frequencies[i]-3, max=emission_line_frequencies[i]+3)
    params["wid"] = Parameter(name='wid', value=width)
    
    result = gmodel.fit(y, x=x, params=params, method="leastsq")
    
    # Now do MCMC
    
    emcee_kws = dict(steps=5000, burn=100, thin=10)
    
    amp = result.best_values["amp"]
    cen = result.best_values["cen"]
    wid = result.best_values["wid"]
    
    gmodel = Model(gaussian)
    params = gmodel.make_params()
    params["amp"] = Parameter(name='amp', value=amp, min=max(0,amp-1*result.params["amp"].stderr), max=amp+1*result.params["amp"].stderr)
    params["cen"] = Parameter(name='cen', value=cen, min=cen-2, max=cen+2)
    params["wid"] = Parameter(name='wid', value=wid, min=max(0,wid-1*result.params["wid"].stderr), max=wid+1*result.params["wid"].stderr)
    
    result = gmodel.fit(y, x=x, params=params, method="emcee", fit_kws=emcee_kws)

    
    
    gaussian_fits.append(result)
    print(result.fit_report())

# Define a linspace in each window for plotting purposes

window_linspaces = []
for i,window in enumerate(window_frequencies):
    window_linspaces.append(np.linspace(min(window), max(window),10000))

100%|██████████| 5000/5000 [00:33<00:00, 149.87it/s]
  0%|          | 13/5000 [00:00<00:40, 122.40it/s]

[[Model]]
    Model(gaussian)
[[Fit Statistics]]
    # fitting method   = emcee
    # function evals   = 500000
    # data points      = 8
    # variables        = 3
    chi-square         = 2.3418e-04
    reduced chi-square = 4.6835e-05
    Akaike info crit   = -77.5110579
    Bayesian info crit = -77.2727333
[[Variables]]
    amp:  0.02369421 +/- 0.00522440 (22.05%) (init = 0.02369403)
    cen:  29514.7417 +/- 1.39698805 (0.00%) (init = 29514.79)
    wid:  0.37101388 +/- 0.07656311 (20.64%) (init = 0.3710833)


100%|██████████| 5000/5000 [00:33<00:00, 150.69it/s]


[[Model]]
    Model(gaussian)
[[Fit Statistics]]
    # fitting method   = emcee
    # function evals   = 500000
    # data points      = 8
    # variables        = 3
    chi-square         = 3.1421e-04
    reduced chi-square = 6.2842e-05
    Akaike info crit   = -75.1591062
    Bayesian info crit = -74.9207816
[[Variables]]
    amp:  0.29626886 +/- 0.02668162 (9.01%) (init = 0.2961144)
    cen: -9557.16358 +/- 1.35365662 (0.01%) (init = -9557.188)
    wid:  0.43198226 +/- 0.03876783 (8.97%) (init = 0.4318201)


TypeError: unsupported operand type(s) for *: 'int' and 'NoneType'

## Check best fits

In [10]:
for i,line in enumerate(window_intensities):

    plt.subplot2grid((2,3),(i%2,i%3))
    plt.step(window_frequencies[i], window_intensities[i], linewidth=2, c="black")
    plt.step(baseline_frequencies[i], baseline_intensities[i], linewidth=4, c="blue", alpha=0.4)
    plt.scatter(emission_line_frequencies[i], emission_line_intensities[i], s=50, c="red")

    F_linspace = np.linspace(min(window_frequencies[i]), max(window_frequencies[i]),100)
    plt.plot(F_linspace, baseline_polynomials[i](F_linspace), color="blue", linestyle="--", linewidth=3, alpha=0.8)


    plt.step(line_frequencies[i], line_intensities[i], linewidth=5, c="lime", alpha=0.5)

    plt.step(window_frequencies[i], window_intensities_corrected[i], c="red", linewidth=1)

    amp = np.median(gaussian_fits[i].flatchain["amp"])
    cen = np.median(gaussian_fits[i].flatchain["cen"])
    wid = np.median(gaussian_fits[i].flatchain["wid"])
    
    #amp = gaussian_fits[i].best_values["amp"]
    #cen = gaussian_fits[i].best_values["cen"]
    #wid = gaussian_fits[i].best_values["wid"]
    
    gauss = gaussian(window_linspaces[i], amp, cen, wid)
    plt.plot(window_linspaces[i], gauss, linestyle='--', color="magenta", label='best fit', linewidth=3)

    plt.fill_between(window_linspaces[i], y1=[0 for x in window_linspaces[i]], y2 =gauss , color="magenta", alpha=0.5)

plt.xlabel("Velocity [km/s]", fontsize=15)
plt.ylabel("Intensity [K]", fontsize=15)

plt.show()

## Correlation plots to double check _emcee_ didn't go insane

In [10]:
for i,line in enumerate(window_intensities):
    amp = np.median(gaussian_fits[i].flatchain["amp"])
    cen = np.median(gaussian_fits[i].flatchain["cen"])
    wid = np.median(gaussian_fits[i].flatchain["wid"])
    
    amp_err = np.std(gaussian_fits[i].flatchain["amp"])
    cen_err = np.std(gaussian_fits[i].flatchain["cen"])
    wid_err = np.std(gaussian_fits[i].flatchain["wid"])
    
    corner.corner(gaussian_fits[i].flatchain, bins=20, color="indigo", smooth=True, smooth1d=True,
             plot_datapoints=True, plot_density=False, fill_contours=True, levels=[0.67, 0.95,0.99],
                 range=[(amp-10*amp_err, amp+10*amp_err),(cen-10*cen_err, cen+10*cen_err),(wid-10*wid_err, wid+10*wid_err)],
                 labels=["Amplitude", "Center", "Gaussian Width"])

    
    
plt.show()
    

### Derive Rotation Diagrams

In [11]:
# EVERYTHING IN CGS UNITS

# Boltzmann Constant
k = 1.3807 * 1e-16 
# Dipole moment

# CH3CN
mu = 3.9037 * 1e-18

# CH3OH
# mu = np.sqrt(0.899**2 + (-1.44)**2) * 1e-18

# Transition strength
# CH3CN
strength = np.array([79.48062, 84.80078, 101.74150, 105.98313, 123.64362, 127.17758])/(3.9037**2)

# CH3OH
# strength = np.array([3.08327, 1.21365, 1.61697, 1.61649, 0.97862]) / (np.sqrt(0.899**2 + (-1.44)**2))


# The integral from -inf to inf is the amp parameter. Convert to K.cm/s

flux = [np.median(fit.flatchain["amp"])*100000 for fit in gaussian_fits] # in K . cm/s
flux_err = [np.std(fit.flatchain["amp"])*100000 for fit in gaussian_fits] # in K . cm/s

centers = [reference_frequency*(c-np.median(fit.flatchain["cen"])*1000)/c *1e6 for fit in gaussian_fits]

# Upper-level populations Nu/gu
# factor of 10000 ti
lnnugu = np.log((3*k*np.array(flux))/(8*(np.pi**3)*np.array(centers)*(mu**2)*np.array(strength)))

nugu_err = ((3*k*np.array(flux_err))/(8*(np.pi**3)*np.array(centers)*(mu**2)*np.array(strength)))
lnnugu_err = nugu_err / np.exp(lnnugu)

emission_line_eup = np.array(emission_line_eup)

In [12]:
print(centers)

[73588715931.59872, 73590183331.40918, 91985239941.90466, 91987015895.21033, 110381218302.51036, 110383408417.44641]


## Fit Rotation Diagram

In [13]:
def residual(params, eu, data, eps_data):
    m = -1/params['T']
    b = params['lnNugu_0']
  
    model = m*eu+b

    return (data-model) / eps_data


params = Parameters()
params.add('T', value=10, min=0, max=100)
params.add('lnNugu_0', value=20, min=15, max=30)


out = minimize(residual, params, args=(np.array(emission_line_eup), lnnugu, lnnugu_err), method="leastsq")

T = out.params["T"]
lnNugu_0 = out.params["lnNugu_0"]

params = Parameters()
params.add('T', value=T, min=T-4*T.stderr, max=T+4*T.stderr)
params.add('lnNugu_0', value=lnNugu_0, min=lnNugu_0-4*lnNugu_0.stderr, max=lnNugu_0+4*lnNugu_0.stderr)

out = minimize(residual, params, args=(np.array(emission_line_eup), lnnugu, lnnugu_err), method="emcee", steps=4000, burn=400, thin=10)


print(fit_report(out))

T = np.median(out.flatchain["T"])
lnNugu_0 = np.median(out.flatchain["lnNugu_0"])

Eu_linspace = np.linspace(min(emission_line_eup), max(emission_line_eup), 1000)

def rotdiag_model(Eu, T, lnNugu_0):
    return lnNugu_0 - (1/T)*Eu

plt.scatter(emission_line_eup, lnnugu, c="purple")

plt.errorbar(emission_line_eup, lnnugu, yerr=lnnugu_err, capsize=5, c="purple", fmt="none")

plt.plot(Eu_linspace, rotdiag_model(Eu_linspace, T, lnNugu_0), color="indigo", linestyle="--")
    
plt.ylabel("$\ln(N_u/g_u)$", fontsize=15)
plt.xlabel("$E_u/k$", fontsize=15)
plt.show()
    


100%|██████████| 4000/4000 [00:31<00:00, 125.38it/s]


[[Fit Statistics]]
    # fitting method   = emcee
    # function evals   = 400000
    # data points      = 6
    # variables        = 2
    chi-square         = 194.339289
    reduced chi-square = 48.5848222
    Akaike info crit   = 24.8670764
    Bayesian info crit = 24.4505954
[[Variables]]
    T:         18.3965719 +/- 1.55767373 (8.47%) (init = 18.15047)
    lnNugu_0:  21.6875720 +/- 0.08179313 (0.38%) (init = 21.7004)
[[Correlations]] (unreported correlations are < 0.100)
    C(T, lnNugu_0) = -0.961


In [20]:
T_err = np.std(out.flatchain["T"])
lnNugu_0_err = np.std(out.flatchain["lnNugu_0"])
    
    
corner.corner(out.flatchain, bins=20, color="darkcyan", smooth=True, smooth1d=True,
             plot_datapoints=True, plot_density=False, fill_contours=True, levels=[0.67, 0.95,0.99],
                 range=[(T-10*T_err, T+10*T_err),(lnNugu_0-10*lnNugu_0_err, lnNugu_0+10*lnNugu_0_err)],
                 labels=["$T_{rot}$", "$\ln(N_u/g_u)$"])

plt.show()

## Derive total column density $N_{tot}$ by approximating the rotational partition function $Q_{rot}$

In [21]:
# CH3CN
Qrot = [13.8355, 28.4924, 64.0955, 164.3168, 449.0811, 1267.6705, 2628.0493]
Trot = [2.725, 5.0, 9.375, 18.75, 37.5, 75.0, 120.0]

# CH3OH
#Qrot = [11.889916, 26.719018, 78.173628, 274.987967, 920.963739, 2924.302297, 9750.039754]
#Trot = [2.725, 5.0, 9.375, 18.75, 37.5, 75.0, 150.0]

Qfit = np.polyfit(Trot, Qrot, 3)
Q = np.poly1d(Qfit)

T_linspace = np.linspace(min(Trot), max(Trot))
plt.scatter(Trot, Qrot, c="black")
plt.plot(T_linspace, Q(T_linspace), color="black")

Qerr = np.sqrt((T*T_err)*(3*(Qfit[0]**2)*(T**4)+2*Qfit[1]*T**2+Qfit[1]**2))

plt.errorbar([T], [Q(T)], yerr=Qerr , c="red", fmt="*", capsize=3)

plt.ylabel("$Q_{rot}$", fontsize=15)
plt.xlabel("$T$", fontsize=15)
plt.show()

In [22]:
lnNtot = lnNugu_0 + np.log(Q(T))
lnNtot_err = np.sqrt((lnNugu_0_err)**2 + (Qerr/Q(T))**2)

logNtot = np.log10(np.exp(lnNtot))
logNtot_err = 0.434*(np.exp(lnNtot)*lnNtot_err/np.exp(lnNtot))

rot_err = np.sqrt((lnNugu_0_err)**2 + (np.array(Eu_linspace)*T**(-2)*T_err)**2)

In [49]:
Eu_linspace = np.linspace(min(emission_line_eup)-5, max(emission_line_eup)+5, 1000)

plt.subplot2grid((1,7), (0,0), colspan=5)

plt.errorbar(emission_line_eup, lnnugu, yerr=lnnugu_err, fmt="o", capsize=3, color="black")

plt.ylabel("$\log(N_u/g_u)$", fontsize=20)
plt.xlabel("$E_u/k$", fontsize=20)

#plt.fill_between(x=Eu_linspace, y1=rotdiag_model(Eu_linspace, T, lnNugu_0)-rot_err, y2=rotdiag_model(Eu_linspace, T, lnNugu_0)+rot_err, alpha=0.1, color="gray")

Tchain_burn = out.flatchain["T"][::10]
lnNugu_0chain_burn = out.flatchain["lnNugu_0"][::10]

for i,T in enumerate(Tchain_burn):
    plt.plot(Eu_linspace, rotdiag_model(Eu_linspace, T, lnNugu_0chain_burn[i*10]), color="lime", linestyle="-", linewidth=0.01, alpha=0.5)
    
T_best = np.median(out.flatchain["T"])
lnNugu_0_best = np.median(out.flatchain["lnNugu_0"])


plt.plot(Eu_linspace, rotdiag_model(Eu_linspace, T_best, lnNugu_0_best), color="red", linestyle="--")
        
plt.ylim([14,30])

plt.subplot2grid((1,7), (0,5), colspan=1)
plt.errorbar([0], [logNtot], yerr=logNtot_err , c="black", capsize=3)

plt.fill_between(x=[-1,1],y1=0,y2=logNtot, color="red", alpha=0.6)
plt.title("$\log_{10}(N_{tot})$", fontsize=20)

plt.xticks([])
plt.ylim([0,20])

plt.subplot2grid((1,7), (0,6), colspan=1)
plt.errorbar([0], [T_best], yerr=T_err , c="black", capsize=3)

plt.fill_between(x=[-1,1],y1=0,y2=T_best, color="red", alpha=0.6)
plt.title("$T_{rot}$(K)", fontsize=20)

plt.xticks([])
plt.ylim([0,50])
plt.xticks(fontsize=18)
plt.show()

In [None]:
T_best

In [43]:
emission_line_transition

['CH3CN (4 1 _ 3 1) ',
 'CH3CN (4 0 _ 3 0) ',
 'CH3CN (5 1 _ 4 1) ',
 'CH3CN (5 0 _ 4 0) ',
 'CH3CN (6 1 _ 5 1) ',
 'CH3CN (6 0 _ 5 0) ']

In [38]:
emission_line_frequencies

[73588.7747, 73590.3372, 91985.2624, 91987.0202, 110381.359, 110383.508]

In [42]:
print(freq_to_vel(np.array(emission_line_frequencies))/1000)

[ 64474.74249435  64469.74586659   5645.68571428   5640.06454802
 -53182.12038987 -53188.99255183]


In [44]:
print(emission_line_eup)

[15.975, 8.829, 20.39, 13.244000000000002, 25.686999999999998, 18.542]


In [46]:
molecule_frequencies

0      73577.451
1      73584.543
2      73588.799
3      73590.218
4      91958.726
5      91971.130
6      91979.994
7      91985.314
8      91987.088
9     110349.470
10    110364.354
11    110374.989
12    110381.372
13    110383.500
Name: Frequency(Ghz), dtype: float64

In [47]:
lnnugu

array([21.39820741, 21.50550275, 20.81804032, 20.94366705, 20.10042412,
       19.79503676])

In [51]:
logNtot_err

0.1552091478833181