### Data Loading and Web Scraping

In [2]:
#------------------------------------------------------------------------------------
#IMPORT THE REQUIRED LIBRARIES
#------------------------------------------------------------------------------------
import pandas as pd
import warnings
import numpy as np
import requests
from bs4 import BeautifulSoup
import csv
import re
import json
import timeit

warnings.filterwarnings("ignore")

#-------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------
#DATA LOADING AND CLEANING
#-------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------

# Snapshot of the alerts table, downloaded on 22/10/2020 at 9:20.
df = pd.read_csv("alerts22102020_920.csv")

# Unify the column names: drop leading whitespace, then remove the '#' marker.
# regex=False states explicitly that '#' is a literal, so the result does not
# depend on the default of `regex`, which differs between pandas versions.
df.columns = df.columns.str.lstrip().str.replace('#', '', regex=False)


#-------------------------------------------------------------------------------------
#CLASS
# Frequency of each class provided by the database, most frequent first.
# (A bare `df["Class"].unique()` used to sit here; in the middle of a cell its
# result is neither shown nor stored, so it was removed.)
class_freq = (
    df.groupby("Class")
    .agg(frequency=("Class", "count"))
    .sort_values("frequency", ascending=False)
)
#class_freq[class_freq["frequency"]>20]

#-------------------------------------------------------------------------------------
#COMMENT
# Unification of comments with equal meaning.
# The (old, new) pairs are applied in this order to every comment; the order
# matters because later pairs can match text produced by earlier ones.
_COMMENT_REPLACEMENTS = (
    ("hostless blue transient", "blue hostless transient"),
    ("hostless, blue transient", "blue hostless transient"),
    ("confirmed SNIa", "confirmed SN Ia"),
    ("Candidate SN", "candidate SN"),
    ("candidate CV, blue hostless transient", "blue hostless transient, candidate CV"),
)


def _unify_comment(raw_comment):
    """Return ``raw_comment`` as a string with equivalent phrasings unified."""
    # str() turns a missing comment (NaN) into the literal "nan", exactly as the
    # per-row loop did; the frequency filter further down relies on that value.
    text = str(raw_comment)
    for old, new in _COMMENT_REPLACEMENTS:
        text = text.replace(old, new)
    return text


# Assign the whole column at once. The former `df["Comment"][i] = ...` was a
# chained-indexing assignment: it raises SettingWithCopyWarning and, with pandas
# Copy-on-Write enabled, does not modify `df` at all.
df["Comment"] = df["Comment"].map(_unify_comment)

# Frequency of each distinct comment, most frequent first.
com_freq = (
    df.groupby("Comment")
    .agg(frequency=("Comment", "count"))
    .sort_values("frequency", ascending=False)
)
#com_freq[com_freq["frequency"]>20]

#-------------------------------------------------------------------------------------
#NEW COLUMNS "FEATURE" AND "CLASS_COMMENT"
# "Feature" holds the class of the alert whenever one is available; when the
# class is "unknown" the gap is filled with the comment instead.
# "Class_Comment" records which of the two sources ("Class" or "Comment") was used.

class_values = df["Class"]
comment_values = df["Comment"]

feature = []
clcom = []
for row in range(len(df)):
    label = class_values[row]
    has_class = label != "unknown"
    feature.append(label if has_class else comment_values[row])
    clcom.append("Class" if has_class else "Comment")

df["Feature"] = feature
df["Class_Comment"] = clcom

# Frequency of each distinct "Feature" value, most frequent first.
clcom_freq = (
    df.groupby("Feature")
    .agg(frequency=("Feature", "count"))
    .sort_values("frequency", ascending=False)
)
#clcom_freq[clcom_freq["frequency"]>20]

#-------------------------------------------------------------------------------------
#RESTRUCTURING THE DATABASE
# Keep only the features that occur more than 50 times; the "nan" placeholder
# (a missing comment converted to text) is never kept.
c, n = np.unique(df["Feature"], return_counts=True)

name = []
num = []
for label, count in zip(c, n):
    if count > 50 and label != "nan":
        name.append(label)
        num.append(count)
#pd.DataFrame([name,num])

# Build the dataset restricted to the retained features, save it and preview it.
selected_rows = df.Feature.isin(name)
df_classifier = df[selected_rows].reset_index(drop=True)
df_classifier.to_csv('less50.csv')
df_classifier.head()


Unnamed: 0,Name,Date,RaDeg,DecDeg,AlertMag,HistoricMag,HistoricStdDev,Class,Published,Comment,TNSid,Feature,Class_Comment
0,Gaia20ewl,2020-10-17 18:25:40,93.82966,-29.31926,18.38,,,SN Ia,2020-10-21 14:09:22,confirmed SN Ia,SN2020uyg,SN Ia,Class
1,Gaia20ewk,2020-10-16 23:38:06,128.63759,-1.95792,18.53,,,SN Ia,2020-10-21 14:08:44,confirmed SN Ia,SN2020tnq,SN Ia,Class
2,Gaia20evz,2020-10-20 06:34:41,173.78373,22.45434,17.54,17.79,0.04,QSO,2020-10-21 13:05:45,~0.3 mag rise in known QSO,AT2020xtf,QSO,Class
3,Gaia20evy,2020-10-19 09:48:10,244.86595,42.51098,16.8,18.99,0.96,CV,2020-10-21 13:04:08,known CV ASASSN-14fl in 3 mag outburst,AT2020xte,CV,Class
4,Gaia20evx,2020-10-19 16:52:24,169.27603,20.23541,16.33,17.49,0.16,BL Lac,2020-10-21 13:03:26,known BL Lac brightens by 1 mag in 5 months,AT2020xtd,BL Lac,Class


In [2]:
#------------------------------------------------------------------------------------
#ORDER ALERT FUNCTION
#------------------------------------------------------------------------------------
#This function will return the position of the spectrum corresponding to the alert.

def _value_after_label(lines, label):
    """Return the line that follows the first line containing ``label``, or None."""
    for position, line in enumerate(lines[:-1]):
        if label in line:
            return lines[position + 1]
    return None


def alert_detection(name, timeout=30):
    """Return the position of the spectrum row that corresponds to the alert.

    Downloads the alert page for ``name``, reads the "Alerting date" and
    "Alerting magnitude" values from the ``dl`` elements of class ``dl-left``
    and looks for the ``tr`` element of class ``spectrum`` whose text, split
    on newlines, has those two values at positions 1 and 3. Both values are
    compared because in certain cases the magnitude alone is repeated.

    Parameters
    ----------
    name : str
        Alert identifier; it is appended to the alert page URL.
    timeout : float or None, optional
        Seconds passed to ``requests.get``. Without a timeout a stalled
        connection blocks forever; pass ``None`` to restore that behaviour.

    Returns
    -------
    int or str
        Position of the matching row (the last one if several match), or the
        string ``"None"`` when no row matches or the page does not contain
        the alerting date / magnitude.
    """
    #Data load
    page = requests.get('http://gsaweb.ast.cam.ac.uk/alerts/alert/' + name, timeout=timeout).text
    soup = BeautifulSoup(page, "html.parser")

    #-----------------------------------------------------------------------------------------
    #Extraction of the magnitude and date of the alert
    # The summary is flattened to text with the <dt>/<dd> tags removed, so each
    # label sits on one line and its value on the next one.
    summary = str(soup.find_all('dl', class_="dl-left"))
    for tag in ("<dd>", "</dd>", "<dt>", "</dt>"):
        summary = summary.replace(tag, "")
    summary_lines = summary.split("\n")

    #Date and magnitude of the alert
    value_alert_date = _value_after_label(summary_lines, "Alerting date")
    value_alert_mag = _value_after_label(summary_lines, "Alerting magnitude")
    if value_alert_date is None or value_alert_mag is None:
        # Nothing to compare against: report "no match" instead of failing
        # with an IndexError on a page that lacks these fields.
        return "None"

    #-----------------------------------------------------------------------------------------
    #For the position of the alert we compare the values obtained with those shown in the spectra
    spectrum_rows = [row.text.split("\n") for row in soup.find_all('tr', class_="spectrum")]

    order_alert = "None"
    for position, cells in enumerate(spectrum_rows):
        # Rows with fewer than four text lines cannot hold both values; skip them.
        if len(cells) > 3 and cells[1] == value_alert_date and cells[3] == value_alert_mag:
            order_alert = position
    return order_alert

In [3]:
#-----------------------------------------------------------------------------------
#SPECTRUM EXTRACTION
#-----------------------------------------------------------------------------------


# "Name" column of the filtered dataset; these names are what the alert page URLs are built from.
# NOTE(review): not referenced again in the visible cells (extract_spectrums reads
# data["Name"] itself) — confirm whether a later cell still needs it.
element_url=df_classifier["Name"]

def extract_spectrums(data, timeout=30):
    """Download the spectra of every alert in ``data`` into one long DataFrame.

    For each row of ``data`` the alert page is fetched, the JSON assigned to
    ``var spectra`` in one of its ``<script>`` tags is decoded, and one output
    row is produced per decoded entry. ``alert_detection`` supplies the order
    of the spectrum that corresponds to the alert itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "Name" (alert identifier) and "Feature".
        Rows are read by position, so any index is accepted.
    timeout : float or None, optional
        Seconds passed to ``requests.get``; ``None`` waits indefinitely.

    Returns
    -------
    pandas.DataFrame
        Columns "id", "order", "bp", "rp", "a_d", "feature" with a fresh
        0..n-1 index. "a_d" is "A" when the entry's order equals the value
        returned by ``alert_detection`` and "D" otherwise.

    Raises
    ------
    IndexError
        If a page has no script containing ``var spectra``.

    Notes
    -----
    Prints "Link k" every 50 alerts and the total computation time at the end.
    """
    #Beginning
    tic = timeit.default_timer()

    columns = ["id", "order", "bp", "rp", "a_d", "feature"]
    # One dict per spectrum; the frame is built once at the end, because
    # growing a DataFrame with pd.concat inside the loop copies every
    # accumulated row again on each iteration.
    records = []

    # Positional access to the two columns, independent of the index labels.
    names = data["Name"].tolist()
    features = data["Feature"].tolist()

    for k, (alert_name, feature) in enumerate(zip(names, features)):
        url_page = 'http://gsaweb.ast.cam.ac.uk/alerts/alert/' + alert_name

        #-------------------------------------------------------------
        #Extraction of "var spectra" on each of the alerts
        page = requests.get(url_page, timeout=timeout).text
        soup = BeautifulSoup(page, "html.parser")
        spectra_script = next(
            (tag.text for tag in soup.find_all('script') if "var spectra" in tag.text),
            None,
        )
        if spectra_script is None:
            # Same exception type as before, but it now says which page failed.
            raise IndexError("no 'var spectra' script found on " + url_page)
        var_spectra = spectra_script.replace("\n\tvar spectra = ", "").replace(";\n", "")
        d = json.loads(var_spectra)
        #-------------------------------------------------------------

        order_alert = alert_detection(alert_name)

        for entry in d:
            # NOTE(review): relies on the first three values of every entry being
            # order, bp and rp, in that order — confirm against the page's JSON.
            values = list(dict(entry).values())
            records.append({
                "id": alert_name,
                "order": values[0],
                "bp": values[1],
                "rp": values[2],
                "a_d": "A" if values[0] == order_alert else "D",
                "feature": feature,
            })

        #-------------------------------------------------------------
        #Counter
        if k % 50 == 0:
            print("Link {}".format(k))
        #-------------------------------------------------------------

    df_spectrum = pd.DataFrame(records, columns=columns)
    toc = timeit.default_timer()
    print ("Computation time = " + str((toc - tic)) + "s")
    return df_spectrum

#df_spectrum = extract_spectrums(df_classifier)
#Save the data to a csv.
#df_spectrum.to_csv('Spectrums.csv')

In [5]:
#Check which alerts of the classifier dataset do not appear in the spectral dataset
df_spect = pd.read_csv("Spectrums.csv")

# Build the lookup once. The previous loop rebuilt list(df_spect["id"]) for every
# alert and scanned it linearly; a set gives the same answers with constant-time
# membership tests.
spectrum_ids = set(df_spect["id"])
missing_alerts = [alert for alert in df_classifier["Name"] if alert not in spectrum_ids]

for alert in missing_alerts:
    print(alert)

# j keeps its original meaning: the number of alerts without any spectrum row.
j = len(missing_alerts)
print(j)

Gaia20eqo
Gaia20dwa
Gaia19fkd
Gaia19erx
Gaia19emu
Gaia19ddq
Gaia19ckq
Gaia18cxm
Gaia18buw
Gaia18bto
Gaia16adk
11
