# globe at night data processing

The purpose of this notebook is to bring all globe at night csvs into a single dataframe.
We want them in a single dataframe so that a predictive model an be trained on it.


In [23]:
import math
from pathlib import Path

import numpy as np
import pandas as pd

cwd = Path.cwd()
data_path = cwd / "data" / "globe_at_night"
dataframes = [pd.read_csv(p, on_bad_lines="skip") for p in data_path.glob("*.csv")]
df = pd.concat(dataframes, ignore_index=True)
df = df.drop(columns=["ID", "ObsID", "LocalDate", "LocalTime", "Constellation"])
df = df.dropna(subset=["SQMReading", "CloudCover", "Elevation(m)"], how="any", axis=0)
df = df.reset_index()

SQM_OBS_TYPE = "SQM"
MAX_SQM = 22
MIN_SQM = 16
df = df[df["ObsType"] == SQM_OBS_TYPE]
df = df[df["SQMReading"] <= MAX_SQM]
df = df[df["SQMReading"] >= MIN_SQM]
df["UTTime"] = pd.to_datetime(df["UTTime"], format="%H:%M")
df["UTTimeHour"] = np.sin(2 * np.pi * df["UTTime"].dt.hour / 24)
df.info

<bound method DataFrame.info of         index ObsType  Latitude  Longitude  Elevation(m)      UTDate  \
0          39     SQM   34.2365 -110.08400     1964.3800  2019-01-02   
1          41     SQM   33.3369 -111.42500      561.7730  2019-01-02   
4         110     SQM   38.8878 -119.82000     1466.4800  2019-01-03   
5         130     SQM   45.7688    1.05404      333.9360  2019-01-02   
8         170     SQM   33.5127 -112.45900      347.5930  2019-01-04   
...       ...     ...       ...        ...           ...         ...   
41976  265866     SQM   53.7430   -1.58675      113.4350  2017-12-18   
41977  265886     SQM   38.8878 -119.82000     1466.4800  2017-12-19   
41978  265893     SQM   37.8585 -122.14400      345.8890  2017-12-21   
41979  265901     SQM   47.6102   20.72810       91.7003  2017-12-18   
41980  265902     SQM   47.6102   20.72810       91.7585  2017-11-15   

                   UTTime  LimitingMag  SQMReading SQMSerial       CloudCover  \
0     1900-01-01 02:28

In [24]:
def get_oktas_from_description(description: str) -> int:
    match description:
        case "0" | "clear":
            return 0
        case "25" | "1/4 of sky":
            return 2
        case "50" | "1/2 of sky":
            return 4
        case "75" | "over 1/2 of sky":
            return 6
        case _:
            return 8


df["CloudCover"] = df["CloudCover"].map(get_oktas_from_description)

In [25]:
output_file_path = Path.cwd() / "data" / "gan.csv"
df.to_csv(output_file_path, index=False)

In [26]:
correlations = {}
for column in df.columns:
    try:
        df[column] = pd.to_numeric(df[column], errors="coerce")
        if df[column].dtype == "float64" or df[column].dtype == "int64":
            correlation = df["SQMReading"].corr(df[column])
            if not math.isnan(correlation):
                correlations[column] = correlation
    except ValueError:
        pass
correlations

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


{'index': -0.021529680658276776,
 'Latitude': -0.038435703339897494,
 'Longitude': 0.09874163511112802,
 'Elevation(m)': 0.25910230694150344,
 'UTTime': 0.053863371584591695,
 'LimitingMag': 0.07848802762062192,
 'SQMReading': 1.0,
 'SQMSerial': 0.0016856163982963998,
 'CloudCover': -0.2456437264534707,
 'LocationComment': 1.0,
 'UTTimeHour': 0.09087558395241287}