In [18]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from scipy.stats import lognorm, norm
from common import SCFH_TO_SLPM_FACTOR, plot_histogram, plot_cdf
from config import CUSTOMER

# Parameters of the "WSU" reference distribution. They are used below as the
# mean / standard deviation of a normal curve over ln(emission rate) and as the
# (ln(scale), shape) pair of the matching log-normal CDF.
WSU_MU_NORM = -1.36
WSU_SIGMA_NORM = 1.77
# Emission factor per emission bin label; mapped onto the "Bin" column to fill
# "EmissionFactorInSCFH" in the "Assign Emission Factors" section.
# Source: Italgas & Toscana,
# https://picarro.atlassian.net/wiki/spaces/DAKB/pages/2291859477/Emission+Factor+Lookup+Tables
ITALGAS_EMISSION_FACTORS = {
    "B-2": 0.2,
    "B-1": 0.5,
    "B0": 1.9,
    "B1": 9.1,
}  # Italgas & Toscana, https://picarro.atlassian.net/wiki/spaces/DAKB/pages/2291859477/Emission+Factor+Lookup+Tables

In [19]:
# Echo the lookup table so the factors in use are visible in the notebook output.
ITALGAS_EMISSION_FACTORS

{'B-2': 0.2, 'B-1': 0.5, 'B0': 1.9, 'B1': 9.1}

In [16]:
# Per-customer values keyed by emission bin label. All three customers currently
# share the same numbers; each customer still gets its own dict object so that
# one entry can be tuned without affecting the others.
# NOTE(review): `bins` is reassigned by the plotting cells further down, so the
# lookup below is not consumed later in the visible notebook - confirm intent.
bins_dictionary = {
    customer_name: {"B-2": 0.09, "B-1": 0.5, "B0": 2.2, "B1": 10}
    for customer_name in ("UNARETI", "PSG", "APRETIGAS")
}

bins = bins_dictionary.get("UNARETI")
bins



{'B-2': 0.09, 'B-1': 0.5, 'B0': 2.2, 'B1': 10}

## Get Natural Gas or Possible Natural Gas LISAs

In [None]:
# Load the prepared leak / emission-source table for the configured customer.
# NOTE: pickle files can execute arbitrary code on load - only read files
# produced by our own pipeline.
prepared_leaks_path = (
    f"data/prepared-leaks-with-emission-sources-{CUSTOMER}-errors.pickle"
)
emission_sources = pd.read_pickle(prepared_leaks_path)

# Schema summary (printed) followed by a preview of the first rows (cell output).
emission_sources.info()
emission_sources.head()

## Fit Log-Normal Curve to Measured Emissions

In [None]:
# Fit a log-normal to the measured rates with the location pinned at 0, so only
# the shape and scale are estimated.
measured_rates = emission_sources.MeasuredSCFH
shape_measured, _, scale_measured = lognorm.fit(measured_rates, floc=0)

# With loc=0, ln(X) is normal with mean ln(scale) and standard deviation = shape.
mu_measured_norm, sigma_measured_norm = np.log(scale_measured), shape_measured

(mu_measured_norm, sigma_measured_norm)

In [None]:
# Histogram of ln(measured emission rate). The second value returned by
# plot_histogram is reused as the x grid for the two normal curves below
# (presumably the histogram bin edges - confirm in common.plot_histogram).
log_measured_rates = np.log(emission_sources.MeasuredSCFH)
ax, bins = plot_histogram(
    x=log_measured_rates,
    title=f"{CUSTOMER} Measured Emissions",
    x_label="$ln$(emission rate)",
    n_bins="sqrt",
    label=f"{CUSTOMER} $ln$(measurements)",
)

# Legend entries for the fitted curve and the WSU reference curve.
measured_curve_label = f"{CUSTOMER}" + (
    r" normal: $\mu={:0.2f}$; $\sigma={:0.2f}$".format(
        mu_measured_norm, sigma_measured_norm
    )
)
wsu_curve_label = r"WSU normal: $\mu={:0.2f}$; $\sigma={:0.2f}$".format(
    WSU_MU_NORM, WSU_SIGMA_NORM
)

# Normal densities in log space: fitted parameters vs. WSU parameters.
y_fit = norm.pdf(bins, loc=mu_measured_norm, scale=sigma_measured_norm)
y_wsu = norm.pdf(bins, loc=WSU_MU_NORM, scale=WSU_SIGMA_NORM)

ax.plot(bins, y_fit, linewidth=2, label=measured_curve_label)
ax.plot(bins, y_wsu, linewidth=2, label=wsu_curve_label)

ax.legend()

In [None]:
# Empirical CDF of the measured emission rates on a logarithmic x axis. The
# second value returned by plot_cdf is reused as the x grid for the two
# log-normal CDF curves below.
ax, bins = plot_cdf(
    x=emission_sources.MeasuredSCFH,
    title=f"{CUSTOMER} Measured Emissions",
    x_label=r"emission rate ($\frac{ft^3}{hr}$)",
    n_bins=100000,
    log_x=True,
    log_y=False,
    label=f"{CUSTOMER} measurements",
    color=None,
)

# Legend entries for the fitted curve and the WSU reference curve.
measured_curve_label = f"{CUSTOMER}" + (
    r" log-normal: $\mu={:0.2f}$; $\sigma={:0.2f}$".format(
        mu_measured_norm, sigma_measured_norm
    )
)
wsu_curve_label = r"WSU log-normal: $\mu={:0.2f}$; $\sigma={:0.2f}$".format(
    WSU_MU_NORM, WSU_SIGMA_NORM
)

# Log-normal CDFs parameterised as shape = sigma, scale = exp(mu).
y_fit = lognorm.cdf(bins, s=sigma_measured_norm, scale=np.exp(mu_measured_norm))
y_wsu = lognorm.cdf(bins, s=WSU_SIGMA_NORM, scale=np.exp(WSU_MU_NORM))

ax.plot(bins, y_fit, linewidth=2, label=measured_curve_label)
ax.plot(bins, y_wsu, linewidth=2, label=wsu_curve_label)
ax.legend()

## Assign Emission Factors

In [None]:
# Bin edges applied to MeasuredSCFH and the label of each resulting interval.
# pd.cut closes intervals on the right by default, giving
# (0, 0.1], (0.1, 1], (1, 10] and (10, inf).
bin_edges = [0, 0.1, 1, 10, np.inf]
bin_labels = ["B-2", "B-1", "B0", "B1"]

measured_bin = pd.cut(
    x=emission_sources.MeasuredSCFH,
    bins=bin_edges,
    labels=bin_labels,
)

# Attach the bin label and its emission factor, then store the label as a
# plain string and the factor as a float.
emission_sources = emission_sources.assign(
    Bin=measured_bin,
    EmissionFactorInSCFH=measured_bin.map(ITALGAS_EMISSION_FACTORS),
).astype({"Bin": str, "EmissionFactorInSCFH": float})

## Add Liters Per Minute Columns

In [None]:
# Convert the per-row emission factor from SCFH to litres per minute.
emission_factor_scfh = emission_sources["EmissionFactorInSCFH"]
emission_sources["EmissionFactorInLPM"] = emission_factor_scfh.mul(
    SCFH_TO_SLPM_FACTOR
)

# Row count per distinct converted factor, as a quick sanity check.
emission_sources[["EmissionFactorInLPM"]].value_counts()

## Set True and False conditions on the priority score column
(agreed in the meeting on 12/04/2022)
- `PriorityScore < 0.06` → True
- `PriorityScore >= 0.06` → False


In [None]:
# Scores strictly below this threshold are flagged True; the column is exported
# later under the name "BelowRRA".
PRIORITY_SCORE_THRESHOLD = 0.06

# The numeric score is replaced in place by the boolean flag. Guard on dtype so
# that re-running this cell is a no-op: comparing an already-boolean column to
# the threshold would treat True/False as 1/0 and invert every flag.
if not pd.api.types.is_bool_dtype(emission_sources["PriorityScore"]):
    emission_sources["PriorityScore"] = (
        emission_sources["PriorityScore"] < PRIORITY_SCORE_THRESHOLD
    )

emission_sources.info()
emission_sources.head()

## Customize Columns for Customer

In [None]:
# Measured emission rate converted from SCFH to litres per minute.
measured_scfh = emission_sources["MeasuredSCFH"]
emission_sources["MeasuredSLPM"] = measured_scfh.mul(SCFH_TO_SLPM_FACTOR)

In [None]:
# Emission factor (LPM) weighted by the row's leak probability.
leak_probability = emission_sources["LeakProbability"]
emission_sources["EmissionFactorTimesLeakProbabilityLPM"] = leak_probability.mul(
    emission_sources["EmissionFactorInLPM"]
)

In [None]:
def _report_name_from_id(report_id):
    """Return "CR-" followed by the first six characters of the id, upper-cased."""
    return "CR-" + report_id[:6].upper()


emission_sources["ReportName"] = emission_sources["ReportId"].map(
    _report_name_from_id
)

In [None]:
# LISA identifier: "<ReportName>-<PeakNumber>".
peak_number_text = emission_sources["PeakNumber"].astype(str)
emission_sources["LISANumber"] = (
    emission_sources["ReportName"] + "-" + peak_number_text
)

In [None]:
# Covered asset length: pipeline metres times the covered fraction, divided by
# 1000 to express the result in kilometres.
pipeline_meters = emission_sources["PipelineMeters"].astype(float)
coverage_fraction = emission_sources["AssetCoverageFrac"].astype(float)
emission_sources["AssetLengthCoveredKM"] = (
    pipeline_meters * coverage_fraction / 1000
)

In [None]:
# Mapping from the working column names to the column names written to the
# customer CSV. Several targets are simply the lower-cased English name.
# NOTE(review): "pcubedreportitle" (single "t") matches the spelling used in
# `column_order` below - confirm with downstream consumers before correcting it.
COLUMN_RENAMES = {
    "ReportId": "pcubedreportguid",
    "ReportName": "pcubedreportname",
    "ReportTitle": "pcubedreportitle",
    "DateReportStarted": "pcubedreportdate",
    "PipelineMeters": "PipelineMeters".lower(),
    "AssetLengthCoveredKM": "km_in_fov",
    # "BelowRRA" is now sourced from the boolean PriorityScore column
    # (it was previously sourced from "IsFiltered").
    "PriorityScore": "BelowRRA",
    "LeakProbability": "LeakProbability".lower(),
    "BoxId": "BoxId".lower(),
    # The next three groups take their data from Italian-named source columns;
    # the English-named source column used before is noted next to each.
    # Previously sourced from "LeakGrade".
    "codiceDispersione": "LeakGrade".lower(),
    # Previously sourced from "FoundDateTime".
    "dataLocalizzazione": "FoundDateTime".lower(),
    # Previously sourced from "AG/BG".
    "aereoInterrato": "agbg",
    "LeakFound": "LeakFound".lower(),
    # Previously sourced from "LeakLocation".
    "indirizzoLocalizzazione": "LeakLocation".lower(),
    "LeakLatitude": "LeakLatitude".lower(),
    "LeakLongitude": "LeakLongitude".lower(),
    "MeasuredSCFH": "emissionrate_measured_scfh",
    "MeasuredSLPM": "emissionrate_measured_lpm",
    "Bin": "emission_bin",
    "EmissionFactorInLPM": "emission_factor_lpm",
    "EmissionFactorTimesLeakProbabilityLPM": "emissionfactor_leakprob_lpm",
    "City": "City".lower(),
    "Region": "Region".lower(),
}

# Columns not listed in the mapping keep their current names.
emission_sources = emission_sources.rename(columns=COLUMN_RENAMES)

In [None]:
# Columns kept in the customer export, in output order. Names are the
# post-rename names; any column not listed here is dropped by the selection
# below, and a listed column that is missing raises a KeyError.
column_order = [
    # Report identification
    "pcubedreportguid",
    "region",
    "city",
    "pcubedreportname",
    "pcubedreportitle",
    "pcubedreportdate",
    # Asset coverage
    "PipelineMeters".lower(),
    "AssetCoverageFrac",
    "km_in_fov",
    # Emission source / indication details
    "EmissionSourceId",
    "CH4",
    "MaxAmplitude",
    "EthaneRatio",
    "EthaneRatioUncertainty",
    "Disposition",
    "ClassificationConfidence",
    "LISANumber",
    "BelowRRA",
    "GpsLatitude",
    "GpsLongitude",
    # Leak details
    "LeakProbability".lower(),
    "BoxId".lower(),
    "LeakGrade".lower(),
    "FoundDateTime".lower(),
    "agbg",
    "LeakFound".lower(),
    "LeakLocation".lower(),
    "LeakLatitude".lower(),
    "LeakLongitude".lower(),
    # Emission rates and factors
    "emissionrate_measured_scfh",
    "emissionrate_measured_lpm",
    "emission_bin",
    "emission_factor_lpm",
    "emissionfactor_leakprob_lpm",
]

# Select and reorder, then show the resulting schema and a preview.
emission_sources = emission_sources[column_order]
emission_sources.info()
emission_sources.head()

## Save for Customer

In [None]:
# Write the customer-facing table without the DataFrame index.
customer_csv_path = (
    f"data/leaks-with-emission-factors-{CUSTOMER}_report_CR-763DCC_CR-2AE499_.csv"
)
emission_sources.to_csv(customer_csv_path, index=False)

# For June 2022

In [None]:
# Previously exported results that the new rows will be appended to.
previous_export_path = "data/leaks-with-emission-factors-italgas_until_June_29.csv"
emission_sources_big = pd.read_csv(previous_export_path)

In [None]:
# Print the schema of the previously exported table before concatenating.
emission_sources_big.info()

In [None]:
# Stack the earlier export on top of the rows prepared in this notebook and
# write the combined table without the DataFrame index.
frames = [emission_sources_big, emission_sources]
emission_sources_final = pd.concat(frames)

combined_csv_path = (
    f"data/leaks-with-emission-factors-{CUSTOMER}_report_until_June_702_reports.csv"
)
emission_sources_final.to_csv(combined_csv_path, index=False)