In [39]:
import pandas as pd
from utils_steo.find_last_update import updatecompiler
from utils_steo.create_more_files import main as create_more_files

def get_last_release():
    """Return the first day of the month of the latest known release.

    Reads the frame produced by ``updatecompiler()``, takes the entry labelled
    ``0`` in its ``new_release_date`` column and resets the day to 1.

    NOTE(review): this presumably relies on ``updatecompiler()`` returning its
    rows newest-first with a default integer index -- confirm in that helper.
    """
    release_log = updatecompiler()
    newest_release = release_log['new_release_date'][0]
    return newest_release.replace(day=1)

def get_download_list(offset_month):
    """Build the table of monthly STEO archive workbooks to download.

    Parameters
    ----------
    offset_month : int
        Number of consecutive months to list, ending at (and including) the
        month returned by ``get_last_release()``.

    Returns
    -------
    pandas.DataFrame
        One row per month, newest first, with the columns ``release_date``
        (month start), ``year`` (two-digit year), ``monthStr`` (lower-case
        abbreviated month name), ``dates`` (e.g. ``oct24``) and ``url`` (the
        archive workbook address built from ``dates``).
    """
    last_month = get_last_release()
    first_month = last_month - pd.DateOffset(months=offset_month - 1)

    listing = pd.DataFrame(
        pd.date_range(first_month, last_month, freq='MS'),
        columns=['date'],
    )
    # Two-digit year and lower-case month abbreviation form the archive file stem.
    listing['year'] = listing['date'].dt.strftime('%y').astype(str)
    listing['monthStr'] = listing['date'].dt.strftime('%b').str.lower()
    listing['dates'] = listing['monthStr'] + listing['year']
    listing['url'] = 'https://www.eia.gov/outlooks/steo/archives/' + listing['dates'] + '_base.xlsx'

    newest_first = listing.sort_values(by='date', ascending=False).reset_index(drop=True)
    return newest_first.rename(columns={'date': 'release_date'})


class steo:
    """Download STEO archive workbooks from eia.gov and reshape them.

    Each workbook is flattened into a long ``id`` / ``period`` / ``value``
    table, enriched with the ``name`` and ``uom`` columns of a metadata CSV,
    and finally pivoted into one row per series and release date.
    """

    def __init__(self, meta_path="lookup/metadata_steo.csv"):
        """Load the series metadata.

        Parameters
        ----------
        meta_path : str or file-like, optional
            Location of the metadata CSV. It must provide an ``id`` column to
            join on plus the ``name`` and ``uom`` columns selected by
            ``_add_meta``. Defaults to the path the class always used.
        """
        self.meta = pd.read_csv(meta_path)

    def _get_file(self, pathname):
        """Read every sheet of a workbook and stack them into one frame.

        The third spreadsheet row is used as the header (``header=2``). The
        ``Dates`` and ``Contents`` sheets are discarded when present; a
        workbook lacking them is no longer an error.
        """
        sheets = pd.read_excel(pathname, sheet_name=None, header=2)
        for non_data_sheet in ("Dates", "Contents"):
            sheets.pop(non_data_sheet, None)
        return pd.concat(sheets, ignore_index=True)

    def _clean_data(self, df):
        """Turn the stacked raw sheets into a tidy wide table.

        Steps:
        1. rename the first two columns to ``id`` / ``name`` and drop rows
           without an ``id``;
        2. rebuild the period column labels as ``YYYY-MM-01`` strings from the
           year carried in the header labels and the month abbreviations held
           in the first remaining row;
        3. remove "Table of Contents" rows and every repetition of the header
           row, drop the ``name`` column and de-duplicate.

        Returns a new frame with ``id`` followed by one column per period.
        """
        df = df.rename(columns={"Forecast date:": "id", "Unnamed: 1": "name"})
        df = df.dropna(subset=["id"])

        # A stray trailing 'Unnamed: 0' column carries no period data.
        if df.columns[-1] == 'Unnamed: 0':
            df = df.drop(df.columns[-1], axis=1)

        # Only the first month of each year carries the year as its label; the
        # following 'Unnamed: N' labels inherit the year to their left.
        filled_labels = []
        for label in map(str, df.columns):
            if "Unnamed" in label and filled_labels:
                label = filled_labels[-1]
            filled_labels.append(label)

        # The first remaining row holds the month abbreviation of each column.
        month_labels = df.iloc[0].tolist()[2:]
        periods = [
            pd.to_datetime(f"{year}-{month}-01", format="%Y-%b-%d").strftime("%Y-%m-%d")
            for year, month in zip(filled_labels[2:], month_labels)
        ]
        df.columns = ["id", "name"] + periods

        # Literal (non-regex) matching on stringified ids: the header marker is
        # arbitrary text and must not be interpreted as a pattern, and
        # non-string ids must not produce NaN masks.
        is_toc = df["id"].astype(str).str.contains("Table of Contents", regex=False)
        df = df[~is_toc]
        header_marker = str(df.iloc[0, 0])
        is_header = df["id"].astype(str).str.contains(header_marker, regex=False)
        df = df[~is_header]

        df = df.drop(columns=df.columns[1])
        return df.drop_duplicates()

    def _unpivot_data(self, df):
        """Melt the wide table into ``id`` / ``period`` / ``value`` rows.

        The first column is treated as ``id`` and every other column as a
        period. Missing values are replaced by 0 *before* numeric conversion
        (behaviour kept from the original), so only non-numeric text ends up
        as NaN.
        """
        long_df = pd.melt(
            df,
            id_vars="id",
            value_vars=df.columns[1:],
            var_name="period",
            value_name="value",
        )
        long_df["value"] = pd.to_numeric(long_df["value"].fillna(0), errors="coerce")
        return long_df

    def _add_meta(self, df):
        """Left-join the metadata on ``id`` and fix the column order.

        ``df`` must already contain ``period``, ``value`` and ``release_date``;
        ``name`` and ``uom`` come from the metadata and are NaN for unknown ids.
        """
        merged = df.merge(self.meta, on="id", how="left")
        return merged[["id", "name", "uom", "period", "value", "release_date"]]

    def get_data(self, pathname):
        """Load one workbook and return it as a long ``id``/``period``/``value`` frame."""
        raw = self._get_file(pathname)
        wide = self._clean_data(raw)
        return self._unpivot_data(wide)

    def get_all(self, offset_months=3):
        """Download the latest ``offset_months`` releases and build the pivot.

        Side effects: prints progress, writes ``steo_pivot.feather`` to the
        working directory and then calls ``create_more_files()``.

        Returns
        -------
        pandas.DataFrame
            One row per (``id``, ``name``, ``release_date``, ``uom``) with one
            column per period; duplicates are combined by ``pivot_table``'s
            default aggregation (mean).

        Raises
        ------
        RuntimeError
            If not a single workbook could be downloaded and parsed.
        """
        releases = get_download_list(offset_months)

        frames = []
        for release_date, pathname in zip(releases["release_date"], releases["url"]):
            print(f"downloading: {pathname}")
            try:
                release_df = self.get_data(pathname)
            except Exception as exc:
                # As before, stop at the first workbook that cannot be
                # processed, but report why instead of hiding the cause.
                print(f"error: {pathname}: {exc!r}")
                break
            release_df["release_date"] = release_date
            frames.append(release_df)

        if not frames:
            raise RuntimeError("no STEO workbook could be downloaded and parsed")

        df = pd.concat(frames, ignore_index=True)
        df["id"] = df["id"].str.upper()
        df = self._add_meta(df)
        df = df.drop_duplicates().dropna(subset=["value", "name"])

        df = pd.pivot_table(
            df,
            index=['id', 'name', 'release_date', 'uom'],
            columns='period',
            values='value',
        ).reset_index()
        # NOTE(review): read_pivot() in this file reads './data/steo_pivot.feather'
        # while this writes to the working directory -- confirm which location
        # create_more_files() expects before aligning the two.
        df.to_feather('steo_pivot.feather')

        create_more_files()

        return df

def main(offset_months=3):
    """Run the full download-and-pivot pipeline and return its result.

    ``offset_months`` is forwarded unchanged to ``steo.get_all``.
    """
    return steo().get_all(offset_months)

def read_pivot():
    """Load the pivot table stored at ``./data/steo_pivot.feather``.

    NOTE(review): ``steo.get_all`` writes ``steo_pivot.feather`` without the
    ``data/`` prefix -- confirm something moves the file into ``./data``.
    """
    return pd.read_feather('./data/steo_pivot.feather')

def melt_pivot(df=None):
    """Unpivot a STEO pivot table into long ``period`` / ``value`` rows.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Pivot table to melt. When omitted, the table is loaded with
        ``read_pivot()``.

    Returns
    -------
    pandas.DataFrame
        The identifier columns found in ``df`` (any of ``id``, ``name``,
        ``uom``, ``release_date``, in that order) followed by ``period`` (the
        former column label) and ``value``.
    """
    if df is None:
        # The original called the non-existent ``pd.read_pivot``.
        df = read_pivot()

    # Accept pivots with or without a ``uom`` column; everything that is not an
    # identifier is a period column. A single melt is enough -- melting twice
    # would fold ``uom``/``period``/``value`` back into the value column.
    id_columns = [
        column
        for column in ('id', 'name', 'uom', 'release_date')
        if column in df.columns
    ]
    return pd.melt(df, id_vars=id_columns, var_name='period', value_name='value')

# Entry point: when this code runs as the top-level module (a script, or a
# notebook cell as the captured "downloading: ..." output below shows), execute
# the pipeline with the default three-month window and keep the pivot in `df`.
if __name__ == "__main__":
    df = main()

downloading: https://www.eia.gov/outlooks/steo/archives/oct24_base.xlsx
downloading: https://www.eia.gov/outlooks/steo/archives/sep24_base.xlsx
downloading: https://www.eia.gov/outlooks/steo/archives/aug24_base.xlsx


In [38]:
from utils_steo.calcs import melt_pivot

import pandas as pd
# Load the `steo_pivot_dpr` feather file and derive an id -> name lookup from
# its distinct (id, name) pairs.
df = pd.read_feather('data/steo_pivot_dpr.feather')
meta_name = (
    df[['id', 'name']]
    .drop_duplicates()
    .set_index('id')['name']
    .to_dict()
)
# Disabled: the frame displayed below has no 'uom' column.
# meta_uom = df[['id', 'uom']].drop_duplicates().set_index('id').to_dict()['uom']
df

period,id,name,release_date,2020-01-01,2020-02-01,2020-03-01,2020-04-01,2020-05-01,2020-06-01,2020-07-01,...,2025-03-01,2025-04-01,2025-05-01,2025-06-01,2025-07-01,2025-08-01,2025-09-01,2025-10-01,2025-11-01,2025-12-01
233,COEOPAP,"appalachia existing oil production change, one...",2024-08-01,-13.645884,-13.418639,-12.669610,-11.889344,-10.487761,-9.651850,-10.008547,...,,,,,,,,,,
234,COEOPAP,"appalachia existing oil production change, one...",2024-09-01,-13.645884,-13.418639,-12.669610,-11.889344,-10.487761,-9.651850,-10.008547,...,,,,,,,,,,
235,COEOPAP,"appalachia existing oil production change, one...",2024-10-01,-13.792100,-13.639410,-12.894905,-12.103376,-10.590656,-9.670815,-9.983088,...,,,,,,,,,,
236,COEOPBK,"bakken existing oil production change, one-yea...",2024-08-01,-117.321745,-133.252086,-131.136679,-130.555741,-88.723486,-72.725975,-49.891286,...,,,,,,,,,,
237,COEOPBK,"bakken existing oil production change, one-yea...",2024-09-01,-117.321745,-133.252086,-131.136679,-130.555741,-88.723486,-72.725975,-49.891286,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2250,TOPREF,tight oil production from eagle ford formation,2024-09-01,1.263718,1.254840,1.252706,1.163288,0.826643,0.897864,0.990711,...,,,,,,,,,,
2251,TOPREF,tight oil production from eagle ford formation,2024-10-01,1.264000,1.255000,1.253000,1.163000,0.827000,0.898000,0.991000,...,,,,,,,,,,
2261,TOPRPM,tight oil production from permian formations,2024-08-01,4.082302,4.069741,4.169507,3.941392,3.394778,3.668893,3.713521,...,,,,,,,,,,
2262,TOPRPM,tight oil production from permian formations,2024-09-01,4.080368,4.067173,4.167300,3.940212,3.393467,3.668833,3.714592,...,,,,,,,,,,
