In [1]:
import pandas as pd

## Connect to PG Database

In [2]:
import psycopg2
import os
from dotenv import load_dotenv

# Read the Postgres .env file so the POSTGRES_* variables are available to os.getenv below.
# Left as the last expression so the boolean return value is shown as the cell output.
load_dotenv(dotenv_path="../postgres/.env")

True

In [3]:
# Database credentials are taken from the environment rather than hardcoded.
# A variable that is not set yields None here (no error is raised at this point).
PG_USER = os.environ.get('POSTGRES_USER')
PG_PW = os.environ.get('POSTGRES_PASSWORD')
PG_DB = os.environ.get('POSTGRES_DB')

In [4]:
# Keyword arguments for psycopg2.connect: host and port are fixed to a local
# server, while database/user/password come from the environment values above.
postgres_conn_args = dict(
    host="localhost",
    database=PG_DB,
    user=PG_USER,
    password=PG_PW,
    port=5432,
)

In [5]:
# Open the database connection from the keyword arguments in postgres_conn_args;
# pg_conn is reused by the query cell further down.
pg_conn = psycopg2.connect(**postgres_conn_args)

In [6]:
# --- Notebook parameters -------------------------------------------------
# Table the rows are selected from (FROM clause of Q).
table_reference = "blogs_dev_staging.stg_reddit__zero_shot"
# Number of highest-scoring columns kept per row; also the number of
# zero_shot_feature_<i> output columns.
n = 3
# Comma-separated columns that are dropped before the remaining columns are
# converted to numeric and ranked.
exclude_columns = "id,post_utc_date_created_at,zero_shot_text"
# Excluded column that is additionally left out of the final df_results.
text_column = "zero_shot_text"
# Column compared against the cutoff in the incremental WHERE clause.
date_column = "post_utc_date_created_at"
# Table whose max(date_column) anchors the incremental cutoff.
top_n_table = "blogs_dev_staging.stg_reddit__transpose_zero_shot"
# When True, Q is restricted to rows at or after the incremental cutoff;
# when False, every row of table_reference is selected.
is_incremental = False
# Interval literal subtracted from max(date_column) to form the cutoff.
incremental_interval = '3 day'

In [7]:
import pandas as pd

# Names of the output columns: zero_shot_feature_0 ... zero_shot_feature_<n-1>.
top_n_columns = [f"zero_shot_feature_{rank}" for rank in range(n)]
# Turn the comma-separated parameter into a list of column names.
exclude_columns_list = exclude_columns.split(",")

if is_incremental:
    # Only select rows dated at or after (latest date in top_n_table - interval).
    # NOTE(review): if top_n_table has no rows, max() is NULL and the comparison
    # presumably matches nothing — confirm this is the intended behaviour.
    incremental_query = f"""
WHERE {date_column} >= (
SELECT
    max({date_column}) - interval '{incremental_interval}'
FROM {top_n_table}
)
"""
else:
    # Full refresh: no filter is appended to the query.
    incremental_query = ""

# Final query text. Identifiers are interpolated from the parameter cell, so
# they must stay trusted notebook constants, never user input.
Q = f"""
SELECT *
FROM
    {table_reference}
{incremental_query}
"""

In [8]:
# print() is used instead of a bare expression so the query's newlines render
# as line breaks rather than as escaped "\n" in the string repr.
print(Q)


SELECT *
FROM
    blogs_dev_staging.stg_reddit__zero_shot




In [9]:
# Show the result of splitting exclude_columns on commas.
exclude_columns_list

['id', 'post_utc_date_created_at', 'zero_shot_text']

In [10]:
# Run the query and load the full result set into a DataFrame.
# NOTE(review): per the psycopg2 docs, `with connection` scopes a transaction
# (commit on success, rollback on error) and does not close the connection,
# so pg_conn should remain usable afterwards — confirm.
with pg_conn:
    df = pd.read_sql(sql=Q, con=pg_conn)

In [11]:
# Build df_results: the pass-through columns (every excluded column except
# text_column) followed by zero_shot_feature_0..n-1, which hold the names of
# the n highest-scoring numeric columns of each row, best first.

# Derive the pass-through columns into a NEW list instead of calling
# exclude_columns_list.remove(text_column): mutating the shared list made this
# cell fail with ValueError when re-run and silently changed what the next
# execution dropped from df.
passthrough_columns = [col for col in exclude_columns_list if col != text_column]
output_columns = passthrough_columns + top_n_columns

if df.empty:
    # No rows returned: emit an empty frame that still has the expected schema.
    df_results = pd.DataFrame(columns=output_columns)

else:
    # Keep only the score columns and coerce them to numeric dtypes.
    df_filtered = df[df.columns.drop(exclude_columns_list)]
    df_filtered = df_filtered.apply(pd.to_numeric)
    # Transposed view (labels as rows), kept for inspection in later cells.
    df_t = df_filtered.T

    # Collect one record per source row and build the frame once, rather than
    # growing a DataFrame with pd.concat inside the loop (quadratic copying).
    top_n_records = []
    for _, row_scores in df_filtered.iterrows():
        best_labels = row_scores.nlargest(n).index.tolist()
        # Pad when fewer than n score columns exist so every record has n slots.
        best_labels += [None] * (n - len(best_labels))
        top_n_records.append(best_labels)

    # Reuse df's index so the join below aligns row-for-row by construction.
    df_top_n = pd.DataFrame(top_n_records, columns=top_n_columns, index=df.index)

    df_results = df.join(df_top_n)
    df_results = df_results[output_columns]

In [15]:
# Inspect the transposed score matrix (df_filtered.T): one row per score
# column, one column per source row. df_t is only assigned when the query
# returned rows, so this cell raises NameError after an empty result.
df_t

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
career,0.534455,0.141005,0.610036,0.695442,0.564078,0.144809,0.509253,0.734711,0.329324,0.876295,...,0.312362,0.326311,0.072127,0.924495,0.116482,0.769924,0.101979,0.103798,0.962993,0.878054
ci_cd,0.100304,0.054991,0.317547,0.004628,0.000427,0.066024,0.006089,0.066514,0.100293,0.193207,...,0.436179,0.139757,0.02591,0.027221,0.127119,0.58458,0.038263,0.02313,0.248705,0.182176
container_orchestration,0.179832,0.046972,0.132781,0.002924,0.035447,0.78649,0.010628,0.053848,0.064657,0.045508,...,0.052265,0.223687,0.055157,0.005659,0.026263,0.048808,0.097063,0.048047,0.080922,0.045917
data_ingestion,0.257356,0.497458,0.42874,0.16756,0.030053,0.070273,0.068305,0.310522,0.4705,0.387505,...,0.339755,0.59016,0.144416,0.426798,0.556502,0.613156,0.355754,0.376484,0.207497,0.125505
data_lake,0.131036,0.089872,0.297882,0.014979,0.018639,0.059652,0.095155,0.234905,0.305511,0.131129,...,0.107749,0.285738,0.075298,0.119592,0.992924,0.208874,0.392662,0.146339,0.235999,0.162288
data_lakehouse,0.285507,0.144968,0.173472,0.021803,0.202316,0.253639,0.238004,0.205466,0.292468,0.229841,...,0.036662,0.238195,0.000643,0.019204,0.095303,0.384115,0.403455,0.37564,0.315218,0.145944
data_mesh,0.515945,0.177162,0.254806,0.092731,0.030059,0.154191,0.180866,0.235982,0.277067,0.222526,...,0.125807,0.315908,0.110129,0.352854,0.249413,0.192265,0.483176,0.226111,0.311319,0.148986
data_orchestration,0.54458,0.788386,0.519185,0.347691,0.089702,0.625179,0.062305,0.251255,0.412039,0.29009,...,0.727814,0.591882,0.437134,0.516862,0.915069,0.249599,0.41967,0.347103,0.23503,0.100657
data_transform,0.237736,0.647944,0.496813,0.077574,0.056414,0.124693,0.143855,0.299283,0.310724,0.372212,...,0.211523,0.43518,0.139495,0.154322,0.215883,0.306279,0.420478,0.29839,0.320939,0.191071
data_warehouse,0.219351,0.700686,0.445353,0.015754,0.01646,0.098559,0.054419,0.197366,0.266008,0.268899,...,0.177575,0.490567,0.262649,0.161543,0.706747,0.228156,0.19336,0.116606,0.201492,0.106329


In [13]:
# Inspect the numeric score columns (df without exclude_columns_list, coerced
# with pd.to_numeric). Only assigned when the query returned rows, so this
# cell raises NameError after an empty result.
df_filtered

Unnamed: 0,career,ci_cd,container_orchestration,data_ingestion,data_lake,data_lakehouse,data_mesh,data_orchestration,data_transform,data_warehouse,etl_jobs,learning,scheduling,streaming
0,0.534455,0.100304,0.179832,0.257356,0.131036,0.285507,0.515945,0.54458,0.237736,0.219351,0.069878,0.328071,0.040957,0.066578
1,0.141005,0.054991,0.046972,0.497458,0.089872,0.144968,0.177162,0.788386,0.647944,0.700686,0.048917,0.97904,0.158083,0.155553
2,0.610036,0.317547,0.132781,0.42874,0.297882,0.173472,0.254806,0.519185,0.496813,0.445353,0.219436,0.808945,0.584255,0.510787
3,0.695442,0.004628,0.002924,0.16756,0.014979,0.021803,0.092731,0.347691,0.077574,0.015754,0.017437,0.938391,0.089701,0.077023
4,0.564078,0.000427,0.035447,0.030053,0.018639,0.202316,0.030059,0.089702,0.056414,0.01646,0.027749,0.911352,0.103728,0.388229
5,0.144809,0.066024,0.78649,0.070273,0.059652,0.253639,0.154191,0.625179,0.124693,0.098559,0.048577,0.399778,0.197562,0.741686
6,0.509253,0.006089,0.010628,0.068305,0.095155,0.238004,0.180866,0.062305,0.143855,0.054419,0.034067,0.901122,0.047473,0.168244
7,0.734711,0.066514,0.053848,0.310522,0.234905,0.205466,0.235982,0.251255,0.299283,0.197366,0.076045,0.799596,0.256644,0.294716
8,0.329324,0.100293,0.064657,0.4705,0.305511,0.292468,0.277067,0.412039,0.310724,0.266008,0.130638,0.868781,0.226601,0.307237
9,0.876295,0.193207,0.045508,0.387505,0.131129,0.229841,0.222526,0.29009,0.372212,0.268899,0.185267,0.985058,0.194271,0.678592
