# Import libraries

In [10]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sys.path.append(os.path.abspath(os.path.join('../../')))

In [None]:
from src.etl.bronze.extract.data_structure_extract_strategy import DataExtractor
import src.analysis.EDA.data_inspection_strategy as BaseInspector
import src.etl.transform.data_cleansing_strategy as BaseCleansing
import src.etl.transform.data_transform_strategy as BaseTransform
import src.etl.transform.data_combining_strategy as BaseCombining
import src.analysis.visualization.data_univariate_visualization_strategy as BaseViz
import src.etl.transform.data_encoding_strategy as BaseEncoder

In [11]:
# Read the bronze-layer news table into a DataFrame.
# The connection string comes from the `Connection_String` environment
# variable; `os.environ.get` yields None when that variable is not set.
bronze_news_cfg = {
    "type": "db",
    "conn_str": os.environ.get("Connection_String"),
    "schema": "bronze",
    "table": "source_news_articles_currentsapi",
}

# The extractor is used as a context manager so it can release whatever it
# holds once the block exits.
with DataExtractor.get_extractor(cfg=bronze_news_cfg) as Extractor:
    df_mas_news = Extractor.extractor()



In [12]:
# Display the freshly extracted frame to check its columns and sample rows
# (pandas elides the middle rows when rendering).
df_mas_news

Unnamed: 0,id,title,description,url,author,image,language,category,published,createdate,usercreate,updatedate,userupdate,activedata,batch_id,source_system,layer,region,source
0,19df6eb7-b237-48e9-88fb-05e521140c1f,Kim Jong Un shows off 'most powerful' ballisti...,North Korean leader Kim Jong Un displayed a ne...,https://www.foxnews.com/world/kim-jong-un-show...,Fox News,https://a57.foxnews.com/static.foxnews.com/fox...,en,['general'],2025-10-11 02:26:15 +0000,2025-10-11 06:00:14.696096,system,NaT,,True,scheduled__2025-10-10T06:00:00+00:00,currentsapi,bronze,,
1,28bde0c1-0703-4796-8138-ee3e90f6083b,U.S. founders are 'shameless' with feedback an...,In the latest comparison between American and ...,https://www.cnbc.com/2025/10/11/american-found...,Sawdah Bhaimiya,https://image.cnbcfm.com/api/v1/image/10820977...,en,['world'],2025-10-11 02:07:12 +0000,2025-10-11 06:00:14.696096,system,NaT,,True,scheduled__2025-10-10T06:00:00+00:00,currentsapi,bronze,,
2,8e76dd13-f862-45b0-a4ca-3f2ce10d8221,US judge vows to rule 'soon' on Abrego Garcia'...,"GREENBELT, MD— A federal judge in Maryland on ...",https://www.foxnews.com/politics/us-judge-vows...,Fox News,https://a57.foxnews.com/static.foxnews.com/fox...,en,['general'],2025-10-11 01:59:04 +0000,2025-10-11 06:00:14.696096,system,NaT,,True,scheduled__2025-10-10T06:00:00+00:00,currentsapi,bronze,,
3,73fa763f-f3fd-4981-b1b7-95a5e2ed4e55,Trump declared in 'excellent overall health' b...,"President Donald Trump is in ""excellent overal...",https://www.foxnews.com/politics/trump-declare...,Fox News,https://a57.foxnews.com/static.foxnews.com/fox...,en,['general'],2025-10-11 01:13:56 +0000,2025-10-11 06:00:14.696096,system,NaT,,True,scheduled__2025-10-10T06:00:00+00:00,currentsapi,bronze,,
4,0ffe6100-8d63-4256-a03b-fbc2c802dbc0,"Joe Flacco reacts to fresh start with Bengals,...",Joe Flacco will once again wear a different un...,https://www.foxnews.com/sports/joe-flacco-reac...,Fox News,https://a57.foxnews.com/static.foxnews.com/fox...,en,['general'],2025-10-11 00:58:56 +0000,2025-10-11 06:00:14.696096,system,NaT,,True,scheduled__2025-10-10T06:00:00+00:00,currentsapi,bronze,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,6ecb33de-562c-4da7-b751-186b8168211b,Exclusive: DHS Vows Deportation Numbers Will I...,"On Wednesday's ""Alex Marlow Show,"" DHS Assista...",https://www.breitbart.com/clips/2025/10/15/exc...,Breitbart TV,https://media.breitbart.com/media/2025/10/1015...,en,['politics'],2025-10-16 00:55:08 +0000,2025-10-16 06:00:34.145446,system,NaT,,True,scheduled__2025-10-15T06:00:00+00:00,currentsapi,bronze,,
479,172c9827-4d58-4c0b-8b63-a523de0f1e5d,Blue city judge cites 'fear or obstruction' in...,A Chicago judge signed an order Wednesday proh...,https://www.foxnews.com/us/blue-city-judge-cit...,Fox News,https://a57.foxnews.com/static.foxnews.com/fox...,en,['regional'],2025-10-16 00:48:27 +0000,2025-10-16 06:00:34.145446,system,NaT,,True,scheduled__2025-10-15T06:00:00+00:00,currentsapi,bronze,,
480,40336713-2642-486c-bc30-8e4ff1b6cccc,State Department aware of reports after Americ...,The State Department told Fox News that it is ...,https://www.foxnews.com/world/state-department...,Fox News,https://a57.foxnews.com/static.foxnews.com/fox...,en,['world'],2025-10-16 00:28:50 +0000,2025-10-16 06:00:34.145446,system,NaT,,True,scheduled__2025-10-15T06:00:00+00:00,currentsapi,bronze,,
481,57304b27-f238-4c62-9145-17df3460056e,"Katie Porter Now Regrets Mistreating Reporter,...","Former Rep. Katie Porter (D-CA), the potential...",https://www.breitbart.com/politics/2025/10/15/...,Paul Bois,https://media.breitbart.com/media/2025/10/Kati...,en,['politics'],2025-10-16 00:22:37 +0000,2025-10-16 06:00:34.145446,system,NaT,,True,scheduled__2025-10-15T06:00:00+00:00,currentsapi,bronze,,


# Import, cleanse, and transform data

In [None]:
# Cleansing / transform pipeline for the bronze database extract.
#
# Column names follow the schema shown when `df_mas_news` is displayed after
# extraction: the text column is `title`, the image column is `image`, and the
# timestamp column is `published`.
cleansing_db_strategies = [
    # Keep only title / description / category. The pipeline bookkeeping
    # columns (batch_id, source_system, layer, region) are dropped as well so
    # they neither defeat duplicate detection nor leak into the combined frame.
    BaseCleansing.DropColumnsCleansing(columns=[
        'id', 'source', 'author', 'image', 'published', 'url',
        'language', 'createdate', 'usercreate', 'updatedate', 'userupdate', 'activedata',
        'batch_id', 'source_system', 'layer', 'region',
    ]),
    BaseCleansing.DropDuplicateKeepFirstCleansing(),
    BaseCleansing.SpaceCleansing(columns=['title', 'description']),
    # Turn empty / single-space strings into NaN so DropNaCleansing can catch them.
    BaseCleansing.ReplaceForCleansing(columns=['title', 'description'], to_replace=["", " "], to_replace_with=np.nan),
    BaseCleansing.DropNaCleansing(['title']),
    # Only columns that survive the drop step are listed here.
    BaseCleansing.LowerCaseCleansing(columns=['title', 'description', 'category']),
]

transform_db_strategies = [
    # Dtype casting is currently disabled:
    # BaseTransform.AstypeTransform(dtype_map={
    #     'title': 'str',
    #     'description': 'str',
    #     'category': 'category'
    # }),
    # `title` and `description` already carry their final names.
    BaseTransform.RenameColumnsTransform(rename_map={
        'category': 'categories'
    }),
]

# NOTE(review): the displayed extract holds category values such as
# "['general']" (a stringified list) — confirm whether they need unwrapping
# before they are compared with the plain labels of the other sources.

# Cleanse and transform the extracted frame.
cleansing_db = BaseCleansing.DataCleansing(cleansing_db_strategies)
transform_db = BaseTransform.DataTransform(transform_db_strategies)

df_mas_news = cleansing_db.clean(df_mas_news)
df_mas_news = transform_db.transform(df_mas_news)
df_mas_news.head()


In [None]:
# Cleansing / transform pipeline for the Clovis Vieira news-sentiment CSV.
cleansing_csv_strategies = [
    BaseCleansing.DropColumnsCleansing(columns=['Source', 'Author', 'Sentiment', 'URL', 'Published At']),
    BaseCleansing.DropDuplicateKeepFirstCleansing(),
    BaseCleansing.SpaceCleansing(columns=['Title', 'Description']),
    # Turn empty / single-space strings into NaN so DropNaCleansing can catch them.
    BaseCleansing.ReplaceForCleansing(columns=['Title', 'Description'], to_replace=["", " "], to_replace_with=np.nan),
    BaseCleansing.DropNaCleansing(['Title']),
    # 'Source' and 'Author' are removed by the first step, so they must not be
    # listed here; only surviving columns are lower-cased.
    BaseCleansing.LowerCaseCleansing(columns=['Title', 'Description', 'Type']),
]
transform_csv_strategies = [
    # Dtype casting is currently disabled:
    # BaseTransform.AstypeTransform(dtype_map={
    #     'Title': 'str',
    #     'Description': 'str',
    #     'Type': 'category'
    # }),
    # Align column names with the other sources.
    BaseTransform.RenameColumnsTransform(rename_map={
        'Title': 'title',
        'Description': 'description',
        'Type': 'categories'
    })
]

# Load, cleanse, and transform data
Extractor_csv = DataExtractor.get_extractor(cfg={
    "type": "csv",
    "path": os.path.join(
        os.path.abspath('../../..'),
        "data", "raw", "source_clovis_vieira_news_sentiment_analysis", "data.csv",
    ),
})
df_clovis_vieira = Extractor_csv.extractor()

cleansing_csv = BaseCleansing.DataCleansing(cleansing_csv_strategies)
transform_csv = BaseTransform.DataTransform(transform_csv_strategies)

df_clovis_vieira = cleansing_csv.clean(df_clovis_vieira)
df_clovis_vieira = transform_csv.transform(df_clovis_vieira)
df_clovis_vieira.head()

In [None]:
# BBC News Summary source: load the CSV, cleanse it, and rename its columns
# to the shared title / description / categories layout.
bbc_text_columns = ['News Articles', 'Summaries']
bbc_csv_path = os.path.join(
    os.path.abspath('../../..'),
    "data",
    "raw",
    "source_pariza_sharif_BBC_news_summary",
    "BBC News Summary",
    "local_bbc_pariza_sharif.csv",
)

cleansing_bbc_strategies = [
    BaseCleansing.DropColumnsCleansing(columns=['filename']),
    BaseCleansing.DropDuplicateKeepFirstCleansing(),
    BaseCleansing.SpaceCleansing(columns=list(bbc_text_columns)),
    # Empty and single-space strings become NaN ahead of the DropNa step.
    BaseCleansing.ReplaceForCleansing(
        columns=list(bbc_text_columns),
        to_replace=["", " "],
        to_replace_with=np.nan,
    ),
    BaseCleansing.DropNaCleansing(['News Articles']),
    BaseCleansing.LowerCaseCleansing(columns=list(bbc_text_columns)),
]

# The AstypeTransform step ('News Articles'/'Summaries' -> 'str',
# 'category' -> 'category') is currently disabled; only the rename runs.
transform_bbc_strategies = [
    BaseTransform.RenameColumnsTransform(
        rename_map={
            'News Articles': 'title',
            'Summaries': 'description',
            'category': 'categories',
        }
    ),
]

# Extract the raw frame.
Extractor_bbc = DataExtractor.get_extractor(cfg={"type": "csv", "path": bbc_csv_path})
df_pariza_sharif = Extractor_bbc.extractor()

# Run cleansing first, then the transform, on the extracted frame.
cleansing_bbc = BaseCleansing.DataCleansing(cleansing_bbc_strategies)
transform_bbc = BaseTransform.DataTransform(transform_bbc_strategies)
df_pariza_sharif = transform_bbc.transform(cleansing_bbc.clean(df_pariza_sharif))
df_pariza_sharif.head()

In [None]:
# Merge the three cleaned sources into a single frame `df`.
# NOTE(review): presumably `combine` concatenates `df_mas_news` with the two
# frames handed to ConcatenateDataFramesCombining — confirm against
# src.etl.transform.data_combining_strategy.
combiner = [
    BaseCombining.ConcatenateDataFramesCombining([df_clovis_vieira, df_pariza_sharif])
]
combined = BaseCombining.DataCombining(combiner)
df = combined.combine(df_mas_news)

# Only the last expression of a cell is rendered, so a bare `df.head()` placed
# before `df.shape` would be evaluated and thrown away. Print the shape and
# leave the preview as the final expression so both are visible.
print(f"Combined shape: {df.shape}")
df.head()

In [None]:
# Post-merge tidy-up of the combined frame: drop duplicate rows, then replace
# the category labels 'sports' and 'technology' with 'sport' and 'tech'.
category_aliases = {
    'sports': 'sport',
    'technology': 'tech',
}

cleansing_df_strategies = [BaseCleansing.DropDuplicateKeepFirstCleansing()]
clean_df = BaseCleansing.DataCleansing(cleansing_df_strategies)

transform_df_strategies = [
    BaseTransform.ReplaceValuesTransform(column='categories', replace_map=category_aliases),
]
transform_df = BaseTransform.DataTransform(transform_df_strategies)

# Cleansing runs first; its result feeds the transform.
df = transform_df.transform(clean_df.clean(df))


In [None]:
# Univariate bar plot for the `categories` column of the combined frame.
barplot_sns_kwargs = {"palette": "pastel", "hue": "category", "legend": False}
barplot_plt_kwargs = {"figsize": (10, 5), "title": 'categories', "rotation": 20, "fontsize": 10}
# NOTE(review): `hue` is "category" while the plotted column is 'categories' —
# confirm this matches the column name BarplotUniViz hands to seaborn.

strategies_viz = [
    BaseViz.BarplotUniViz(sns_kwargs=barplot_sns_kwargs, plt_kwargs=barplot_plt_kwargs),
]
viz = BaseViz.UniVisualizer(strategies_viz)
viz.visualize(df, 'categories')

In [None]:
# mapping = {'business': 0, 'entertainment': 1, 'general': 2, 'health': 3, 'politics': 4, 'science': 5, 'sport': 6, 'tech': 7}
# strategies_encoder = [
#     BaseEncoder.CustomMapEncoder(columns=['categories'], mapping=mapping)
#     ]
# strategies_transform = [
#     BaseTransform.AstypeTransform(dtype_map={
#         'title': 'string',
#         'description': 'string',
#         'categories': 'category',
#         'mapped_categories': 'int64'
#     })

# ]
# encoder = BaseEncoder.DataEncoder(strategies_encoder )
# transform= BaseTransform.DataTransform(strategies_transform)
# df = encoder.encode(df)
# df = transform.transform(df)

In [None]:
# First inspection pass over the combined frame: shape, dtypes, nulls,
# duplicates, and the value counts of `categories`.
strategies_inspection_v1 = [
    BaseInspector.ShapeInspection(),
    BaseInspector.DtypeInspection(),
    BaseInspector.NullInspection(),
    BaseInspector.DuplicatesInspection(),
    BaseInspector.ValueCountsInspection(columns=['categories']),
]

# `inspector` is reused by the next inspection cell.
inspector = BaseInspector.DataInspector(strategies_inspection_v1)
inspector.inspect(df)

In [None]:
# Second inspection pass: swap the existing inspector's strategies and re-run.
# DescribeInspection is currently switched off.
strategies_inspection_v2 = [
    # BaseInspector.DescribeInspection(),
    BaseInspector.InfoInspection(),
]

inspector.set_strategies(strategies_inspection_v2)
inspector.inspect(df)

In [None]:
import datetime

# Export the combined frame to the interim data area.
# The timestamp (to the second) in the file name gives each run its own file.
da = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
full_path = os.path.join(
    os.path.abspath('../../..'),
    "data",
    "interim",
    "data_news_segmentation",
    "description_null",
    f"newsseg_desc_null_v{da}.csv"
)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists (no-op when it is already there).
os.makedirs(os.path.dirname(full_path), exist_ok=True)

df.to_csv(full_path, index=False)
print(f"Data saved to {full_path}")