In [1]:
import newspaper
import pandas as pd

In [2]:
# Front page to crawl; newspaper scans it and collects the links it finds there.
link = 'https://www.cnn.com'
page_features = newspaper.build(
    link,
    language='en',
    memoize_articles=False,
)


In [3]:
from tqdm.auto import tqdm

def extract(
    link: str = 'https://www.cnn.com',
    max_articles: int = 100,
    max_chars: int = 10000,
) -> pd.DataFrame:
    """Scrape articles linked from a news front page into a DataFrame.

    Args:
        link: Page handed to ``newspaper.build`` to discover article links.
        max_articles: Only the first ``max_articles`` discovered articles are
            downloaded.
        max_chars: Articles whose text is ``max_chars`` characters or longer
            are dropped from the result.

    Returns:
        A DataFrame with the columns ``title``, ``article`` and ``url``.
        Articles whose URL starts with ``https://edition.cnn.com`` are skipped,
        as are articles that fail to download or parse. The columns are present
        even when no article was collected.
    """
    columns = ['title', 'article', 'url']

    # Scans the webpage and finds all the links on it.
    page_features = newspaper.build(
        link,
        language='en',
        memoize_articles=False
    )

    data = []
    for article in tqdm(page_features.articles[:max_articles]):
        try:
            # Each article must be downloaded, then parsed individually.
            # This loads the text and title from the webpage to the object.
            article.download()
            article.parse()

            if not article.url.startswith('https://edition.cnn.com'):
                data.append({
                    'title': article.title,
                    'article': article.text,
                    'url': article.url,
                })
        except Exception as exc:
            # Best effort: report the failing article and move on. Catching
            # Exception (not a bare except) lets Ctrl-C still stop the loop.
            print("Article Download Failed.", article.url, exc)

    # Passing the column list keeps the schema stable when `data` is empty;
    # without it the frame has no 'article' column and the filter raises KeyError.
    df = pd.DataFrame(data, columns=columns)
    if df.empty:
        return df
    return df[df['article'].str.len() < max_chars]

In [4]:
# Run the scrape and keep the result in the notebook-level name `df`,
# which the later cells read.
df = extract()
# Bare last expression so the notebook renders the frame as the cell output.
df

  0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,title,article,url
0,CNN Business,1. How relevant is this ad to you?\n\nVideo pl...,https://www.cnn.com/business/media
1,Live updates: Wagner chief stoking 'armed rebe...,Equipment of the Wagner private military compa...,https://www.cnn.com/europe/live-news/russia-uk...
2,Timeline of the Titanic sub implosion and search,CNN —\n\nA day before the disappearance of the...,https://www.cnn.com/2023/06/24/us/missing-tita...
3,NHL joins MLB in advising teams to halt wearin...,CNN —\n\nNational Hockey League Commissioner G...,https://www.cnn.com/2023/06/23/us/mlb-nhl-prid...
4,The ultra-wealthy have dangerous pastimes. Who...,"New York CNN —\n\nThroughout history, humans h...",https://www.cnn.com/2023/06/24/business/extrem...
...,...,...,...
95,In pictures: The collapse of Ukraine’s Nova Ka...,The Nova Kakhovka dam in southern Ukraine coll...,https://www.cnn.com/2023/06/07/world/gallery/u...
96,How Ukraine's ‘iron people’ keep the country o...,Dmitrii Prishedko and Victor Bondar operate an...,https://www.cnn.com/interactive/2023/05/world/...
97,A federal judge temporarily blocks a Florida l...,CNN —\n\nA federal judge on Friday temporarily...,https://www.cnn.com/2023/06/23/us/drag-queen-s...
98,How much will Lake Mead rise? Maps and charts ...,"CNN —\n\nLake Mead, the nation’s largest reser...",https://www.cnn.com/2023/06/24/us/how-much-lak...


In [5]:
import requests

def post_req(url:str, params = None, data: dict = None, timeout = None) -> "requests.Response | None":
    """POST ``data`` as a JSON body to ``url`` and return the response on HTTP 200.

    Args:
        url: Endpoint to post to.
        params: Optional query-string parameters forwarded to ``requests.post``.
        data: Payload sent as the JSON request body.
        timeout: Forwarded to ``requests.post``. The default ``None`` keeps the
            previous behavior of waiting without a limit.

    Returns:
        The ``requests.Response`` when the status code is exactly 200.
        ``None`` when the request raised a ``requests.RequestException`` or the
        server answered with any other status code; a message is printed in
        both cases. Callers must check for ``None`` before using the result.
    """
    headers = {
        "Content-type": "application/json",
    }
    try:
        r = requests.post(
            url,
            params=params,
            json=data,
            headers=headers,
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Only request/transport failures are reported here; programming errors
        # are left to propagate instead of being hidden behind a print.
        print("error happen here:\n", e)
        return None

    if r.status_code != 200:
        print("request code is not 200:", r.status_code)
        return None
    return r

In [6]:
def transform(model_uri: str, df: pd.DataFrame, n_samples: "int | None" = 10) -> pd.DataFrame:
    """Add a ``summary`` column by sending the articles to the summarization service.

    Args:
        model_uri: Host of the service; the request goes to
            ``http://{model_uri}:8000/article/summarize_batch``.
        df: Frame with an ``article`` column holding the texts to summarize.
        n_samples: Number of rows randomly sampled before summarizing (capped
            at ``len(df)``). Pass ``None`` to summarize every row.
            TODO: the default of 10 is a leftover debugging limit.

    Returns:
        A new DataFrame (the input is not modified) containing the selected
        rows plus a ``summary`` column filled from the JSON body of the response.

    Raises:
        RuntimeError: If ``post_req`` returned no response.
    """
    if df.empty:
        # Nothing to summarize; skip the request but keep the output schema.
        return df.assign(summary=[])

    if n_samples is not None:
        # Cap at len(df): sampling more rows than exist raises ValueError.
        df = df.sample(min(n_samples, len(df)))

    # generate text summarization
    r = post_req(
        url = f"http://{model_uri}:8000/article/summarize_batch",
        data = dict(
            articles=dict(articles=df['article'].tolist()),
            # NOTE(review): 'num_beans' looks like a typo for 'num_beams';
            # left unchanged because the service's schema is not visible here.
            config=dict(num_beans=8, temperature=1.0)
        )
    )
    if r is None:
        # post_req reports failures by returning None; fail with a clear error
        # instead of an AttributeError on `None.json()`.
        raise RuntimeError("summarization request failed: no response received")

    # presumably the body is a list with one summary per article, in request
    # order - TODO confirm against the service.
    return df.assign(summary=r.json())

In [7]:
# NOTE: this rebinds `df` to the frame returned by transform(), so the frame
# produced by extract() is no longer reachable under this name afterwards.
# "172.17.0.1" is the model_uri, i.e. the host transform() posts to on port 8000.
df = transform("172.17.0.1", df)
# Bare last expression so the notebook renders the summarized frame.
df

Unnamed: 0,title,article,url,summary
79,"Gannett sues Google, alleging it has an online...","CNN —\n\nGannett, the largest newspaper publis...",https://www.cnn.com/2023/06/20/tech/gannett-ne...,The publisher of USA Today and more than 200 l...
32,JPMorgan fined $4 million by SEC for deleting ...,New York CNN —\n\nThe Securities and Exchange ...,https://www.cnn.com/2023/06/23/business/jpmorg...,JPMorgan Chase fined JPMorgan Chase $4 million...
60,Abortion is ancient history and that matters t...,"CNN —\n\nAbortion today, at least in the Unite...",https://www.cnn.com/2023/06/23/health/abortion...,"Egyptian papyrus, Roman coins, medieval biogra..."
84,Victor Wembanyama Celebrated by Fans After Goi...,TIMOTHY A. CLARY/AFP via Getty Images\n\nAfter...,https://bleacherreport.com/articles/10080347-v...,Wembanyama is expected to be the No.
68,Fact check: Big differences between Hunter Bid...,Washington CNN —\n\nMany Republicans have crit...,https://www.cnn.com/2023/06/23/politics/fact-c...,Some Republicans have focused on how the prose...
54,The world’s 50 best restaurants revealed,CNN —\n\nPeru’s capital city – and undoubtedly...,https://www.cnn.com/travel/worlds-50-best-rest...,The 50 Best Restaurants awards took the No.
71,A timeline of the abortion access landscape on...,How the frenzy of legal actions shifted the la...,https://www.cnn.com/interactive/2023/06/us/abo...,The Supreme Court has issued its ruling in Dob...
4,The ultra-wealthy have dangerous pastimes. Who...,"New York CNN —\n\nThroughout history, humans h...",https://www.cnn.com/2023/06/24/business/extrem...,This week's catastrophic implosion of the Ocea...
63,Why Chinese students are taking graduation pho...,Hong Kong CNN —\n\nOne photo shows the young w...,https://www.cnn.com/2023/06/23/china/china-gra...,Chinese social media has become awash with ton...
9,Analysis: Moscow is at risk of losing its iron...,CNN —\n\nThis just does not happen in Vladimir...,https://www.cnn.com/2023/06/24/europe/russia-p...,Russian president is facing the most serious t...


In [10]:
import mysql.connector

import os
from getpass import getpass

# Connection settings come from the environment so no credential is stored in
# the notebook. The password has no fallback value: it is prompted for when
# MYSQL_PASSWORD is unset.
mysql_password = os.environ.get('MYSQL_PASSWORD') or getpass('MySQL password: ')

# Column order must match the INSERT statement below; selecting explicitly
# avoids depending on whatever order `df` happens to have.
insert_columns = ['title', 'article', 'url', 'summary']

# NOTE(review): load() defined in this notebook appends to the same
# summarizer.batch_summarization table; running both on the same frame would
# insert the rows twice.
with mysql.connector.connect(
    host=os.environ.get('MYSQL_HOST', 'localhost'),
    database=os.environ.get('MYSQL_DATABASE', 'summarizer'),
    user=os.environ.get('MYSQL_USER', 'root'),
    password=mysql_password,
) as conn:
    with conn.cursor() as cursor:
        mysql_quey = """
        INSERT INTO summarizer.batch_summarization (title, article, `url`, summary) 
        VALUES (%s, %s, %s, %s)
        """
        # One plain tuple per row, passed as parameters so the driver handles
        # the escaping.
        params = list(df[insert_columns].itertuples(index=False, name=None))
        cursor.executemany(mysql_quey, params)
        conn.commit()

In [9]:
from sqlalchemy import create_engine

def load(df: pd.DataFrame, conn_str:str) -> None:
    """Append ``df`` to the ``summarizer.batch_summarization`` table.

    Args:
        df: Frame to write; its index is not written.
        conn_str: Connection string passed to ``sqlalchemy.create_engine``.

    The write runs inside ``engine.begin()``. The engine is disposed once the
    write finishes or fails, so repeated calls do not leave engines behind.
    """
    engine = create_engine(conn_str)
    try:
        with engine.begin() as conn:
            df.to_sql(
                name = 'batch_summarization',
                con = conn,
                schema = 'summarizer',
                if_exists = 'append',
                index = False,
                method = 'multi',
                chunksize = 1000
            )
    finally:
        # The engine is created per call, so release it here.
        engine.dispose()
