In [11]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("No GPU available, using CPU instead.")

Using GPU: Tesla T4


In [12]:
import requests
from bs4 import BeautifulSoup
import json
import os

url = 'https://jang.com.pk/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
main_story_home_divs = soup.find_all('div', class_='main-story-home')
a_tags = [a['href'] for div in main_story_home_divs for a in div.find_all('a', href=True)]

print("Extracted <a> tags:")
for a_tag in a_tags:
    print(a_tag)


Extracted <a> tags:
https://jang.com.pk/news/1421166
https://jang.com.pk/news/1421175
https://jang.com.pk/news/1421176
https://jang.com.pk/news/1421181
https://jang.com.pk/news/1421180
https://jang.com.pk/news/1421164
https://jang.com.pk/news/1421167
https://jang.com.pk/news/1421174
https://jang.com.pk/news/1421193
https://jang.com.pk/news/1421184
https://jang.com.pk/news/1421170
https://jang.com.pk/news/1421177
https://jang.com.pk/news/1421163
https://jang.com.pk/news/1421146
https://jang.com.pk/news/1421183
https://jang.com.pk/news/1421149
https://jang.com.pk/news/1421165
https://jang.com.pk/news/1421159
https://jang.com.pk/news/1421185
https://jang.com.pk/news/1421187
https://jang.com.pk/news/1421179
https://jang.com.pk/news/1421174
https://jang.com.pk/news/1421171
https://jang.com.pk/news/1421158
https://jang.com.pk/news/1421155
https://jang.com.pk/news/1421151
https://jang.com.pk/news/1421148
https://jang.com.pk/news/1421142
https://jang.com.pk/news/1421182
https://jang.com.pk/new

In [13]:

os.makedirs('jang_articles', exist_ok=True)

In [14]:
for idx, a_tag in enumerate(a_tags, start=1):
    try:
        print(f"Processing {a_tag}")
        response = requests.get(a_tag)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all p tags within the article
        p_tags = soup.find_all('p')

        if p_tags:
            # Extract text from all p tags
            all_text = [p.get_text(strip=True) for p in p_tags]

            # Create a dictionary with paragraph numbers as keys
            data = {f"p{i + 1}": paragraph for i, paragraph in enumerate(all_text)}
            json_data = json.dumps(data, indent=4, ensure_ascii=False)
            with open(f'jang_articles/article_{idx}.json', 'w', encoding='utf-8') as f:
                f.write(json_data)

            print(f'Data has been saved to jang_articles/article_{idx}.json')
        else:
            print(f"No p tags found for {a_tag}")
    except Exception as e:
        print(f"Error processing {a_tag}: {e}")

Processing https://jang.com.pk/news/1421166
Data has been saved to jang_articles/article_1.json
Processing https://jang.com.pk/news/1421175
Data has been saved to jang_articles/article_2.json
Processing https://jang.com.pk/news/1421176
Data has been saved to jang_articles/article_3.json
Processing https://jang.com.pk/news/1421181
Data has been saved to jang_articles/article_4.json
Processing https://jang.com.pk/news/1421180
Data has been saved to jang_articles/article_5.json
Processing https://jang.com.pk/news/1421164
Data has been saved to jang_articles/article_6.json
Processing https://jang.com.pk/news/1421167
Data has been saved to jang_articles/article_7.json
Processing https://jang.com.pk/news/1421174
Data has been saved to jang_articles/article_8.json
Processing https://jang.com.pk/news/1421193
Data has been saved to jang_articles/article_9.json
Processing https://jang.com.pk/news/1421184
Data has been saved to jang_articles/article_10.json
Processing https://jang.com.pk/news/142

In [15]:
import re

def natural_sort_key(s):
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split('(\d+)', s)]

article_files = sorted([filename for filename in os.listdir('jang_articles') if filename.endswith('.json')],
                      key=natural_sort_key)

for filename in article_files:
    filepath = os.path.join('jang_articles', filename)
    with open(filepath, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
    print(f"Content of {filename}:")
    print(json.dumps(json_data, indent=4, ensure_ascii=False))
    print("-" * 20)

Content of article_1.json:
{
    "p1": "",
    "p2": "",
    "p3": "",
    "p4": "",
    "p5": "وزیر دفاع خواجہ آصف نے کہا ہے کہ  پی ٹی آئی کبھی سول نافرمانی اور کبھی مذاکرات کی بات کرتی ہے۔ جو ماحول پی ٹی آئی نے بنادیا ہے اس میں کوئی مذاکرات نتیجہ خیز نہیں ہوسکتے۔",
    "p6": "سیالکوٹ میں جیو نیوز سے خصوصی گفتگو میں خواجہ آصف نے کہا کہ  پی ٹی آئی سے مذاکرات شروع ہونے کی کوئی بات ان کے علم میں نہیں۔ مذاکرات کے بارے میں  جو بیانات آرہے ہیں وہ پی ٹی آئی کی جانب سے آرہے ہیں، حکومت کی جانب سے کوئی بیان نہیں آیا ہے۔",
    "p7": "خواجہ آصف کا کہنا تھا کہ بانی پی ٹی آئی کے جیل سے کچھ بیان آتے ہیں، بہنوں کے لاہور سے کچھ اور کےپی سے بشریٰ بی بی کے کچھ بیانات آتے ہیں، ان کے بیانات میں کوئی ہم آہنگی نہیں۔",
    "p8": "خواجہ آصف نے کہا کہ ان کی اطلاع ہے کہ فیض حمید کیخلاف فرد جرم ٹھوس شواہد پر مبنی ہے۔",
    "p9": "انہوں نے مزید کہا کہ سیاسی حالات معمول پر ہیں۔ مختلف پارٹیاں ہیں یہاں ایک دوسرے سے گلے شکوے ہوتے رہتے ہیں۔ پارٹیوں کے اپنے ٹارگٹ ہوتے ہیں جسے پانے کیلئے ایسے بیانات آتے رہتے ہیں۔",
   

In [None]:



# Import required librimport os
import json
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pyngrok import ngrok



# Write the Streamlit app to app.py
with open("app.py", "w") as f:
    f.write("""
import re
import os
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import streamlit as st

# Function to handle whitespace
WHITESPACE_HANDLER = lambda k: re.sub('\\s+', ' ', re.sub('\\n+', ' ', k.strip() if hasattr(k, 'strip') else str(k)))

# Model initialization
model_name = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Function to generate summary for a single text
def generate_summary(text):
    input_ids = tokenizer(
        [WHITESPACE_HANDLER(text)],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512
    )["input_ids"]

    output_ids = model.generate(
        input_ids=input_ids,
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4
    )[0]

    summary = tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

    return summary

# Function to split long text and summarize each part
def split_and_summarize(text, max_chunk_size=512):
    chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
    summaries = [generate_summary(chunk) for chunk in chunks]
    return ' '.join(summaries)

# Function to process a single article
def process_article(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    content_paragraphs = [value for key, value in json_data.items() if key != 'p1']
    content_text = ' '.join(content_paragraphs)

    # Check if content exceeds max length and split if necessary
    if len(tokenizer.encode(content_text)) > 512:
        summary = split_and_summarize(content_text)
    else:
        summary = generate_summary(content_text)

    return summary

# Streamlit app code
st.title("Jang News Summaries")
st.sidebar.title("Configuration")
article_dir = st.sidebar.text_input("Articles Directory", value="jang_articles")

# Ensure the directory exists
if os.path.exists(article_dir):
    # Natural sort of article files
    def natural_sort_key(s):
        return [int(text) if text.isdigit() else text.lower() for text in re.split('(\\d+)', s)]

    article_files = sorted(
        [filename for filename in os.listdir(article_dir) if filename.endswith('.json')],
        key=natural_sort_key
    )

    # Display summaries for each file
    for filename in article_files:
        filepath = os.path.join(article_dir, filename)
        try:
            with st.spinner(f"Processing {filename}..."):
                summary = process_article(filepath)
                st.subheader(filename)
                st.write(summary)
                st.write("---")  # Separator
        except Exception as e:
            st.error(f"Error processing {filename}: {e}")
else:
    st.error(f"Directory '{article_dir}' does not exist. Please check the path.")
""")

# Run the Streamlit app
!ngrok authtoken <YOUR AUTHTOKEN>
# Set up ngrok for exposing the Streamlit app
ngrok.kill()  # Close any existing tunnels to avoid conflicts
public_url = ngrok.connect(addr="8501").public_url  # Specify the port with 'addr'
print(f"Streamlit app is running at: {public_url}")


# Run the Streamlit app
!streamlit run app.py --server.port 8501 --server.enableCORS false --server.headless true


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Streamlit app is running at: https://61c7-34-125-252-77.ngrok-free.app
2024-12-15 17:06:03.080 
As a result, 'server.enableCORS' is being overridden to 'true'.

More information:
In order to protect against CSRF attacks, we send a cookie with each request.
To do so, we must specify allowable origins, which places a restriction on
cross-origin resource sharing.

If cross origin resource sharing is required, please disable server.enableXsrfProtection.
            

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.252.77:8501[0m
[0m
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expec