In [1]:
!pip install pandas spacy geopy transformers sumy rouge-score torch sentencepiece

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score

In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import pandas as pd
import spacy
from geopy.geocoders import Nominatim
from transformers import pipeline
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

In [6]:
# Change path to your dataset location
file_path = "/content/train.csv"

# Load dataset with error handling
df = pd.read_csv(file_path, encoding="latin1", on_bad_lines="skip", engine="python")

# Keep only useful columns
df = df[['article', 'highlights']]
df = df.dropna().reset_index(drop=True)

print("Dataset shape:", df.shape)
print(df.head(2))

Dataset shape: (287113, 2)
                                             article  \
0  By . Associated Press . PUBLISHED: . 14:11 EST...   
1  (CNN) -- Ralph Mata was an internal affairs li...   

                                          highlights  
0  Bishop John Folda, of North Dakota, is taking ...  
1  Criminal complaint: Cop used his role to help ...  


In [7]:
nlp = spacy.load("en_core_web_sm")

# Load abstractive summarizer (T5-small)
summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")

# Initialize geocoder
geolocator = Nominatim(user_agent="geo_summarizer")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


In [8]:
def textrank_summary(text, sentence_count=3):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer_textrank = TextRankSummarizer()
    summary = summarizer_textrank(parser.document, sentence_count)
    return " ".join([str(sentence) for sentence in summary])

# Abstractive (T5-small)
def t5_summary(text, max_len=100, min_len=30):
    summary = summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)
    return summary[0]['summary_text']

In [9]:
def extract_locations(text):
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
    return list(set(locations))

# Convert place → coordinates
def geocode_location(location):
    try:
        loc = geolocator.geocode(location)
        if loc:
            return (loc.latitude, loc.longitude)
    except:
        return None

In [10]:
def process_article(article, method="t5"):
    # Summarization
    if method == "textrank":
        summary = textrank_summary(article, sentence_count=3)
    else:
        summary = t5_summary(article)

    # Geo-tagging
    locations = extract_locations(article)
    coords = [(loc, geocode_location(loc)) for loc in locations if geocode_location(loc)]

    return summary, coords

In [11]:
results = []

# Test only on first 5 articles (for speed)
for i in range(5):
    article = df['article'][i]
    summary, geo_info = process_article(article, method="t5")  # change to "textrank" if needed

    for loc, coord in geo_info:
        results.append({
            "Article_ID": i,
            "Summary": summary,
            "Location": loc,
            "Latitude": coord[0],
            "Longitude": coord[1]
        })

output_df = pd.DataFrame(results)
output_df.to_csv("summaries_with_geotags.csv", index=False)
print(output_df.head())

Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Token indices sequence length is longer than the specified maximum sequence length for this model (548 > 512). Running this sequence through the model will result in indexing errors
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have bee

   Article_ID                                            Summary  \
0           0  bishop of the fargo Catholic Diocese in north ...   
1           0  bishop of the fargo Catholic Diocese in north ...   
2           0  bishop of the fargo Catholic Diocese in north ...   
3           0  bishop of the fargo Catholic Diocese in north ...   
4           1  a criminal complaint accuses the 45-year-old o...   

       Location   Latitude   Longitude  
0  North Dakota  47.620146 -100.540737  
1         Fargo  46.877229  -96.789821  
2   Grand Forks  47.925210  -97.030632  
3         Italy  42.638426   12.674297  
4          U.S.  39.783730 -100.445882  


In [12]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

ref = df['highlights'][0]   # reference summary
art = df['article'][0]
pred_summary, _ = process_article(art, method="t5")

score = scorer.score(ref, pred_summary)
print("ROUGE Score:", score)

Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


ROUGE Score: {'rouge1': Score(precision=0.275, recall=0.3235294117647059, fmeasure=0.2972972972972973), 'rougeL': Score(precision=0.175, recall=0.20588235294117646, fmeasure=0.18918918918918917)}
