### Filter the Crime related articles

In [45]:
import pandas as pd
from transformers import pipeline
from datasets import Dataset

# Step 1: Load Article Data
df = pd.read_excel("thehindu_article_details.xlsx")
df["Content"] = df["Content"].fillna("")
df["Title"] = df["Title"].fillna("")


# Step 2: Define Crime-Related Labels
crime_labels = ["crime", "fraud", "murder", "scam", "non-crime"]


# Step 3: Load Zero-Shot Classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


# Step 4: Classify Articles

results = []
for idx, row in df.iterrows():
    text = f"{row['Title']}. {row['Content']}"

    result = classifier(text, candidate_labels=crime_labels, multi_label=True)
    label_scores = dict(zip(result["labels"], result["scores"]))

    # Define threshold for classifying as crime-related
    crime_score = max(label_scores.get(lbl, 0) for lbl in ["crime", "fraud", "murder", "scam"])
    is_crime_related = crime_score >= 0.6

    results.append({
        "Title": row["Title"],
        "Content": row["Content"],
        "URL": row.get("URL", ""),
        "Publication": row.get("Publication", ""),
        "is_crime_related": is_crime_related,
        "Top_Label": result["labels"][0],
        "Top_Score": round(result["scores"][0], 3)
    })


# Step 5: Create Final DataFrame

classified_df = pd.DataFrame(results)

# Filter only crime-related
crime_df = classified_df[classified_df["is_crime_related"] == True]

# Save results
classified_df.to_excel("all_classified_articles_zero_shot.xlsx", index=False)
crime_df.to_excel("filtered_crime_articles_zero_shot.xlsx", index=False)

print(f"Total crime-related articles: {len(crime_df)}")
print(f"Saved full classification to all_classified_articles_zero_shot.xlsx")


Device set to use cuda:0


Total crime-related articles: 45
Saved full classification to all_classified_articles_zero_shot.xlsx


In [265]:
crime_df.shape, classified_df.shape

((39, 7), (126, 7))

### NER and Sentiment Analysis

In [8]:
!pip install spacy transformers nltk
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m112.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [46]:
from collections import Counter
def aggregate_sentiment(group):
    label_counts = Counter(group["sentiment_label"])

    # Choose the most common label
    final_label = label_counts.most_common(1)[0][0]

    # Optionally average the sentiment scores
    avg_score = round(group["sentiment_score"].mean(), 3)

    return pd.Series({
        "age": group["age"].iloc[0],
        "article_url": group["article_url"].iloc[0],
        "article_title": group["article_title"].iloc[0],
        "article_date": group["article_date"].iloc[0],
        "final_sentiment": final_label,
        "avg_sentiment_score": avg_score,
        "num_mentions": len(group)
    })

In [47]:
import pandas as pd
import re
import spacy
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

nltk.download('punkt_tab')


# Load spaCy and sentiment model

nlp = spacy.load("en_core_web_sm")

model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_model = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, return_all_scores=True)


#Load articles

df = pd.read_excel("/content/filtered_crime_articles_zero_shot.xlsx").fillna("")

records = []

for _, row in df.iterrows():
    title = row["Title"]
    content = row["Content"]
    url = row["URL"]
    date = row["Publication"]

    full_text = f"{title}. {content}"
    sentences = sent_tokenize(full_text)
    doc = nlp(full_text)

    persons = set(ent.text for ent in doc.ents if ent.label_ == "PERSON")

    for person in persons:
        related_sents = [s for s in sentences if person in s]

        # Sentiment per person
        sentiments = []
        for sent in related_sents:
            try:
                result = sentiment_model(sent)[0]
                scores = {res["label"]: res["score"] for res in result}

                label_map = {
                    "LABEL_0": "NEGATIVE",
                    "LABEL_1": "NEUTRAL",
                    "LABEL_2": "POSITIVE"
                }
                pred_label = max(scores, key=scores.get)
                sentiment_label = label_map[pred_label]
                sentiment_score = round(scores[pred_label], 3)

                # Age extraction
                age_match = re.search(r"(\d{1,2})[- ]?year[- ]?old", sent)
                age = age_match.group(1) if age_match else None

                records.append({
                    "name": person,
                    "age": age,
                    "article_url": url,
                    "article_title": title,
                    "article_date": date,
                    "matched_sentence": sent,
                    "sentiment_label": sentiment_label,
                    "sentiment_score": sentiment_score
                })
            except Exception:
                continue


#Save results
all_profiles = pd.DataFrame(records)

# Group by person + article
all_profiles= all_profiles.groupby(["name", "article_url"]).apply(aggregate_sentiment)

# Filter by label if needed
negative_profiles = all_profiles[all_profiles["final_sentiment"] == "NEGATIVE"]

# Save to Excel
all_profiles.to_excel("all_profiles_sentiment_output.xlsx", index=False)
negative_profiles.to_excel("negative_profiles_sentiment_output.xlsx", index=False)

print(f"Saved {len(all_profiles)} total profiles.")
print(f"Negative profiles: {len(negative_profiles)}")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cuda:0


Saved 301 total profiles.
Negative profiles: 62


  all_profiles= all_profiles.groupby(["name", "article_url"]).apply(aggregate_sentiment)


In [32]:
all_profiles.shape, negative_profiles.shape

((301, 7), (62, 7))

In [42]:
negative_profiles.sample(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,age,article_url,article_title,article_date,final_sentiment,avg_sentiment_score,num_mentions
name,article_url,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C. Ganesan,https://www.thehindu.com/news/cities/Coimbatore/coimbatore-resident-gets-7-years-ri-for-attempting-to-murder-brother-in-2017/article69755312.ece,,https://www.thehindu.com/news/cities/Coimbator...,Coimbatore resident gets 7 years RI for attemp...,2025-06-30T18:11:44+05:30,NEGATIVE,0.632,1
Kanjayya,https://www.thehindu.com/news/national/andhra-pradesh/man-kills-wifes-parents-in-duttalur-of-aps-nellore-district/article69754425.ece,,https://www.thehindu.com/news/national/andhra-...,Man kills wife’s parents in Duttalur of A.P.’s...,2025-06-30T13:33:47+05:30,NEGATIVE,0.837,1


In [48]:
all_profiles.sample(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,age,article_url,article_title,article_date,final_sentiment,avg_sentiment_score,num_mentions
name,article_url,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Jaiswal,https://www.thehindu.com/news/national/bihar/bihar-bjp-attempts-to-break-rjds-muslim-votes-through-pasmanda/article69755595.ece,,https://www.thehindu.com/news/national/bihar/b...,Bihar BJP attempts to break RJD’s Muslim votes...,2025-07-01T01:20:00+05:30,NEUTRAL,0.773,7
Gopal,https://www.thehindu.com/news/national/himachal-pradesh/shimla-rain-five-storey-building-collapses-in-shimla-landslips-block-highways-at-multiple-places/article69755432.ece,,https://www.thehindu.com/news/national/himacha...,Shimla rain: Five-storey building collapses in...,2025-06-30T18:11:25+05:30,NEUTRAL,0.932,1


### Ouput File contains the Negative Profiles

In [43]:
negative_profiles.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,age,article_url,article_title,article_date,final_sentiment,avg_sentiment_score,num_mentions
name,article_url,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Majhi,https://www.thehindu.com/news/national/odisha/senior-odisha-officer-dragged-from-office-brutally-attacked-in-bhubaneswar/article69756527.ece,,https://www.thehindu.com/news/national/odisha/...,"Senior officer dragged from office, beaten up ...",2025-06-30T23:32:01+05:30,NEGATIVE,0.686,2
Ranjana Verma,https://www.thehindu.com/news/national/himachal-pradesh/shimla-rain-five-storey-building-collapses-in-shimla-landslips-block-highways-at-multiple-places/article69755432.ece,,https://www.thehindu.com/news/national/himacha...,Shimla rain: Five-storey building collapses in...,2025-06-30T18:11:25+05:30,NEGATIVE,0.755,1
Haritha Karma Sena,https://www.thehindu.com/news/national/kerala/ldf-reiterates-demand-for-resignation-of-bjp-councillors-over-corruption-allegations/article69756116.ece,,https://www.thehindu.com/news/national/kerala/...,LDF reiterates demand for resignation of BJP c...,2025-06-30T20:55:31+05:30,NEGATIVE,0.639,1
Muthukumar,https://www.thehindu.com/news/cities/Coimbatore/coimbatore-resident-gets-7-years-ri-for-attempting-to-murder-brother-in-2017/article69755312.ece,,https://www.thehindu.com/news/cities/Coimbator...,Coimbatore resident gets 7 years RI for attemp...,2025-06-30T18:11:44+05:30,NEGATIVE,0.706,6
Subathu,https://www.thehindu.com/news/national/himachal-pradesh/shimla-rain-five-storey-building-collapses-in-shimla-landslips-block-highways-at-multiple-places/article69755432.ece,,https://www.thehindu.com/news/national/himacha...,Shimla rain: Five-storey building collapses in...,2025-06-30T18:11:25+05:30,NEGATIVE,0.582,1


In [49]:
import pandas as pd
import re
import spacy
import nltk
from transformers import pipeline

# Print versions
print("pandas:", pd.__version__)
print("spacy:", spacy.__version__)
print("nltk:", nltk.__version__)

# For Hugging Face Transformers
import transformers
print("transformers:", transformers.__version__)

# Check torch version as it's a backend for transformers
import torch
print("torch:", torch.__version__)


pandas: 2.2.2
spacy: 3.8.7
nltk: 3.9.1
transformers: 4.53.0
torch: 2.6.0+cu124
