# Install required packages

In [1]:
!pip install pandas nltk textblob python-docx
!python -m textblob.download_corpora

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzip

In [2]:
import re
from collections import defaultdict

import nltk
import pandas as pd
from docx import Document
from google.colab import files
from textblob import TextBlob

#  Download NLTK resources

In [3]:
# Fetch the NLTK 'punkt' tokenizer data and the 'averaged_perceptron_tagger' tagger data.
# Both calls are kept as separate statements on purpose: the value of the final call
# is what the cell echoes as its output.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [4]:
# Location of the uploaded article-level dataset inside the Colab filesystem
CSV_PATH = "/content/Companies_Sentiment_Analysis_7_Days_Track.csv"

# One row per news article; the cells below read the 'Ticker' and 'Summary' columns
df = pd.read_csv(CSV_PATH)

# Quick visual sanity check of the first few rows
df.head()

Unnamed: 0,Ticker,Title,Summary,URL,Datetime,News Age (Days),News Time,Current Price,1 Hour Ago,4 Hours Ago,Previous Day Close,7 Days Ago,1 Hour Ago % Change,4 Hours Ago % Change,Previous Day Close % Change,7 Days Ago % Change,Headline Sentiment,Summary Sentiment
0,CSCO,SES Space & Defense Awarded Sustainment Tactic...,"RESTON, Va., August 04, 2025--SES’s wholly-own...",https://finance.yahoo.com/news/ses-space-defen...,2025-08-04 09:21:00-04:00,0,2025-08-04 09:21:00-04:00,68.18,68.18,68.18,,67.7,0.0,0.0,,0.71,0.0,0.87
1,CSCO,AppOmni Innovations Combat Rising Security Ris...,"LAS VEGAS, August 04, 2025--From Black Hat, Sa...",https://finance.yahoo.com/news/appomni-innovat...,2025-08-04 08:59:00-04:00,0,2025-08-04 08:59:00-04:00,68.18,68.18,68.18,,67.7,0.0,0.0,,0.71,0.0,0.69
2,CSCO,Cisco Systems Stock: Is Wall Street Bullish or...,Even though Cisco Systems has outpaced the bro...,https://www.barchart.com/story/news/33856248/c...,2025-08-04 08:16:14-04:00,0,2025-08-04 08:16:14-04:00,68.18,68.18,68.18,,67.7,0.0,0.0,,0.71,0.0,0.54
3,NET,Three Elite Growth Companies With Significant ...,As the U.S. stock market grapples with recent ...,https://finance.yahoo.com/news/three-elite-gro...,2025-08-04 07:35:31-04:00,0,2025-08-04 07:35:31-04:00,206.89,206.89,206.89,,199.74,0.0,0.0,,3.58,0.38,0.93
4,NET,Cloudflare (NET) Reports Increased Revenue Ami...,Cloudflare (NET) recently announced significan...,https://finance.yahoo.com/news/cloudflare-net-...,2025-08-01 14:06:20-04:00,2,2025-08-01 14:06:20-04:00,199.16,201.7,205.23,207.71,199.06,-1.26,-2.96,-4.12,0.05,0.12,0.75


**Keyword-Based Company Sentiment Classifier & Progress Summary Generator**

This script processes a stock market sentiment analysis dataset (df) and:

	1.	Defines a rich taxonomy of financial and business-related keyword categories (28 categories such as earnings, merger, FDA approval, data privacy, labor, etc.).

	2.	Analyzes each company’s news summary using TextBlob to:
	•	Assign positive or negative sentiment based on polarity.
	•	Match text against predefined keyword categories using a case-insensitive keyword lookup (literal matching, not fuzzy matching).

	3.	Groups matched categories per ticker based on sentiment (positive or negative).

	4.	Removes duplicates and sorts categories for clarity.

	5.	Generates a clean, human-readable Word document summarizing each company’s:
	•	✅ Positive keywords (e.g., “earnings”, “product launch”)
	•	❌ Negative keywords (e.g., “lawsuit”, “bankruptcy”)


📁 Output:

A Microsoft Word (.docx) file titled Company Progress Keyword Summary where:
	•	Each company ticker gets its own section
	•	Sentiment-tagged keyword categories are listed under:
	•	✅ Positive Keywords
	•	❌ Negative Keywords

In [5]:
# Known financial keyword categories.
# Maps a category label (the text that ends up in the report) to the list of trigger
# keywords/phrases for that category. detect_keywords() reports a category as soon as
# any one of its keywords is found in an article summary, ignoring case.
# Notes:
#   - A keyword may appear under more than one category (e.g. "blockchain" is listed
#     under both "technology" and "crypto"), so one article can yield several categories.
#   - Dict order is the order in which categories are checked and returned.
KEYWORD_CATEGORIES = {
    "earnings": [
        "earnings", "profit", "profits", "revenue", "sales", "income",
        "loss", "losses", "guidance", "forecast", "EPS", "net income", "beat estimates",
        "missed estimates", "record profits", "quarterly results", "financial results", "operating margin"
    ],
    "merger": [
        "merger", "merging", "combine", "merge talks", "consolidation",
        "merger agreement", "merger approval", "merger blocked"
    ],
    "acquisition": [
        "acquisition", "acquire", "acquired", "buyout", "purchase", "takeover",
        "deal signed", "deal announced", "investment", "stake purchase", "ownership increase"
    ],
    "stimulus": [
        "stimulus", "government support", "funding", "bailout", "subsidy",
        "relief package", "stimulus package", "federal support", "tax credit"
    ],
    "FDA approval": [
        "FDA", "approval", "approved", "clinical", "clinical trial", "drug trial",
        "vaccine", "authorized", "emergency approval", "phase 3 trial", "trial success"
    ],
    "partnership": [
        "partnership", "collaboration", "joint venture", "strategic alliance",
        "deal partnership", "cooperation agreement", "multi-year contract"
    ],
    "product launch": [
        "launch", "introduced", "released", "new product", "rollout", "unveiled",
        "prototype", "product update", "new version", "flagship product"
    ],
    "dividend": [
        "dividend", "payout", "shareholder return", "buyback",
        "dividend increase", "dividend cut", "special dividend", "stock repurchase"
    ],
    "lawsuit": [
        "lawsuit", "legal case", "settlement", "court", "regulatory action",
        "antitrust", "fined", "penalty", "investigation", "regulator probe"
    ],
    "price target": [
        "price target", "upgrade", "downgrade", "analyst rating", "valuation",
        "buy rating", "sell rating", "neutral rating", "target raised", "target cut"
    ],
    "supply chain": [
        "supply chain", "logistics", "shipment", "delay", "inventory",
        "supply shortage", "backlog", "transportation issue", "production halt"
    ],
    "expansion": [
        "expansion", "growth plan", "new facility", "factory", "headquarters",
        "hiring", "job cuts", "layoffs", "new office", "opening new plant", "capacity increase"
    ],
    "market performance": [
        "stock surge", "stock drop", "market share", "benchmark", "index",
        "outperform", "underperform", "record high", "record low", "volatility", "sell-off", "rally"
    ],
    "technology": [
        "AI", "machine learning", "cloud", "software", "chip", "semiconductor",
        "innovation", "R&D", "5G", "blockchain", "cybersecurity", "patent approval"
    ],
    "economic outlook": [
        "inflation", "interest rates", "GDP", "unemployment", "recession",
        "economic slowdown", "rate hike", "rate cut", "consumer confidence"
    ],
    "environment": [
        "sustainability", "carbon", "green energy", "renewable", "ESG",
        "climate change", "solar project", "wind energy", "emission reduction"
    ],
    "crypto": [
        "bitcoin", "crypto", "cryptocurrency", "token", "blockchain",
        "NFT", "digital asset", "crypto regulation"
    ],
    "bankruptcy": [
        "bankruptcy", "insolvency", "debt restructuring", "chapter 11", "liquidation", "creditors"
    ],

    "leadership": [
        "CEO", "CFO", "executive", "leadership change", "resignation",
        "appointment", "management shuffle", "board member", "founder", "chairman"
    ],
    "data privacy": [
        "data breach", "cyber attack", "privacy", "security breach",
        "data leak", "ransomware", "hacked", "customer data", "information security"
    ],
    "regulatory": [
        "SEC", "compliance", "regulation", "regulatory", "approval process",
        "legal dispute", "financial oversight", "government policy", "sanction", "license"
    ],
    "labor": [
        "strike", "union", "labor dispute", "employee protest", "labor agreement",
        "collective bargaining", "walkout", "contract negotiation", "labor shortage"
    ],
    "customer activity": [
        "user growth", "subscription", "user base", "customer retention",
        "churn rate", "client expansion", "client loss", "subscriber count", "MAU", "DAU"
    ],
    "pricing": [
        "price hike", "pricing strategy", "discount", "rebate", "cost increase",
        "price cut", "premium pricing", "value pricing", "dynamic pricing"
    ],
    "competition": [
        "competitive pressure", "market rival", "challenger", "competing product",
        "market entry", "pricing war", "competition", "industry peer", "disruptor"
    ],
    "branding": [
        "rebrand", "branding strategy", "brand value", "brand loyalty",
        "brand recognition", "marketing campaign", "new logo", "brand refresh"
    ],
    "supply agreement": [
        "supply agreement", "long-term supply", "procurement deal",
        "supply contract", "vendor agreement", "sourcing deal"
    ],
    "legal win/loss": [
        "court victory", "lawsuit win", "lawsuit loss", "ruled in favor",
        "legal defeat", "legal win", "injunction", "verdict"
    ]
}

# Print how many keyword categories are defined, followed by their names.
# An explicit print() is needed: Jupyter only echoes a bare expression when it is the
# last statement of a cell, and this one sits in the middle of the cell.
print(len(KEYWORD_CATEGORIES), list(KEYWORD_CATEGORIES.keys()))

# Function to detect keyword categories in a piece of text
def detect_keywords(text, categories=None):
    """Return the keyword categories whose keywords occur in ``text``.

    Matching is case-insensitive and anchored to the start of a word, so a keyword
    never fires from the middle of an unrelated word ("rally" no longer matches
    "generally", "fined" no longer matches "defined"). Ordinary keywords still match
    as a word prefix, so "launch" finds "launches" and "launched". Keywords written
    in all capitals (acronyms such as "AI", "SEC", "EPS") must match as a complete
    token, so "AI" does not match "aim" and "SEC" does not match "security".

    Args:
        text: The text to scan. ``None``, non-string values and empty strings are
            treated as containing no keywords.
        categories: Optional mapping of category name -> list of keywords.
            Defaults to the module-level ``KEYWORD_CATEGORIES``.

    Returns:
        A list of category names in the iteration order of ``categories``, each
        listed at most once, or ``['Other']`` when nothing matched.
    """
    if categories is None:
        categories = KEYWORD_CATEGORIES

    # Nothing to scan: behave as if no keyword was found instead of raising
    if not isinstance(text, str) or not text:
        return ['Other']

    haystack = text.lower()  # lower-case once instead of once per keyword
    detected = []
    # Loop through every category and its keywords
    for category, words in categories.items():
        for word in words:
            # The keyword must begin where a word begins...
            pattern = r'(?<!\w)' + re.escape(word.lower())
            if word.isupper():
                # ...and short acronyms must also end where a word ends
                pattern += r'(?!\w)'
            if re.search(pattern, haystack):
                detected.append(category)
                break  # Only need to detect 1 keyword per category
    # If no keyword found, default to 'Other'
    return detected or ['Other']

# Keyword categories grouped by company and by article sentiment (positive or negative).
# Sets are used while collecting so that duplicates never accumulate; each set is turned
# into a sorted list at the end, which is the shape the report code below consumes.
company_keywords = defaultdict(lambda: {'positive': set(), 'negative': set()})

# Walk the two columns we need directly (each pair = 1 article)
for ticker, summary in zip(df['Ticker'], df['Summary']):
    # An article without a ticker cannot be attributed to any company
    if pd.isna(ticker):
        continue

    # Touch the entry first so the company still gets a section in the report
    # even if none of its articles has a usable summary
    by_sentiment = company_keywords[ticker]

    # A missing summary must be skipped: str(NaN) would become the text "nan",
    # which scores as neutral and would be recorded as a positive "Other" hit
    if pd.isna(summary):
        continue
    summary = str(summary)  # Make sure it's a string, just in case
    if not summary.strip():
        continue

    # Run sentiment analysis using TextBlob (polarity ranges -1 to 1).
    # Note that a neutral polarity of exactly 0 is counted as positive.
    polarity = TextBlob(summary).sentiment.polarity
    sentiment = 'positive' if polarity >= 0 else 'negative'

    # Record the categories found in this summary under the article's sentiment
    by_sentiment[sentiment].update(detect_keywords(summary))

# Finalise: replace each set by an alphabetically sorted list
for by_sentiment in company_keywords.values():
    for sentiment in ('positive', 'negative'):
        by_sentiment[sentiment] = sorted(by_sentiment[sentiment])

# Assemble the Word report: a main title followed by one section per company
doc = Document()
doc.add_heading('Company Progress Keyword Summary', 0)

# Tickers are unique dictionary keys, so walking the sorted keys visits the
# companies in the same alphabetical order as sorting the (ticker, data) pairs
for ticker in sorted(company_keywords):
    sentiment_data = company_keywords[ticker]
    doc.add_heading(ticker, level=1)  # one level-1 heading per company

    # Comma-separated category lists, with a placeholder when a list is empty
    pos_keywords = ', '.join(sentiment_data['positive'])
    if not pos_keywords:
        pos_keywords = "No data"
    neg_keywords = ', '.join(sentiment_data['negative'])
    if not neg_keywords:
        neg_keywords = "No data"

    doc.add_paragraph(f"✅ Positive Keywords: {pos_keywords}")
    doc.add_paragraph(f"❌ Negative Keywords: {neg_keywords}")
    doc.add_paragraph("")  # blank paragraph as a visual separator between companies

# The document only exists in memory here; persist it with doc.save("YourFilename.docx")

#  Save and download the DOCX

In [6]:
# Write the assembled report to a .docx file in the current working directory,
# then pass that file to Colab's download helper.
output_file = "Company_Progress_Summary.docx"
doc.save(output_file)
# `files` is imported from google.colab, so this step only works when run inside Colab
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>