In [2]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [3]:
df = pd.read_csv('../data/raw_analyst_ratings.csv')

In [4]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [5]:
sia = SentimentIntensityAnalyzer()


In [6]:
for headline in df['headline'].head(10):
    sentiment = sia.polarity_scores(headline)
    compound_score = sentiment['compound']
    
    if compound_score > 0:
        print(f"Positive: {headline}")
    elif compound_score < 0:
        print(f"Negative: {headline}")
    else:
        print(f"Neutral: {headline}")

Neutral: Stocks That Hit 52-Week Highs On Friday
Neutral: Stocks That Hit 52-Week Highs On Wednesday
Neutral: 71 Biggest Movers From Friday
Neutral: 46 Stocks Moving In Friday's Mid-Day Session
Positive: B of A Securities Maintains Neutral on Agilent Technologies, Raises Price Target to $88
Negative: CFRA Maintains Hold on Agilent Technologies, Lowers Price Target to $85
Neutral: UBS Maintains Neutral on Agilent Technologies, Raises Price Target to $87
Positive: Agilent Technologies shares are trading higher after the company reported better-than-expected Q2 EPS and sales results.
Negative: Wells Fargo Maintains Overweight on Agilent Technologies, Raises Price Target to $95
Neutral: 10 Biggest Price Target Changes For Friday


In [6]:
import spacy
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

In [18]:
nlp = spacy.load('en_core_web_sm')
tokenizer = English().tokenizer


In [19]:
def extract_keywords(text):
    doc = nlp(text)
    
    keywords = []
    
    for chunk in doc.noun_chunks:
        keywords.append(chunk.text)
    
    topic_phrases = ["FDA approval", "price target"]
    
    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp(phrase) for phrase in topic_phrases]
    matcher.add("TopicPhrases", None, *patterns)
    
    matches = matcher(doc)
    for match_id, start, end in matches:
        matched_span = doc[start:end]
        keywords.append(matched_span.text)
    
    return keywords

In [20]:
df['keywords'] = df["headline"].head().apply(extract_keywords)

Stocks That Hit 52-Week Highs On Friday
Stocks That Hit 52-Week Highs On Wednesday
71 Biggest Movers From Friday
46 Stocks Moving In Friday's Mid-Day Session
B of A Securities Maintains Neutral on Agilent Technologies, Raises Price Target to $88


In [17]:
print(df['keywords'])

0                      [Stocks, That, 52-Week Highs, Friday]
1                   [Stocks, That, 52-Week Highs, Wednesday]
2                                [71 Biggest Movers, Friday]
3                      [46 Stocks, Friday's Mid-Day Session]
4          [B, A Securities Maintains Neutral, Agilent Te...
                                 ...                        
1407323                                                  NaN
1407324                                                  NaN
1407325                                                  NaN
1407326                                                  NaN
1407327                                                  NaN
Name: keywords, Length: 1407328, dtype: object
