In [None]:
import pandas as pd
import spacy
import time
from textblob import TextBlob
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
#import yfinance as yf
import nltk
from nltk.corpus import stopwords

# Location of the raw analyst-ratings CSV.
# NOTE(review): this is an absolute, machine-specific path — confirm it exists
# on the machine running the notebook.
file_path = r"C:/Users/pc/Desktop/10 Academy/Week 1/sentiment-analysis/nootbooks/data/raw_analyst_ratings.csv"

# Read the CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows.
df.head(n=5)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Column dtypes and non-null counts (`info` prints its own report).
df.info()

# A notebook cell renders only its final expression, so the distinct-headline
# count is printed explicitly instead of being silently discarded.
print("Unique headlines:", df['headline'].nunique())

# Most frequently repeated headlines (rendered as the cell output).
df['headline'].value_counts().head()

In [None]:
#applying sentiment to the headline
def calculate_sentiment(text):
    """Return the TextBlob polarity score for a piece of text.

    Parameters
    ----------
    text : str
        Text to score (a headline in this notebook).

    Returns
    -------
    float
        ``TextBlob(text).sentiment.polarity`` when ``text`` is a string.
        ``nan`` when ``text`` is not a string (for example a missing value),
        so that missing headlines are not reported as neutral (0.0) and the
        ``apply`` over the column does not abort with a ``TypeError``.
    """
    # TextBlob only accepts strings; missing cells arrive as None / float NaN.
    if not isinstance(text, str):
        return float("nan")
    return TextBlob(text).sentiment.polarity

# Score every headline and store the result in a new `sentiment` column.
df['sentiment'] = df['headline'].map(calculate_sentiment)




In [None]:
# Preview the first five rows of the working frame.
df.head(n=5)

In [None]:
# Headlines side by side with their polarity scores.
df.loc[:, ['headline', 'sentiment']]

In [None]:
# Summary statistics of the sentiment scores.
df.loc[:, 'sentiment'].describe()

In [None]:
# Number of rows whose headline is missing.
df['headline'].isna().sum()

In [None]:
# Five rows with the highest sentiment score.
most_positive_headline = df.nlargest(n=5, columns='sentiment')

# Show only the headline text and its score.
most_positive_headline.loc[:, ['headline', 'sentiment']]



In [None]:
# Five rows with the lowest sentiment score.
most_negative_headline = df.nsmallest(n=5, columns='sentiment')

# Show only the headline text and its score.
most_negative_headline.loc[:, ['headline', 'sentiment']]

In [None]:
# Rows whose sentiment score is exactly zero.
neutral_headline = df.loc[df['sentiment'].eq(0.0)]

# Show only the headline text and its score.
neutral_headline.loc[:, ['headline', 'sentiment']]


Task 1: Descriptive statistics (headline length, articles per publisher, articles by date)

In [None]:
# textual lengths (like headline length).
# `.str.len()` is vectorised and yields NaN for a missing headline, whereas
# `apply(len)` raises TypeError as soon as it meets a float NaN.
df['headline_length'] = df['headline'].str.len()

# Only the last expression of a cell is rendered, so intermediate summaries
# are printed explicitly instead of being discarded.
print(df['headline_length'].describe())

# count Article per publisher

publisher_counts = df['publisher'].value_counts()
print(publisher_counts.head())

# converting to date time

#df['date'] = pd.to_datetime(df['date'], format='mixed', errors='coerce')

# count articles by date
# NOTE(review): `date` is counted as stored, so rows are grouped by each
# distinct value of the column. If the column carries a time-of-day component
# this is a count per timestamp rather than per calendar day — confirm against
# the data before relying on the "Per Day" title.

#df['date_only'] = df['date'].dt.date
articles_per_day = df['date'].value_counts().sort_index()

articles_per_day.plot(title="Articles Published Per Day", figsize=(10, 5))



Text Analysis (Topic Modeling)

In [None]:
# Normalise the headline text in one pass:
#   1. replace every character that is not an ASCII letter with a space
#   2. convert the result to lower case
df['headline'] = (
    df['headline']
    .replace(to_replace="[^a-zA-Z]", value=" ", regex=True)
    .str.lower()
)


In [None]:
# Columns whose text should be lower-cased.
new_Index = ['headline']

# Lower-case every listed column, one column at a time.
df[new_Index] = df[new_Index].apply(lambda column: column.str.lower())

# Preview the lower-cased headlines.
df['headline'].head()


In [None]:
# Stop words come from the wordcloud package's built-in list.
from wordcloud import STOPWORDS

stop_words = set(STOPWORDS)

# One long string containing every headline, separated by single spaces.
text = df['headline'].astype(str).str.cat(sep=' ')

# Build the word cloud, ignoring the stop words.
wordcloud = WordCloud(background_color='white', stopwords=stop_words).generate(text)

# Render the cloud as an image without axis ticks or frame.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()




In [None]:


# # Download stopwords 
# nltk.download('stopwords')

# # Define stop words
# stop_words = set(stopwords.words('english'))

# # Join all headlines into one string
# text = ' '.join(df['headline'].astype(str))

# # Generate the word cloud
# wordcloud = WordCloud(stopwords=stop_words, background_color='white').generate(text)

# # Plot it
# plt.figure(figsize=(10, 6))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()


In [None]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load CSV
# This re-reads the raw file, so columns added and text cleaned in earlier
# cells are not present in `df` from this point on.
file_path = r"C:/Users/pc/Desktop/10 Academy/Week 1/sentiment-analysis/nootbooks/data/raw_analyst_ratings.csv"
df = pd.read_csv(file_path)

# Optional: drop rows with missing headlines
df = df.dropna(subset=['headline'])

# Start timer (perf_counter is monotonic, so the measured interval cannot be
# distorted by system clock adjustments)
start_time = time.perf_counter()

# Process the headlines with spaCy. `nlp.pipe` streams the texts through the
# pipeline in batches instead of invoking the full pipeline once per headline;
# it yields one Doc per input text, in input order.
processed_docs = list(nlp.pipe(df['headline'].astype(str)))

# End timer
end_time = time.perf_counter()

# Show elapsed time
elapsed_time = end_time - start_time
print(f"Processing time for {len(processed_docs)} headlines: {elapsed_time:.2f} seconds")


In [None]:

df['publisher'] = pd.to_datetime(df['publisher'], errors='coerce')




In [None]:
df['date_only'] = df['publisher'].dt.date
daily_counts = df.groupby('date_only').size()


In [None]:
# Line chart of the number of articles published on each day.
fig, ax = plt.subplots(figsize=(12, 6))
daily_counts.plot(ax=ax)
ax.set_title('Daily Article Publication Frequency')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Articles')
ax.grid(True)
plt.show()


In [None]:
# Parse the timestamps (unparseable values become NaT) and derive the hour.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['hour'] = df['date'].dt.hour

# Bar chart of article counts per hour value, in ascending hour order.
ax = df['hour'].value_counts().sort_index().plot.bar(figsize=(10, 5))
ax.set_title("Articles by Hour of the Day")
ax.set_xlabel("Hour")
ax.set_ylabel("Number of Articles")
ax.grid(True)
plt.show()


In [None]:
# Missing-value count of the `publisher` column, followed by its distinct values.
publisher_column = df['publisher']
print(publisher_column.isna().sum())
print(publisher_column.unique())


In [None]:
# Parse the timestamps once, leniently: unparseable values become NaT.
# (A preceding strict `pd.to_datetime(df['date'])` would raise on the very
# values this call is meant to coerce, so only the lenient call is kept.)
df['date'] = pd.to_datetime(df['date'], errors='coerce')
# Extract hour
df['hour'] = df['date'].dt.hour

# Count articles by hour
hourly_counts = df['hour'].value_counts().sort_index()
hourly_counts.plot(kind='bar', figsize=(10,5), color='skyblue')
plt.title("Articles by Hour of the Day")
plt.xlabel("Hour")
plt.ylabel("Number of Articles")
plt.show()


In [None]:

# Reload the CSV from disk; `df` is replaced, so columns derived in earlier
# cells are no longer present afterwards.
df = pd.read_csv(file_path, parse_dates=['date'])  # Only parse 'date' column as datetime

# Sanity checks on the `publisher` column, each reported once.
print(df['publisher'].head())          # Should show strings (publisher names)
print(df['publisher'].value_counts())  # Article count per publisher
print(df.columns)                      # All column names
print(df['publisher'].dtype)           # Column dtype
print(df['publisher'].nunique())       # Number of distinct publishers
print(df['publisher'].isnull().sum())  # Number of missing publisher values



In [None]:
# Ten publishers with the most articles (value_counts sorts descending).
top_publisher = df['publisher'].value_counts().iloc[:10]

# Bar chart of their article counts.
ax = top_publisher.plot.bar(figsize=(10, 5), color='skyblue')
ax.set_title("Top 10 Publishers")
ax.set_xlabel("Publisher")
ax.set_ylabel("Number of Articles")
ax.grid(True)
plt.show()


In [None]:

# Grouped bar chart: article counts per publisher, split by stock.
# Built with pandas/matplotlib only, because `sns` is never imported in this
# notebook and the former `sns.countplot` call raised NameError.
# The chart is limited to the most frequent publishers and stocks; drawing one
# bar per (publisher, stock) pair for every value would be unreadable.
TOP_PUBLISHERS = 10  # number of publishers shown on the x axis
TOP_STOCKS = 5       # number of stocks shown as separate bar colours

top_publisher_names = df['publisher'].value_counts().head(TOP_PUBLISHERS).index
top_stock_names = df['stock'].value_counts().head(TOP_STOCKS).index
in_selection = df['publisher'].isin(top_publisher_names) & df['stock'].isin(top_stock_names)

# Rows = publisher, columns = stock, cells = number of articles.
publisher_stock_counts = pd.crosstab(
    df.loc[in_selection, 'publisher'],
    df.loc[in_selection, 'stock'],
)

if publisher_stock_counts.empty:
    # Plotting an empty frame raises, so report the situation instead.
    print("No articles found for the selected publishers and stocks.")
else:
    ax = publisher_stock_counts.plot(kind='bar', figsize=(12, 6))
    ax.set_xlabel("Publisher")
    ax.set_ylabel("Number of Articles")
    plt.xticks(rotation=45)
    plt.title("Number of Articles by Publisher by Stock")
    plt.show()


In [None]:


# Capture the text following '@' in each publisher value; values without
# an '@' yield a missing domain.
df['domain'] = df['publisher'].str.extract(r'@([\w\.-]+)')

# Bar chart of the ten most frequent domains.
domain_counts = df['domain'].value_counts().head(10)
ax = domain_counts.plot.bar(figsize=(10, 5))
ax.set_title("Top Email Domains (Organizations)")
ax.set_xlabel("Domain")
ax.set_ylabel("Count")
ax.grid(True)
plt.show()


In [None]:

from keybert import KeyBERT

kw_model = KeyBERT()

# All non-missing headlines joined into a single document.
all_headlines = " ".join(df['headline'].dropna())

# Extract the 20 top-ranked one- and two-word key phrases from that document.
keywords = kw_model.extract_keywords(
    all_headlines,
    keyphrase_ngram_range=(1, 2),
    stop_words='english',
    top_n=20,
)

# Print each extracted entry on its own line.
for keyword in keywords:
    print(keyword)




In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the non-missing headlines.
vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.95)
X = vectorizer.fit_transform(df['headline'].dropna())

# Fit a 5-topic LDA model with a fixed seed.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)

# For every topic, print its ten highest-weighted vocabulary terms,
# strongest first.
words = vectorizer.get_feature_names_out()
for topic_idx, weights in enumerate(lda.components_):
    strongest_first = weights.argsort()[::-1][:10]
    print(f"Topic #{topic_idx + 1}:")
    print(" ".join(words[term_id] for term_id in strongest_first))
