# ANALYSIS


### Importing `os` and `sys`

In [1]:
import os, sys

# Working directory of the running kernel.
nbook_dir = os.getcwd()

# Make the parent of the working directory importable; the
# `from scripts.<module> import ...` statements further down rely on the
# `scripts` package being found there. The membership check keeps repeated
# runs of this cell from appending the same entry to sys.path again.
project_root = os.path.abspath(os.path.join(nbook_dir, '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

### Importing the project's helper functions from the `scripts` package

In [7]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


from scripts.data_loader import load_data
from scripts.data_loader import load_data_finance
from scripts.data_analysis import get_sentiment
from scripts.data_analysis import classify_sentiment
from scripts.data_analysis import calc_moving_avrg
from scripts.data_analysis import parse_mixed_dates
from scripts.data_visualization import plot_data
from scripts.data_visualization import plot_univariate
from scripts.data_visualization import plot_bivariate
from scripts.data_visualization import plot_top_publishers
from scripts.data_visualization import plot_top_keywords
from scripts.data_visualization import plot_publication_frequency
from scripts.data_visualization import plot_publishing_times




### Loading the data

In [None]:

# Input files, given relative to the kernel's working directory
PRICE_HISTORY_CSV = "../yfinance_data/AAPL_historical_data.csv"
ANALYST_RATINGS_CSV = "../raw_analyst_ratings.csv/raw_analyst_ratings.csv"

# Read both inputs through the project loaders: the AAPL price history first,
# then the analyst-ratings table used by the headline analysis
data_yfinance = load_data_finance(PRICE_HISTORY_CSV)
data = load_data(ANALYST_RATINGS_CSV)

# Quick schema check: list the columns of the analyst-ratings table
print(data.columns)


#### Sentiment Analysis on Headlines

In [None]:

# Score every headline once, store the scores, then derive the
# 'sentiment_category' column from those same scores
headline_scores = data['headline'].apply(get_sentiment)
data['sentiment'] = headline_scores
data['sentiment_category'] = headline_scores.apply(classify_sentiment)

# Mean sentiment score per stock symbol, as a two-column frame
# ('stock', 'average_sentiment')
stock_sentiment = (
    data.groupby('stock')['sentiment']
    .mean()
    .rename('average_sentiment')
    .reset_index()
)

# Show a sample of the per-headline results, then the per-stock averages
preview_columns = ['headline', 'stock', 'sentiment', 'sentiment_category']
print("\nSentiment Analysis Results:")
print(data[preview_columns].head())

print("\nAverage Sentiment per Stock:")
print(stock_sentiment)


In [None]:
# Tally how many articles each publisher contributed
publisher_counts = data['publisher'].value_counts()

# Heading and counts on separate lines
print("Number of Articles Per Publisher:", publisher_counts, sep="\n")

In [None]:


# Run the project's mixed-format date parser over every raw 'date' value
parsed_dates = data['date'].apply(parse_mixed_dates)

# Replace 'date' with the parsed values and append the calendar features
# derived from them (weekday name, month name, year)
data = data.assign(
    date=parsed_dates,
    day_of_week=parsed_dates.dt.day_name(),
    month=parsed_dates.dt.month_name(),
    year=parsed_dates.dt.year,
)

# Show the frame including the new columns
print(data)

#### Extract Common Keywords/Phrases

In [None]:

# Bag-of-words over the headlines: English stop words are removed and the
# vocabulary is capped at 20 terms (max_features=20).
vectorizer = CountVectorizer(stop_words='english', max_features=20)
word_matrix = vectorizer.fit_transform(data['headline'])

# Total occurrences of each term across all headlines. The column sums are
# taken on the sparse matrix itself, so no dense (n_headlines x 20) array is
# ever built just to be summed.
term_totals = np.asarray(word_matrix.sum(axis=0)).ravel()
word_frequencies = pd.Series(
    term_totals, index=vectorizer.get_feature_names_out()
).sort_values(ascending=False)

print("Top Keywords in Headlines:")
print(word_frequencies)

plot_top_keywords(word_frequencies)



####  Topic Modeling with LDA (Latent Dirichlet Allocation)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Turn the headlines into a TF-IDF matrix (English stop words removed,
# vocabulary capped at 1000 terms)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['headline'])
# NOTE(review): scikit-learn's LDA examples fit on raw term counts rather than
# TF-IDF weights -- confirm that TF-IDF input is intended here.

# Five-topic LDA with a fixed random_state
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(tfidf_matrix)

# For each topic, print its ten highest-weighted terms. argsort is ascending,
# so within each list the strongest term comes last.
words = tfidf_vectorizer.get_feature_names_out()
for topic_no, term_weights in enumerate(lda.components_, start=1):
    top_term_ids = term_weights.argsort()[-10:]
    print(f"Topic {topic_no}: ", [words[term_id] for term_id in top_term_ids])


### Publication Frequency Over Time
#### Group and Count Articles by Date

In [None]:
# Coerce 'date' to UTC datetimes (values that cannot be parsed become NaT),
# then discard the rows whose date could not be parsed
utc_dates = pd.to_datetime(data['date'], errors='coerce', utc=True)
data = data.assign(date=utc_dates).dropna(subset=['date'])

# Number of articles per calendar day, taken from the UTC timestamps
publication_day = data['date'].dt.date
daily_publications = data.groupby(publication_day).size()
plot_publication_frequency(daily_publications)

#### Identify Spikes in Publication Frequency

In [None]:
print("Summary Statistics for Daily Publications:")
print(daily_publications.describe())

# A "spike" is any day whose article count is strictly above the
# 95th percentile of the daily counts
threshold = daily_publications.quantile(0.95)
is_spike = daily_publications.gt(threshold)
spikes = daily_publications.loc[is_spike]
print("\nSpikes in Publication Frequency:", spikes, sep="\n")


#### Publishing Times

In [None]:
# Hour of day at which each article was published. The 'date' column was
# converted with utc=True in the publication-frequency cell, so these hours
# are UTC hours.
# `data` is the result of a dropna() at that point; writing a new column into
# such a frame with `data['hour'] = ...` can raise pandas'
# SettingWithCopyWarning, so the column is added with assign(), which returns
# a new frame instead.
data = data.assign(hour=data['date'].dt.hour)

# Article count per hour, ordered by hour rather than by count
hourly_publications = data['hour'].value_counts().sort_index()
plot_publishing_times(hourly_publications)

In [None]:
# Pass the closing prices to calc_moving_avrg with a window of 5 and store
# the result next to the raw prices as "Moving Average"
closing_prices = data_yfinance["Close"]
data_yfinance["Moving Average"] = calc_moving_avrg(closing_prices, window=5)

# Plot the price frame, which now includes the moving-average column
plot_data(data_yfinance)

In [None]:
# Univariate view: plot the 'Close' column of the price data on its own
plot_univariate(
    data_yfinance,
    'Close',
    title="Closing price distribution",
)

# Bivariate view: plot 'Close' together with 'Volume'
plot_bivariate(
    data_yfinance,
    'Close',
    'Volume',
    title="Closing price VS volume traded",
)