In [None]:


# 1. Install necessary libraries
# Run these commands in your Colab notebook to ensure all dependencies are met.
!pip install newsapi-python yfinance scikit-learn pandas numpy matplotlib seaborn

# 2. Import Libraries
import os
import re
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yfinance as yf
from newsapi import NewsApiClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Configuration ---
# SECURITY: never commit a real NewsAPI key to source control or a shared
# notebook. The key is read from the NEWS_API_KEY environment variable; the
# 'YOUR_NEWSAPI_KEY' placeholder is only a fallback and will be rejected by
# the API, so either export NEWS_API_KEY or replace the placeholder locally.
# You can get a key from https://newsapi.org/
# NOTE(review): a literal key used to be hard-coded on this line; if it was a
# real key it should be revoked and regenerated.
NEWS_API_KEY = os.environ.get('NEWS_API_KEY', 'YOUR_NEWSAPI_KEY')
newsapi = NewsApiClient(api_key=NEWS_API_KEY)

# Define the stock ticker and date range for analysis
STOCK_TICKER = 'AAPL' # Example: Apple Inc.
# Read the clock once so START_DATE and END_DATE are always exactly 90 days
# apart, even if the cell runs across midnight.
_analysis_end = datetime.now()
END_DATE = _analysis_end.strftime('%Y-%m-%d')
START_DATE = (_analysis_end - timedelta(days=90)).strftime('%Y-%m-%d') # Last 90 days

print(f"Analyzing stock: {STOCK_TICKER} from {START_DATE} to {END_DATE}")

# --- 3. Data Collection ---

def get_news_headlines(query, from_date, to_date, language='en', page_size=100):
    """Fetch news articles matching *query* as a DataFrame.

    Issues a single request to the ``everything`` endpoint through the
    module-level ``newsapi`` client, so at most ``page_size`` articles are
    returned for the whole date range. For long ranges it is better to call
    this once per sub-range, because one request only returns the first page
    of results.

    Args:
        query: Search term passed to the API as ``q``.
        from_date: Start of the search window (``'YYYY-MM-DD'`` string).
        to_date: End of the search window (``'YYYY-MM-DD'`` string).
        language: Language code passed through to the API.
        page_size: Number of articles requested in the single page.

    Returns:
        A DataFrame with the columns ``publishedAt``, ``title`` and
        ``description``, one row per article. When nothing is found or the
        request fails, the frame has the same columns and zero rows, so
        callers can rely on the schema as well as on ``.empty``.
    """
    columns = ['publishedAt', 'title', 'description']
    rows = []
    try:
        response = newsapi.get_everything(q=query,
                                          from_param=from_date,
                                          to=to_date,
                                          language=language,
                                          sort_by='relevancy', # or 'publishedAt'
                                          page_size=page_size)
        # `or []` also covers a response whose 'articles' value is null.
        rows = [
            {column: article.get(column) for column in columns}
            for article in response.get('articles') or []
        ]
        print(f"Fetched {len(rows)} news articles for {query}.")
    except Exception as e:
        # Deliberate best-effort: report the problem and fall through to an
        # empty result so the rest of the notebook can skip news-based steps.
        print(f"Error fetching news for {query}: {e}")
    return pd.DataFrame(rows, columns=columns)

def get_stock_data(ticker, start_date, end_date):
    """Download daily price data for *ticker* via ``yfinance``.

    Args:
        ticker: Ticker symbol passed to ``yf.download``.
        start_date: Start of the range (``'YYYY-MM-DD'`` string).
        end_date: End of the range (``'YYYY-MM-DD'`` string).

    Returns:
        The DataFrame returned by ``yf.download``, with plain single-level
        column names (``Open``, ``Close``, ...) for a single ticker. An empty
        DataFrame is returned if the download raises.
    """
    try:
        stock_data = yf.download(ticker, start=start_date, end=end_date)
        # Newer yfinance releases can return two-level (Price, Ticker) columns
        # even for one ticker, which breaks plain lookups such as
        # stock_data['Close']. Drop the ticker level only when it carries a
        # single value, so a genuine multi-ticker download is left untouched.
        columns = stock_data.columns
        if (isinstance(columns, pd.MultiIndex)
                and columns.nlevels == 2
                and columns.get_level_values(1).nunique() == 1):
            stock_data.columns = columns.droplevel(1)
        print(f"Fetched stock data for {ticker} from {start_date} to {end_date}.")
        return stock_data
    except Exception as e:
        # Best-effort: callers check `.empty` and skip stock-based steps.
        print(f"Error fetching stock data for {ticker}: {e}")
        return pd.DataFrame()

# Fetch data
news_df = get_news_headlines(STOCK_TICKER, START_DATE, END_DATE)
stock_df = get_stock_data(STOCK_TICKER, START_DATE, END_DATE)

# --- 4. Data Preprocessing and Feature Engineering ---

# Process News Data
if not news_df.empty:
    # Reduce the publication timestamp to a calendar date so headlines can be
    # grouped per day and joined against the daily stock rows.
    news_df['publishedAt'] = pd.to_datetime(news_df['publishedAt']).dt.date
    # Combine title and description for sentiment analysis
    news_df['text'] = news_df['title'].fillna('') + ' ' + news_df['description'].fillna('')
    # After fillna('') the text can never be NaN, so only blank entries need
    # removing. `.copy()` makes the result an independent frame: later steps
    # add columns to news_df, which must not happen on a filtered slice.
    news_df = news_df[news_df['text'].str.strip() != ''].copy()
    print(f"Processed {len(news_df)} news entries.")
else:
    print("No news data to process.")

# Process Stock Data
if not stock_df.empty:
    # Use plain date objects as the index so it matches news_df['publishedAt'].
    stock_df.index = pd.to_datetime(stock_df.index).date
    # Calculate daily price movement: 1 for 'Up' (Close > Open), 0 for 'Down' (Close <= Open)
    stock_df['Price_Movement'] = (stock_df['Close'] > stock_df['Open']).astype(int)
    print(f"Processed {len(stock_df)} stock entries.")
else:
    print("No stock data to process.")

# Merge dataframes
# We'll merge news sentiment with the stock movement of the *next* day,
# as today's news might influence tomorrow's price.
# First, aggregate news sentiment by date.

# --- 5. Sentiment Analysis (Simplified for Demonstration) ---
# For a real project, you'd use a pre-trained sentiment model (e.g., from Hugging Face)
# or a larger, manually labeled dataset. Here, we'll create a very basic
# rule-based sentiment for training the Logistic Regression classifier.

def simple_sentiment_labeler(text):
    """
    Assign a coarse sentiment label to *text* based on keyword matches.

    This is a placeholder for a more sophisticated sentiment analysis. Each
    keyword counts at most once per text, and is matched as a whole word
    (optionally followed by a common suffix such as "s", "ed" or "ing").
    Plain substring matching is deliberately avoided: it would count "again"
    as "gain", "update" as "up" and "download" as "down".

    Args:
        text: Headline/description text. Non-string input is treated as
            having no sentiment.

    Returns:
        1 if more positive than negative keywords are present, 0 if more
        negative than positive, -1 when the counts tie (neutral/ambiguous).
    """
    positive_keywords = ['gain', 'rise', 'up', 'increase', 'strong', 'growth', 'boost', 'positive', 'good', 'success', 'record', 'high', 'profit', 'optimistic', 'rally']
    negative_keywords = ['fall', 'drop', 'down', 'decline', 'weak', 'loss', 'bad', 'negative', 'slump', 'plunge', 'crisis', 'cut', 'miss', 'warn', 'bearish']

    if not isinstance(text, str):
        return -1 # Nothing to score (e.g. None/NaN)

    text_lower = text.lower()

    def count_keywords(keywords):
        # One alternation per polarity; the capture group holds the base
        # keyword, so a set of matches gives the number of distinct keywords.
        alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
        pattern = rf'\b({alternatives})(?:s|es|d|ed|ing|er|est)?\b'
        return len(set(re.findall(pattern, text_lower)))

    pos_score = count_keywords(positive_keywords)
    neg_score = count_keywords(negative_keywords)

    if pos_score > neg_score:
        return 1 # Positive
    if neg_score > pos_score:
        return 0 # Negative
    return -1 # Neutral/Ambiguous (callers filter these out)

if not news_df.empty:
    news_df['simple_sentiment_label'] = news_df['text'].apply(simple_sentiment_labeler)
    # Filter out neutral labels for training the binary classifier
    sentiment_training_df = news_df[news_df['simple_sentiment_label'] != -1].copy()

    # A binary classifier needs both classes in the training split. Require a
    # minimum number of rows and at least two examples per class; together with
    # the stratified split below this guarantees both classes reach the
    # training set, instead of crashing inside train_test_split / fit.
    label_counts = sentiment_training_df['simple_sentiment_label'].value_counts()
    can_train_sentiment = (
        len(sentiment_training_df) >= 10
        and len(label_counts) == 2
        and label_counts.min() >= 2
    )

    if can_train_sentiment:
        print(f"Generated {len(sentiment_training_df)} simple sentiment labels for training.")

        # TF-IDF Vectorization
        tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
        X_sentiment = tfidf_vectorizer.fit_transform(sentiment_training_df['text'])
        y_sentiment = sentiment_training_df['simple_sentiment_label']

        # Train a Logistic Regression model for sentiment classification
        X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
            X_sentiment, y_sentiment, test_size=0.2, random_state=42, stratify=y_sentiment)

        sentiment_model = LogisticRegression(max_iter=1000)
        sentiment_model.fit(X_train_s, y_train_s)

        # Evaluate sentiment model
        y_pred_s = sentiment_model.predict(X_test_s)
        print("\n--- Sentiment Classifier Performance (on simple labels) ---")
        print(f"Accuracy: {accuracy_score(y_test_s, y_pred_s):.2f}")
        print(classification_report(y_test_s, y_pred_s))

        # Predict sentiment for all news headlines (including the neutral ones
        # that were excluded from training); column 1 is P(label == 1).
        news_df['sentiment_score'] = sentiment_model.predict_proba(tfidf_vectorizer.transform(news_df['text']))[:, 1] # Probability of being positive
        print("Predicted sentiment scores for all news headlines.")

        # Aggregate daily sentiment
        # Calculate daily average sentiment score
        daily_sentiment = news_df.groupby('publishedAt')['sentiment_score'].mean().reset_index()
        daily_sentiment.rename(columns={'publishedAt': 'Date', 'sentiment_score': 'Avg_Sentiment'}, inplace=True)
        print("Aggregated daily average sentiment.")
    else:
        print("Not enough non-neutral news headlines to train sentiment classifier. Skipping sentiment analysis.")
        daily_sentiment = pd.DataFrame() # Ensure it's empty if no training happened
        news_df['sentiment_score'] = 0.5 # Default to neutral if no sentiment model
else:
    print("No news data available to perform sentiment analysis.")
    daily_sentiment = pd.DataFrame()


# --- 6. Correlate Sentiment with Stock Movement & Prediction ---

if not stock_df.empty and not daily_sentiment.empty:
    # Shift stock movement by one day to align sentiment with *next day's* movement
    stock_df['Next_Day_Movement'] = stock_df['Price_Movement'].shift(-1)
    # Drop the last day as it has no next day movement; name the index so it
    # can be used as the 'Date' merge key.
    stock_df_for_merge = stock_df[['Next_Day_Movement']].dropna().rename_axis('Date')

    # Merge daily sentiment with next day's stock movement
    merged_df = pd.merge(daily_sentiment, stock_df_for_merge, on='Date', how='inner')
    merged_df = merged_df.dropna(subset=['Next_Day_Movement']) # Ensure no NaNs in target variable

    if not merged_df.empty:
        print(f"\nMerged {len(merged_df)} days of sentiment and stock movement data.")

        # Prepare data for prediction model
        X_predict = merged_df[['Avg_Sentiment']]
        y_predict = merged_df['Next_Day_Movement'].astype(int) # Ensure target is integer

        # A stratified split needs both classes, at least two rows per class,
        # and enough rows for the test split to hold one of each. Skip the
        # model (but keep the plots) rather than crash when that is not met.
        movement_counts = y_predict.value_counts()
        can_train_prediction = (
            len(merged_df) >= 10
            and len(movement_counts) == 2
            and movement_counts.min() >= 2
        )

        price_movement_model = None
        if can_train_prediction:
            # Split data for prediction model training
            X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_predict, y_predict, test_size=0.2, random_state=42, stratify=y_predict)

            # Train a Logistic Regression model to predict stock movement
            price_movement_model = LogisticRegression()
            price_movement_model.fit(X_train_p, y_train_p)

            # Evaluate prediction model
            y_pred_p = price_movement_model.predict(X_test_p)
            accuracy = accuracy_score(y_test_p, y_pred_p)
            print("\n--- Stock Price Movement Prediction Performance ---")
            print(f"Accuracy in binary up/down prediction: {accuracy:.2f}")
            print(classification_report(y_test_p, y_pred_p))
        else:
            print("Too few merged days (or only one movement class) to train the prediction model; showing visualizations only.")

        # --- 7. Visualize Sentiment-Time Trends ---
        plt.figure(figsize=(14, 7))
        sns.lineplot(x='Date', y='Avg_Sentiment', data=merged_df)
        plt.title(f'Daily Average Sentiment Trend for {STOCK_TICKER}')
        plt.xlabel('Date')
        plt.ylabel('Average Sentiment Score (0=Negative, 1=Positive)')
        plt.grid(True)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Optional: Visualize Price Movement vs. Sentiment
        plt.figure(figsize=(14, 7))
        sns.boxplot(x='Next_Day_Movement', y='Avg_Sentiment', data=merged_df)
        plt.title(f'Average Sentiment by Next Day Price Movement for {STOCK_TICKER}')
        plt.xlabel('Next Day Price Movement (0=Down/No Change, 1=Up)')
        plt.ylabel('Average Sentiment Score')
        plt.xticks(ticks=[0, 1], labels=['Down/No Change', 'Up'])
        plt.grid(True)
        plt.tight_layout()
        plt.show()

        # --- Make a Prediction for Tomorrow (or the next available trading day) ---
        if price_movement_model is not None:
            # Get today's (or most recent available) news sentiment
            last_news_date = news_df['publishedAt'].max()
            # pd.notna is an explicit missing-value test; plain truthiness
            # does not reliably detect a missing date.
            if pd.notna(last_news_date):
                most_recent_sentiment = news_df[news_df['publishedAt'] == last_news_date]['sentiment_score'].mean()
                if not pd.isna(most_recent_sentiment):
                    print(f"\nMost recent average sentiment ({last_news_date}): {most_recent_sentiment:.2f}")
                    # Predict next day's movement based on this sentiment.
                    # Use the same column name the model was fitted on.
                    prediction_input = pd.DataFrame({'Avg_Sentiment': [most_recent_sentiment]})
                    predicted_movement = price_movement_model.predict(prediction_input)[0]
                    movement_label = "Up" if predicted_movement == 1 else "Down or No Change"
                    print(f"Predicted stock movement for the next trading day: {movement_label}")
                else:
                    print("Could not get most recent sentiment for prediction.")
            else:
                print("No news data to make a prediction.")

    else:
        print("Not enough merged data to train prediction model or visualize.")
else:
    print("Cannot proceed with correlation and prediction: Missing stock or daily sentiment data.")

# Closing banner and reminders, emitted one line at a time.
for closing_line in (
    "\n--- MarketPulse Analysis Complete ---",
    "Remember to replace 'YOUR_NEWSAPI_KEY' with your actual NewsAPI key.",
    "The accuracy achieved (e.g., 78%) can vary significantly based on data, date range, and actual market conditions.",
):
    print(closing_line)