<a href="https://colab.research.google.com/github/pintukumargithub/Project/blob/main/ai_sentiment_analysis_gemini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AI Pipeline: Bluesky Scraper + Gemini Flash Sentiment Analysis

### Libraries

In [None]:
import enum
import getpass
import json
import os
from typing import Optional, Tuple

import google.generativeai as genai
import pandas as pd
import plotly.express as px
import requests
from typing_extensions import TypedDict

## 1. Configuration

### Authentication and API Keys

In [None]:
# Credentials are read from environment variables so that secrets are never
# saved inside the notebook. When a variable is unset (or empty) the value is
# requested interactively; getpass keeps the typed secret out of the output.
BLUESKY_HANDLE = os.environ.get('BLUESKY_HANDLE') or input('Bluesky handle: ')
BLUESKY_PASSWORD = (
    os.environ.get('BLUESKY_PASSWORD')
    or getpass.getpass('Bluesky password: ')
)

# Google AI Studio API key, resolved the same way (GOOGLE_API_KEY or prompt)
genai.configure(
    api_key=os.environ.get('GOOGLE_API_KEY')
    or getpass.getpass('Google AI Studio API key: ')
)

### Gemini Model

In [None]:
# Model id used for every classification request; the original cell noted
# "gemini-2.0-flash-exp" as another id to try.
GEMINI_MODEL_NAME = "gemini-1.5-flash"
model = genai.GenerativeModel(GEMINI_MODEL_NAME)

### Stock (or keyword to analyze)

In [None]:
# Ticker symbol or keyword: sent as the Bluesky search query (`q`) and
# interpolated into the Gemini prompt as the company being analyzed.
search_term = 'ADBE'

### Number of posts to return

In [None]:
# Used as the size of the Bluesky search request further down.
n = 100  # Number of latest posts to retrieve

## 2. Bluesky Post Search (API)

In [None]:
# Authenticate and obtain access token
auth_response = requests.post(
    'https://bsky.social/xrpc/com.atproto.server.createSession',
    json={'identifier': BLUESKY_HANDLE, 'password': BLUESKY_PASSWORD},
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# Surface HTTP errors (e.g. bad credentials) immediately
auth_response.raise_for_status()
access_token = auth_response.json().get('accessJwt')
if not access_token:
    # Fail here rather than sending "Bearer None" with the search request
    raise RuntimeError('Bluesky login response did not contain an accessJwt')


In [None]:
# Set up the request headers with the access token
headers = {'Authorization': f'Bearer {access_token}'}

SEARCH_URL = 'https://bsky.social/xrpc/app.bsky.feed.searchPosts'
MAX_PAGE_SIZE = 100  # largest `limit` the searchPosts endpoint accepts

if n < 1:
    raise ValueError('n must be at least 1')

# Collect up to `n` posts. A single request can return at most MAX_PAGE_SIZE
# posts, so larger values of `n` are fetched page by page using the `cursor`
# returned with each response.
posts = []
cursor = None
while len(posts) < n:
    # Define the search parameters for this page
    params = {
        'q': search_term,
        'sort': 'latest',
        'limit': min(MAX_PAGE_SIZE, n - len(posts)),
    }
    if cursor:
        params['cursor'] = cursor

    # Perform the search request
    search_response = requests.get(
        SEARCH_URL,
        headers=headers,
        params=params,
        timeout=30,  # requests waits indefinitely unless a timeout is given
    )
    search_response.raise_for_status()
    payload = search_response.json()

    page = payload.get('posts', [])
    posts.extend(page)
    cursor = payload.get('cursor')
    if not page or not cursor:
        # No further results are available for this query
        break

In [None]:
# Columns of the posts table. Passing them explicitly keeps the columns
# present even when the search returned no posts; without this, the 'Date'
# conversion below raises KeyError on an empty result.
POST_COLUMNS = ['Date', 'Content', 'Author']

# Extract data and create a list of dictionaries (one per post)
data = []
for post in posts:
    record = post.get('record', {})
    data.append({
        'Date': record.get('createdAt', 'Unknown date'),
        'Content': record.get('text', 'No content'),
        'Author': post.get('author', {}).get('handle', 'Unknown'),
    })

# Convert list of dictionaries to DataFrame
df = pd.DataFrame(data, columns=POST_COLUMNS)

# Convert 'Date' to datetime. Unparseable values (including the
# 'Unknown date' placeholder) become NaT. utc=True normalizes timestamps that
# carry different UTC offsets into one tz-aware column so `.dt` keeps working.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', utc=True)

In [None]:
# Preview the collected posts: timestamp and text only
df.loc[:, ['Date', 'Content']]

Unnamed: 0,Date,Content
0,2024-12-31 19:10:07.207000+00:00,\n#MarjorieTaylorGreene Went Christmas Shoppin...
1,2024-12-31 17:39:40.015000+00:00,Over the past year #AJB and #ADBE swapped from...
2,2024-12-31 17:39:40.014000+00:00,The major changes to the port. over the year w...
3,2024-12-31 16:03:55.965884+00:00,"📊 ADBE Market Analysis - Dec 31, 2024\n\nCurre..."
4,2024-12-30 16:01:16.570000+00:00,"Adobe knows that #DEI is good for people, good..."
...,...,...
93,2024-12-12 17:19:22.999000+00:00,"Hello, Investors! 👋\nStocks were down modestly..."
94,2024-12-12 16:53:12.278000+00:00,Adobe posts record-breaking revenue 📈 but inve...
95,2024-12-12 16:20:44.597227+00:00,$ADBE Technical Analysis | Dec 12\nPrice: $549...
96,2024-12-12 15:49:51.063000+00:00,$ADBE: Adobe shares dropped 13% as its 2025 ou...


## 3. Google Gemini Sentiment Analysis

In [None]:
# Closed set of sentiment labels. It is the type of the `sentiment` field in
# AnalysisResult; the lowercase string values are what later cells compare
# against (e.g. 'positive', 'neutral').
class Sentiment(enum.Enum):
    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"

# Structure of the JSON object requested from Gemini; analyze_post passes this
# class as `response_schema` and reads both keys back from the parsed reply.
class AnalysisResult(TypedDict):
    is_stock_related: bool  # answer to question 1 of the analyze_post prompt
    sentiment: Sentiment  # answer to question 2; one of the Sentiment labels


In [None]:

def analyze_post(content: str) -> Tuple[Optional[bool], Optional[str]]:
    """Ask Gemini whether a post concerns the stock and what its sentiment is.

    The prompt is built around the module-level ``search_term`` and sent through
    the module-level ``model``, requesting a JSON reply shaped like
    ``AnalysisResult``.

    Args:
        content: Text of the post to classify.

    Returns:
        A ``(is_stock_related, sentiment)`` tuple, where ``sentiment`` is the
        label string taken from the parsed JSON reply.
        ``(False, None)`` is returned when the model reports the post as
        unrelated and gives no sentiment. ``(None, None)`` is returned, after
        printing a short diagnostic, when there are no candidates, the reply is
        not valid JSON, or the expected keys are missing.
    """
    prompt = f"""
    Analyze the following post and determine:
    1. Whether it is related to the company, {search_term}, and relates to or discusses
        past, current, or future stock performance of {search_term} explicitly.
    2. If related, classify the sentiment as positive, negative, or neutral.

    Post: "{content}"
    """
    response = model.generate_content(
        prompt,
        generation_config=genai.GenerationConfig(
            response_mime_type="application/json",
            response_schema=AnalysisResult
        )
    )
    if not response.candidates:
        print("No candidates returned")
        return None, None

    # The reply text may be split over several parts; join them before parsing
    parts = response.candidates[0].content.parts
    result_text = ''.join(part.text for part in parts)
    try:
        result = json.loads(result_text)
    except json.JSONDecodeError:
        print("Failed to decode JSON response")
        return None, None

    # Valid JSON that is not an object (e.g. a list) has no keys to read
    if not isinstance(result, dict):
        print("Missing expected keys in the response")
        return None, None

    is_stock_related = result.get('is_stock_related')
    sentiment = result.get('sentiment')

    if is_stock_related is False and sentiment is None:
        # The prompt only asks for a sentiment "if related", so an unrelated
        # post without a sentiment is a complete answer; keep the False flag
        # instead of discarding it as a malformed reply.
        return False, None

    if is_stock_related is None or sentiment is None:
        print("Missing expected keys in the response")
        return None, None

    return is_stock_related, sentiment


In [None]:
# Apply the analysis to each post.
# The (is_stock_related, sentiment) tuples are gathered into one two-column
# frame aligned on df's index. This also works when df has no rows, where the
# Series.apply form raises "Columns must be same length as key".
analysis_rows = [analyze_post(text) for text in df['Content']]
df[['is_stock_related', 'sentiment']] = pd.DataFrame(
    analysis_rows,
    index=df.index,
    columns=['is_stock_related', 'sentiment'],
)

Missing expected keys in the response
Missing expected keys in the response


In [None]:
# Remove the author handle before displaying results. Reassigning (rather than
# dropping in place) and ignoring an already-missing column lets this cell be
# re-run without raising KeyError.
df = df.drop(columns='Author', errors='ignore')
df

Unnamed: 0,Date,Content,is_stock_related,sentiment
0,2024-12-31 19:10:07.207000+00:00,\n#MarjorieTaylorGreene Went Christmas Shoppin...,True,neutral
1,2024-12-31 17:39:40.015000+00:00,Over the past year #AJB and #ADBE swapped from...,True,neutral
2,2024-12-31 17:39:40.014000+00:00,The major changes to the port. over the year w...,True,negative
3,2024-12-31 16:03:55.965884+00:00,"📊 ADBE Market Analysis - Dec 31, 2024\n\nCurre...",True,negative
4,2024-12-30 16:01:16.570000+00:00,"Adobe knows that #DEI is good for people, good...",True,positive
...,...,...,...,...
93,2024-12-12 17:19:22.999000+00:00,"Hello, Investors! 👋\nStocks were down modestly...",True,negative
94,2024-12-12 16:53:12.278000+00:00,Adobe posts record-breaking revenue 📈 but inve...,True,negative
95,2024-12-12 16:20:44.597227+00:00,$ADBE Technical Analysis | Dec 12\nPrice: $549...,True,positive
96,2024-12-12 15:49:51.063000+00:00,$ADBE: Adobe shares dropped 13% as its 2025 ou...,True,negative


In [None]:
# Keep only posts judged stock-related that received a clear positive or
# negative label. Neutral posts are excluded as before; rows whose sentiment
# is missing (failed or unrelated analyses) are excluded too, so they no
# longer count as "not positive" in the score's denominator.
is_related = df['is_stock_related'].eq(True)
has_polarity = df['sentiment'].isin(['positive', 'negative'])

# .assign builds a new frame, which avoids the SettingWithCopyWarning raised
# when a column is added to a filtered slice.
filtered_df = df.loc[is_related & has_polarity].assign(
    Day=lambda frame: frame['Date'].dt.date
)

# Daily positive sentiment score = share of positive posts among that day's
# positive and negative posts (mean of a boolean column).
daily_sentiment = (
    filtered_df['sentiment']
    .eq('positive')
    .groupby(filtered_df['Day'])
    .mean()
    .reset_index(name='positive_sentiment_score')
)

# Plot the daily sentiment score
fig = px.line(
    daily_sentiment,
    x='Day',
    y='positive_sentiment_score',
    title='Daily Positive Sentiment Score',
    labels={'positive_sentiment_score': 'Positive Sentiment Score', 'Day': 'Date'},
    markers=True,
)

# On a date axis dtick is given in milliseconds: one tick per day
ONE_DAY_MS = 24 * 60 * 60 * 1000
fig.update_xaxes(dtick=ONE_DAY_MS, tickformat="%Y-%m-%d")

fig



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

