EDA Baby!

In [26]:
import sqlite3
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# --- Database Connection ---

# Construct the path relative to the notebook's expected location in the 'notebooks' folder
db_relative_path = os.path.join('..', 'data', 'licensing_data.db')
db_path = os.path.abspath(db_relative_path)

print(f"Attempting to connect to database at: {db_path}")

try:
    conn = sqlite3.connect(db_path)
    print("Database connection successful!")
except Exception as e:
    print(f"Error connecting to database: {e}")
    conn = None 

# --- Load Data into Pandas DataFrames ---

if conn:
    print("\nLoading data into DataFrames...")
    try:
        # Load Tweets 
        df_tweets = pd.read_sql_query("SELECT * FROM tweets", conn)
        print(f"Loaded {len(df_tweets)} tweets.")

        # Load Google Trends (Handle if empty/missing)
        try:
             df_trends = pd.read_sql_query("SELECT * FROM google_trends_data", conn)
             print(f"Loaded {len(df_trends)} Google Trends data points.")
        except (pd.io.sql.DatabaseError, Exception) as e:
             # Check specifically for "no such table" common after failed scrapes
             if "no such table: google_trends_data" in str(e).lower():
                 print("INFO: 'google_trends_data' table not found or empty. Proceeding without Google Trends.")
             else: # Print other errors but still create empty df
                  print(f"WARNING: Error loading Google Trends data: {e}. Proceeding without it.")
             df_trends = pd.DataFrame() # Create empty DataFrame

        # Load Products (joining with brands table)
        # Ensure the brands table was created and populated if needed by get_brand_id during scraping
        # If brands table might be empty or missing IDs, handle potential errors or adjust query
        try:
            # Assuming brands table exists and has matching IDs from scraping phase
            df_products = pd.read_sql_query("SELECT p.*, b.brand_name FROM products p LEFT JOIN brands b ON p.brand_id = b.id", conn)
            print(f"Loaded {len(df_products)} products.")
        except (pd.io.sql.DatabaseError, Exception) as e:
             print(f"ERROR loading products (check if 'brands' table exists and has IDs): {e}")
             df_products = pd.DataFrame() # Create empty if load fails

        # Close the connection 
        conn.close()
        print("\nDatabase connection closed.")

        # --- Initial Data Inspection ---
        print("\n--- Tweet Data Sample ---")
        # Display more columns if needed: pd.set_option('display.max_columns', None)
        print(df_tweets.head())
        print("\n--- Tweet Data Info ---")
        df_tweets.info()

        if not df_trends.empty:
            print("\n--- Google Trends Data Sample ---")
            print(df_trends.head())
            print("\n--- Google Trends Data Info ---")
            df_trends.info()
        else:
            print("\n--- Google Trends Data ---")
            print("(No data loaded)")

        if not df_products.empty:
            print("\n--- Product Data Sample ---")
            print(df_products.head())
            print("\n--- Product Data Info ---")
            df_products.info()
        else:
             print("\n--- Product Data ---")
             print("(No data loaded or error during load)")

    except Exception as e:
        print(f"Error during data loading or inspection: {e}")
        if conn:
            conn.close() 
            print("Database connection closed due to error.")
else:
    print("Cannot proceed without database connection.")

# Add this line at the end of Cell 1
if 'df_tweets' in locals() and not df_tweets.empty:
    print("\n--- Raw Tweet Date Samples ---")
    print(df_tweets['tweet_date'].head(20))

# Add/Modify these lines at the end of Cell 1
if 'df_tweets' in locals() and not df_tweets.empty:
    print("\n--- Inspecting Raw Tweet Dates ---")
    print("Data type of 'tweet_date' column:", df_tweets['tweet_date'].dtype)
    print("First 20 raw date string samples:")
    # Use .unique() to potentially see different formats if they exist
    unique_dates = df_tweets['tweet_date'].unique()
    print(unique_dates[:20]) 
    # Check if ALL are the same format (or if there are variations)
    if len(unique_dates) > 20:
        print("...")
        print("Last 5 unique raw date strings:")
        print(unique_dates[-5:])

Attempting to connect to database at: c:\Users\Nayef Alam\Desktop\ksa_licensing_model\data\licensing_data.db
Database connection successful!

Loading data into DataFrames...
Loaded 5852 tweets.
Loaded 0 Google Trends data points.
Loaded 209 products.

Database connection closed.

--- Tweet Data Sample ---
   id brand_name             tweet_id tweet_date username tweet_content  \
0   1   Fanatics  1981414626412941519             unknown                 
1   2   Fanatics  1981378630342111638             unknown                 
2   3   Fanatics  1981336097633218724             unknown                 
3   4   Fanatics  1980808490873430305             unknown                 
4   5   Fanatics  1980808431754780854             unknown                 

  language  reply_count  retweet_count  like_count  quote_count  
0       en            0              0           0            0  
1       en            0              0           0            0  
2       en            0              0      

In [27]:
import numpy as np
import re
import pandas as pd

print("--- Data Cleaning and Preprocessing ---")

# --- Clean df_tweets ---
print("\nCleaning Tweet Data...")

if 'df_tweets' in locals() and not df_tweets.empty:

    original_tweet_count = len(df_tweets)
    print(f"   Starting with {original_tweet_count} tweets.")

    # --- FIX V5: Attempt date parsing but DO NOT DROP ROWS if all fail ---
    print("   Attempting to parse dates (will keep rows even if parsing fails)...")
    date_strings = df_tweets['tweet_date'].astype(str).str.strip()
    parsed_dates = pd.to_datetime(date_strings, infer_datetime_format=True, errors='coerce')

    df_tweets['tweet_date'] = parsed_dates # Assign results (will contain NaT)

    parsed_count = df_tweets['tweet_date'].notna().sum()
    if parsed_count < original_tweet_count:
        print(f"   WARNING: {original_tweet_count - parsed_count} date formats could not be parsed and remain NaT.")
        # DO NOT DROP: # df_tweets.dropna(subset=['tweet_date'], inplace=True)
    else:
        print("   All tweet dates parsed successfully (or were already datetime).")

    # Ensure numeric columns are numeric (Keep as before)
    numeric_tweet_cols = ['reply_count', 'retweet_count', 'like_count', 'quote_count']
    for col in numeric_tweet_cols:
        df_tweets[col] = pd.to_numeric(df_tweets[col], errors='coerce').fillna(0).astype(int)

    # (Keep text cleaning function if needed)
    # def clean_text(text): ...

    print("Tweet Data Cleaned:")
    df_tweets.info() # Should show 5852 entries, but 0 non-null for tweet_date


else:
    print("   WARNING: df_tweets is empty or does not exist before cleaning.")

# --- Clean df_products (Keep as before) ---
print("\nCleaning Product Data...")
if 'df_products' in locals() and not df_products.empty:
    numeric_product_cols = ['price', 'avg_rating', 'num_reviews']
    for col in numeric_product_cols:
        df_products[col] = pd.to_numeric(df_products[col], errors='coerce')

    df_products['avg_rating'] = df_products['avg_rating'].fillna(0)
    df_products['num_reviews'] = df_products['num_reviews'].fillna(0).astype(int)

    print("Product Data Cleaned:")
    df_products.info()
else:
    print("   Product DataFrame is empty or does not exist, skipping cleaning.")

print("\nCleaning Complete ---")

--- Data Cleaning and Preprocessing ---

Cleaning Tweet Data...
   Starting with 5852 tweets.
   Attempting to parse dates (will keep rows even if parsing fails)...
Tweet Data Cleaned:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5852 entries, 0 to 5851
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             5852 non-null   int64         
 1   brand_name     5852 non-null   object        
 2   tweet_id       5852 non-null   object        
 3   tweet_date     0 non-null      datetime64[ns]
 4   username       5852 non-null   object        
 5   tweet_content  5852 non-null   object        
 6   language       5852 non-null   object        
 7   reply_count    5852 non-null   int64         
 8   retweet_count  5852 non-null   int64         
 9   like_count     5852 non-null   int64         
 10  quote_count    5852 non-null   int64         
dtypes: datetime64[ns](1), int64(5), object(5

  parsed_dates = pd.to_datetime(date_strings, infer_datetime_format=True, errors='coerce')


In [30]:
import numpy as np
import pandas as pd # Ensure pandas is imported

print("--- Feature Engineering ---")

# --- Calculate Tweet Volume per Brand ---
print("\nCalculating Tweet Volume per Brand...")

# Check if df_tweets exists and has data
if 'df_tweets' in locals() and not df_tweets.empty and 'brand_name' in df_tweets.columns:
    # --- FIX V6: Group tweets by brand and count them ---
    df_tweet_volume = df_tweets.groupby('brand_name').size().reset_index(name='tweet_volume')
    
    print("   Created 'df_tweet_volume' DataFrame.")
    print(df_tweet_volume.head())
else:
     print("   WARNING: Cannot calculate tweet volume. df_tweets is empty or missing 'brand_name'.")
     df_tweet_volume = pd.DataFrame(columns=['brand_name', 'tweet_volume'])


# --- Calculate Brand Metrics from Products ---
# (Keep this part exactly the same as before)
print("\nCalculating Brand Metrics from Product Data...")
if 'df_products' in locals() and not df_products.empty:
    if 'brand_name' in df_products.columns and 'product_name' in df_products.columns and \
       'avg_rating' in df_products.columns and 'num_reviews' in df_products.columns:
        
        df_brand_metrics = df_products.groupby('brand_name').agg(
            market_saturation=('product_name', 'count'),  
            avg_perceived_quality=('avg_rating', 'mean'), 
            avg_num_reviews=('num_reviews', 'mean')       
        ).reset_index()
        # Round the averages for cleaner display
        df_brand_metrics['avg_perceived_quality'] = df_brand_metrics['avg_perceived_quality'].round(2)
        df_brand_metrics['avg_num_reviews'] = df_brand_metrics['avg_num_reviews'].round(2)
        print("   Created 'df_brand_metrics' DataFrame.")
        print(df_brand_metrics.head())
    else:
        print("   ERROR: Required columns missing in df_products. Cannot calculate brand metrics.")
        df_brand_metrics = pd.DataFrame(columns=['brand_name', 'market_saturation', 'avg_perceived_quality', 'avg_num_reviews'])
else:
    print("   Product DataFrame is empty or does not exist, skipping brand metrics calculation.")
    df_brand_metrics = pd.DataFrame(columns=['brand_name', 'market_saturation', 'avg_perceived_quality', 'avg_num_reviews'])

# --- Combine Metrics into One DataFrame ---
print("\nCombining all metrics...")

# Ensure both dataframes exist before merging
if 'df_tweet_volume' in locals() and 'df_brand_metrics' in locals():
    # Merge tweet volume with product metrics using brand_name
    df_combined_metrics = pd.merge(df_tweet_volume, df_brand_metrics, on='brand_name', how='outer')

    # Fill NaN values that result from the merge 
    df_combined_metrics['tweet_volume'] = df_combined_metrics['tweet_volume'].fillna(0).astype(int) 
    df_combined_metrics['market_saturation'] = df_combined_metrics['market_saturation'].fillna(0).astype(int)
    df_combined_metrics['avg_perceived_quality'] = df_combined_metrics['avg_perceived_quality'].fillna(0)
    df_combined_metrics['avg_num_reviews'] = df_combined_metrics['avg_num_reviews'].fillna(0)
     
    print("   Created 'df_combined_metrics' DataFrame.")
    # Sort by tweet_volume descending for better initial view
    print(df_combined_metrics.sort_values(by='tweet_volume', ascending=False)) 
else:
    print("   ERROR: Could not combine metrics because one or both required DataFrames are missing.")
    df_combined_metrics = pd.DataFrame() # Create empty if merge fails

print("\n--- Feature Engineering Complete ---")

--- Feature Engineering ---

Calculating Tweet Volume per Brand...
   Created 'df_tweet_volume' DataFrame.
      brand_name  tweet_volume
0       Al-Hilal          1028
1       Al-Nassr           958
2       Fanatics           502
3      KSA Anime           103
4  KSA One Piece            45

Calculating Brand Metrics from Product Data...
   Created 'df_brand_metrics' DataFrame.
      brand_name  market_saturation  avg_perceived_quality  avg_num_reviews
0       Al-Hilal                 25                   1.38             0.84
1       Al-Nassr                 24                   1.35             0.75
2       Fanatics                 25                   3.12             8.60
3  KSA One Piece                 25                   2.14             4.12
4        Lazurde                 25                   3.45           294.92

Combining all metrics...
   Created 'df_combined_metrics' DataFrame.
             brand_name  tweet_volume  market_saturation  \
0              Al-Hilal         

In [29]:
# --- Check Raw Engagement Counts ---
print("\n--- Checking Raw Engagement Counts from DB ---")
conn = None # Ensure conn is defined
try:
    conn = sqlite3.connect(db_path) # db_path should still be defined from Cell 1
    df_check = pd.read_sql_query("""
        SELECT 
            brand_name, 
            SUM(like_count) as total_likes, 
            SUM(retweet_count) as total_retweets,
            COUNT(*) as num_tweets
        FROM tweets 
        GROUP BY brand_name
        ORDER BY total_likes DESC
        LIMIT 15 
    """, conn)
    
    print("Sample sums of raw engagement counts per brand:")
    print(df_check)
    
    conn.close()

except Exception as e:
    print(f"Error checking raw counts: {e}")
    if conn:
        conn.close()


--- Checking Raw Engagement Counts from DB ---
Sample sums of raw engagement counts per brand:
            brand_name  total_likes  total_retweets  num_tweets
0  Vacheron Constantin            0               0           8
1         Saudi Aramco            0               0        1008
2        Riyadh Season            0               0         625
3                  PIF            0               0         560
4              Lazurde            0               0        1015
5        KSA One Piece            0               0          45
6            KSA Anime            0               0         103
7             Fanatics            0               0         502
8             Al-Nassr            0               0         958
9             Al-Hilal            0               0        1028


In [35]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # For potential log scale
import os # For saving plots

print("--- Data Visualization (Refined) ---")

# Ensure the combined metrics DataFrame exists and isn't empty
if 'df_combined_metrics' in locals() and not df_combined_metrics.empty:

    # --- Plot 1: Tweet Volume (Hype Proxy) ---
    print("\nGenerating Plot 1: Tweet Volume by Brand...")
    plt.figure(figsize=(12, 7)) 
    df_sorted_hype = df_combined_metrics.sort_values('tweet_volume', ascending=False)
    # Filter out brands with 0 tweet volume if desired for cleaner plot
    # df_sorted_hype = df_sorted_hype[df_sorted_hype['tweet_volume'] > 0] 
    sns.barplot(data=df_sorted_hype, y='brand_name', x='tweet_volume', palette='viridis')
    plt.title('Tweet Volume (Hype Proxy) by Brand')
    plt.xlabel('Total Tweets Collected')
    plt.ylabel('Brand')
    plt.tight_layout() 
    plt.savefig(os.path.join('..', 'tweet_volume_by_brand_v2.png')) 
    print("   Saved plot as tweet_volume_by_brand_v2.png")
    plt.close() 


    # --- Plot 2: Market Saturation ---
    print("\nGenerating Plot 2: Market Saturation (Amazon Products) by Brand...")
    df_saturated = df_combined_metrics[df_combined_metrics['market_saturation'] > 0]
    if not df_saturated.empty:
        plt.figure(figsize=(12, 7))
        df_sorted_saturation = df_saturated.sort_values('market_saturation', ascending=False)
        sns.barplot(data=df_sorted_saturation, y='brand_name', x='market_saturation', palette='magma')
        plt.title('Market Saturation (Amazon Products Found) by Brand')
        plt.xlabel('Number of Products Found (Max 25)')
        plt.ylabel('Brand')
        plt.tight_layout()
        plt.savefig(os.path.join('..', 'market_saturation_by_brand_v2.png'))
        print("   Saved plot as market_saturation_by_brand_v2.png")
        plt.close()
    else:
        print("   Skipping Market Saturation plot: No brands with products found.")


    # --- Plot 3: Hype vs. Saturation Scatter Plot ---
    print("\nGenerating Plot 3: Tweet Volume vs. Market Saturation...")
    plt.figure(figsize=(10, 8))
    # Use only brands with BOTH tweets and products for this plot for clearer comparison
    df_plot_data = df_combined_metrics[(df_combined_metrics['market_saturation'] > 0) & (df_combined_metrics['tweet_volume'] > 0)].copy()

    if not df_plot_data.empty:
        scatter = sns.scatterplot(
            data=df_plot_data, 
            x='market_saturation', 
            y='tweet_volume', 
            hue='avg_perceived_quality', # Color points by average rating
            size='avg_num_reviews',      # Size points by average number of reviews
            sizes=(50, 500),             
            palette='coolwarm',          
            legend='auto' 
        )
        
        # Add labels 
        for i in range(df_plot_data.shape[0]):
            plt.text(
                x=df_plot_data['market_saturation'].iloc[i] + 0.1, 
                y=df_plot_data['tweet_volume'].iloc[i] + 5, 
                s=df_plot_data['brand_name'].iloc[i], 
                fontdict=dict(color='black', size=9)
            )

        plt.title('Hype (Tweet Volume) vs. Market Saturation')
        plt.xlabel('Market Saturation (Amazon Products)')
        plt.ylabel('Tweet Volume (Total Tweets)')
        
        # Add quadrant lines at median values for context
        median_hype = df_plot_data['tweet_volume'].median()
        median_saturation = df_plot_data['market_saturation'].median()
        plt.axhline(median_hype, color='grey', linestyle='--', linewidth=0.8)
        plt.axvline(median_saturation, color='grey', linestyle='--', linewidth=0.8)
        # Add text labels for quadrants if desired (example)
        # plt.text(median_saturation * 1.1, median_hype * 1.1, "High Hype, High Saturation", color='grey')
        # plt.text(median_saturation * 0.9, median_hype * 1.1, "High Hype, Low Saturation", color='grey', ha='right')
        # ... etc for other quadrants ...
        
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
        plt.tight_layout(rect=[0, 0, 0.85, 1]) 
        plt.savefig(os.path.join('..', 'hype_vs_saturation_v2.png'))
        print("   Saved plot as hype_vs_saturation_v2.png")
        plt.close()
    else:
         print("   Skipping Hype vs. Saturation plot: No brands with both products and tweets found.")


    print("\n--- Visualization Complete ---")

else:
    print("   ERROR: 'df_combined_metrics' DataFrame not found or empty. Cannot create plots.")

--- Data Visualization (Refined) ---

Generating Plot 1: Tweet Volume by Brand...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_sorted_hype, y='brand_name', x='tweet_volume', palette='viridis')


   Saved plot as tweet_volume_by_brand_v2.png

Generating Plot 2: Market Saturation (Amazon Products) by Brand...
   Saved plot as market_saturation_by_brand_v2.png

Generating Plot 3: Tweet Volume vs. Market Saturation...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_sorted_saturation, y='brand_name', x='market_saturation', palette='magma')


   Saved plot as hype_vs_saturation_v2.png

--- Visualization Complete ---


In [36]:
# --- Save Final Metrics to CSV ---
print("\n--- Saving Combined Metrics ---")

if 'df_combined_metrics' in locals() and not df_combined_metrics.empty:
    csv_path_relative = os.path.join('..', 'data', 'brand_metrics_final.csv')
    csv_path = os.path.abspath(csv_path_relative)
    try:
        df_combined_metrics.to_csv(csv_path, index=False)
        print(f"   Successfully saved combined metrics to: {csv_path}")
    except Exception as e:
        print(f"   Error saving metrics to CSV: {e}")
else:
    print("   ERROR: 'df_combined_metrics' not found or empty. Cannot save.")


--- Saving Combined Metrics ---
   Successfully saved combined metrics to: c:\Users\Nayef Alam\Desktop\ksa_licensing_model\data\brand_metrics_final.csv


Sentiment Analysis (Did not work)

In [32]:
%pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [33]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd # Ensure pandas is imported

print("--- Phase 3a: Tweet Sentiment Analysis ---")

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get VADER sentiment score (specifically the compound score)
def get_vader_sentiment(text):
    if isinstance(text, str):
        # VADER's polarity_scores returns dict: {neg, neu, pos, compound}
        # Compound score is a normalized score between -1 (most negative) and +1 (most positive)
        return analyzer.polarity_scores(text)['compound']
    return 0 # Return neutral score for non-string data (like potential NaN)

# Check if df_tweets exists and has the content column
if 'df_tweets' in locals() and not df_tweets.empty and 'tweet_content' in df_tweets.columns:
    print("\nCalculating sentiment scores for tweets...")
    # Apply the function to the 'tweet_content' column
    # Create a new column 'sentiment_score'
    df_tweets['sentiment_score'] = df_tweets['tweet_content'].apply(get_vader_sentiment)
    
    print("   Added 'sentiment_score' column.")
    # Display sample results including the new score
    print(df_tweets[['brand_name', 'tweet_content', 'sentiment_score']].head())

    # --- Calculate Average Sentiment per Brand ---
    print("\nCalculating average sentiment per brand...")
    # Group by brand_name and calculate the mean of the sentiment scores
    df_avg_sentiment = df_tweets.groupby('brand_name')['sentiment_score'].mean().reset_index()
    df_avg_sentiment.rename(columns={'sentiment_score': 'avg_tweet_sentiment'}, inplace=True)
    df_avg_sentiment['avg_tweet_sentiment'] = df_avg_sentiment['avg_tweet_sentiment'].round(3) # Round for readability
    
    print("   Created 'df_avg_sentiment' DataFrame.")
    print(df_avg_sentiment.head())

    # --- Merge Average Sentiment into Combined Metrics ---
    print("\nMerging average sentiment into combined metrics...")
    if 'df_combined_metrics' in locals() and not df_combined_metrics.empty:
        df_combined_metrics = pd.merge(df_combined_metrics, df_avg_sentiment, on='brand_name', how='left')
        # Fill potential NaN if a brand had no tweets to analyze
        df_combined_metrics['avg_tweet_sentiment'] = df_combined_metrics['avg_tweet_sentiment'].fillna(0) 
        
        print("   Updated 'df_combined_metrics' DataFrame:")
        # Display the updated combined metrics, sorted by sentiment
        print(df_combined_metrics.sort_values(by='avg_tweet_sentiment', ascending=False))
    else:
        print("   ERROR: 'df_combined_metrics' not found. Cannot merge sentiment.")
        
else:
    print("   WARNING: df_tweets is empty or missing 'tweet_content'. Cannot perform sentiment analysis.")

print("\n--- Sentiment Analysis Complete ---")

--- Phase 3a: Tweet Sentiment Analysis ---

Calculating sentiment scores for tweets...
   Added 'sentiment_score' column.
  brand_name tweet_content  sentiment_score
0   Fanatics                            0.0
1   Fanatics                            0.0
2   Fanatics                            0.0
3   Fanatics                            0.0
4   Fanatics                            0.0

Calculating average sentiment per brand...
   Created 'df_avg_sentiment' DataFrame.
      brand_name  avg_tweet_sentiment
0       Al-Hilal                  0.0
1       Al-Nassr                  0.0
2       Fanatics                  0.0
3      KSA Anime                  0.0
4  KSA One Piece                  0.0

Merging average sentiment into combined metrics...
   Updated 'df_combined_metrics' DataFrame:
             brand_name  tweet_volume  market_saturation  \
0              Al-Hilal          1028                 25   
1              Al-Nassr           958                 24   
2              Fanatics  

In [34]:
print("--- Inspecting Tweet Content and Sentiment Scores ---")

if 'df_tweets' in locals() and not df_tweets.empty and 'sentiment_score' in df_tweets.columns:
    # Display some tweets and their scores
    # Filter for potentially non-neutral tweets if possible (though scores are 0 now)
    # Let's just look at a random sample
    print("Random sample of tweets and their calculated sentiment scores:")
    print(df_tweets[['brand_name', 'tweet_content', 'sentiment_score']].sample(15))

    # Check distribution of scores (will likely be all 0)
    print("\nValue counts for sentiment_score:")
    print(df_tweets['sentiment_score'].value_counts())
else:
    print("df_tweets is empty or missing sentiment_score column.")

--- Inspecting Tweet Content and Sentiment Scores ---
Random sample of tweets and their calculated sentiment scores:
        brand_name tweet_content  sentiment_score
2366  Saudi Aramco                            0.0
4330      Al-Hilal                            0.0
1935           PIF                            0.0
4347      Al-Hilal                            0.0
4984      Al-Nassr                            0.0
1797           PIF                            0.0
4419      Al-Hilal                            0.0
157       Fanatics                            0.0
1000       Lazurde                            0.0
1227       Lazurde                            0.0
1376       Lazurde                            0.0
1869           PIF                            0.0
3793      Al-Hilal                            0.0
80        Fanatics                            0.0
2054           PIF                            0.0

Value counts for sentiment_score:
sentiment_score
0.0    5852
Name: count, dtype: 