# Dataset Preparation

In [None]:
import pandas as pd
import numpy as np
from prophet import Prophet
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import matplotlib.pyplot as plt
import torch
import spacy
from R_files.data_handler import DataCleaner

In [None]:
# Load the dataset from a CSV in the working directory; later cells read its
# "tweets", "date" and engagement-count columns.
df = pd.read_csv('trend_analysis.csv')

In [None]:
# Wrap the frame in the project's DataCleaner and drop rows from it.
# NOTE(review): the tuple is presumably a (start, end) row range — confirm
# against DataCleaner.drop_rows, which is defined outside this file.
cleaner = DataCleaner(df)
df = cleaner.drop_rows((100000,547483))

In [None]:
# Peek at the last four rows to sanity-check the cleaned frame.
df.tail(4)

In [None]:
# Inspect the last hundred rows of the cleaned frame.
df.tail(100)

# Preparing data and plotting

In [None]:
# Parse the "date" column into pandas datetimes so it can be bucketed by day later.
df["date"] = pd.to_datetime(df["date"])

In [None]:
# Pick the torch device: Apple's Metal backend ("mps") when PyTorch reports it
# as available, otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [None]:
# Load spaCy's small English pipeline; its named-entity output is used below
# to look for organisation/product mentions in tweets.
nlp = spacy.load("en_core_web_sm")
# Empty set for brand names; it is not populated in the cells visible here.
brands = set()

In [None]:
# Brands to track across the tweet dataset.
predefined_brands = [
    "nike",
    "samsung",
    "google",
    "microsoft",
    "amazon",
]

In [None]:
# Normalise the brand names to lower case and de-duplicate them as a set.
predefined_brands = set(map(str.lower, predefined_brands))


In [None]:
# Count, for every tracked brand, how many tweets mention it.
brand_counts = {brand: 0 for brand in predefined_brands}

# spaCy only accepts strings, so missing tweets (NaN) are dropped up front
# instead of crashing the loop; a missing tweet cannot mention a brand anyway.
tweet_texts = df["tweets"].dropna().astype(str).tolist()

# nlp.pipe processes the texts in batches and yields the docs in input order,
# which is considerably faster than calling nlp() once per tweet.
for tweet, doc in zip(tweet_texts, nlp.pipe(tweet_texts)):
    # Named entities that spaCy tagged as organisations or products
    tweet_entities = {
        ent.text.lower() for ent in doc.ents if ent.label_ in ["ORG", "PRODUCT"]
    }

    # Lower-case once per tweet rather than once per brand
    tweet_lower = tweet.lower()

    # A tweet counts for a brand when the brand is a recognised entity or
    # simply appears anywhere in the (case-folded) tweet text
    for brand in predefined_brands:
        if brand in tweet_entities or brand in tweet_lower:
            brand_counts[brand] += 1

In [None]:
# Report how many tweets mentioned each brand.
for name in brand_counts:
    print(f"{name}: {brand_counts[name]} mentions")

In [None]:
# Build one daily engagement series per brand, shaped for Prophet ("ds", "y").
brand_trends = []
engagement_columns = ["likeCount", "retweetCount", "viewCount", "replyCount"]

for brand in predefined_brands:
    # Keep only tweets whose text mentions the brand (case-insensitive;
    # missing tweets never match)
    mentions_brand = df["tweets"].str.contains(brand, case=False, na=False)
    brand_df = df[mentions_brand]
    if brand_df.empty:
        continue

    # Sum every engagement metric per calendar day
    daily_buckets = brand_df.groupby(pd.Grouper(key="date", freq="D"))
    daily_engagement = daily_buckets.agg(
        {column: "sum" for column in engagement_columns}
    ).reset_index()

    # Composite engagement score: likes x1, retweets x3, replies x2, plus
    # 0.5 for every thousand views
    likes_term = daily_engagement["likeCount"] * 1
    retweets_term = daily_engagement["retweetCount"] * 3
    replies_term = daily_engagement["replyCount"] * 2
    views_term = (daily_engagement["viewCount"] / 1000) * 0.5
    daily_engagement["engagement_score"] = (
        likes_term + retweets_term + replies_term + views_term
    )

    # Prophet expects the columns to be called "ds" (date) and "y" (value)
    prophet_df = daily_engagement[["date", "engagement_score"]].rename(
        columns={"date": "ds", "engagement_score": "y"}
    )
    prophet_df["brand"] = brand

    # Strip any timezone so "ds" is tz-naive
    # (presumably because Prophet rejects tz-aware dates — confirm)
    prophet_df["ds"] = prophet_df["ds"].dt.tz_localize(None)

    brand_trends.append(prophet_df)

In [None]:
# Stack the per-brand frames into one long table ("ds", "y", "brand") with a
# fresh running index.
all_brand_trends = pd.concat(brand_trends, ignore_index=True)

In [None]:
# Display the combined daily engagement table for a visual check.
all_brand_trends

# Model 

In [None]:
# Fit one Prophet model per brand and forecast 30 days past its history.
all_forecasts = []
for brand in all_brand_trends["brand"].unique():
    brand_data = all_brand_trends[all_brand_trends["brand"] == brand][["ds", "y"]]

    # Prophet raises a ValueError when given fewer than two non-NaN
    # observations; skip such brands so one sparse brand does not abort the
    # forecasts for all the others.
    if brand_data["y"].notna().sum() < 2:
        print(f"Skipping {brand}: not enough data points to fit a forecast")
        continue

    model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=True,
        daily_seasonality=False,
        changepoint_prior_scale=0.05,
        seasonality_prior_scale=10,
    )
    model.fit(brand_data)

    # Generate future predictions: history dates plus 30 additional days
    future_dates = model.make_future_dataframe(periods=30)
    forecast = model.predict(future_dates)

    # Tag the rows so the per-brand forecasts can be concatenated later
    forecast["brand"] = brand
    all_forecasts.append(forecast)

In [None]:
# Stack every brand's forecast into one long table with a fresh running index.
all_forecast_data = pd.concat(all_forecasts, ignore_index=True)

# Visualize trends for all brands

In [None]:
# Mark each forecast row as "actual" when that same brand has an observation
# on that date, otherwise as "forecasted". The match is on the (brand, ds)
# pair, not on the date alone: brands can have different history ranges, and a
# date-only check would label one brand's future dates "actual" just because
# another brand has history on them.
observed_pairs = pd.MultiIndex.from_frame(all_brand_trends[["brand", "ds"]])
forecast_pairs = pd.MultiIndex.from_frame(all_forecast_data[["brand", "ds"]])
all_forecast_data["type"] = "forecasted"
all_forecast_data.loc[forecast_pairs.isin(observed_pairs), "type"] = "actual"

# Visualize trends with date range selector.
# NOTE: the plotted value is always the model estimate "yhat"; the "actual"
# line style marks dates covered by observed history, not raw observed values.
fig = px.line(
    all_forecast_data,
    x="ds",
    y="yhat",
    color="brand",
    line_dash="type",  # Solid/dashed split between history and forecast dates
    title="Trend Forecast for All Brands",
    labels={
        "ds": "Date",
        "yhat": "Engagement",
        "brand": "Brand",
        "type": "Data Type",
    },
)

# Update layout to include quick range-selector buttons and a range slider
fig.update_layout(
    xaxis=dict(
        title="Date",
        rangeselector=dict(
            buttons=[
                {"count": 7, "label": "Last 7 Days", "step": "day", "stepmode": "backward"},
                {"count": 30, "label": "Last 30 Days", "step": "day", "stepmode": "backward"},
                {"count": 90, "label": "Last 90 Days", "step": "day", "stepmode": "backward"},
                {"step": "all", "label": "All Time"},
            ]
        ),
        rangeslider=dict(visible=True),  # Add a range slider below the graph
        type="date",
    ),
    yaxis_title="Engagement",
    legend_title="Brand",
    template="plotly_white",
)

fig.show()


In [None]:
# Print a simple trend-strength score per brand: how far the mean of the last
# 30 predicted values sits from the brand's overall predicted mean, measured
# in standard deviations of the predictions.
print("\nTrending Brands:")

for brand in all_forecast_data["brand"].unique():
    predicted = all_forecast_data.loc[all_forecast_data["brand"] == brand, "yhat"]

    # Calculate trend strength over the last 30 predicted days
    recent_mean = predicted.iloc[-30:].mean()
    trend_strength = (recent_mean - predicted.mean()) / predicted.std()
    print(f"{brand.capitalize()}: {trend_strength:.2f}")