# Dataset Preparation

In [1]:
import pandas as pd
import numpy as np
from prophet import Prophet
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import matplotlib.pyplot as plt
import torch
import spacy
from R_files.data_handler import DataCleaner



In [2]:
# Load the raw tweet export; the path is resolved relative to the notebook's working directory.
# NOTE(review): the saved output echoes this statement, which looks like a pandas warning
# raised while parsing (possibly a mixed-dtype warning) — confirm and pin dtypes if so.
csv_path = "trend_analysis.csv"
df = pd.read_csv(csv_path)

  df = pd.read_csv('trend_analysis.csv')


In [3]:
# Trim the frame with the project-local DataCleaner helper.
# NOTE(review): drop_rows is defined in R_files.data_handler and is not visible here; the
# previews below end at index 99999, so the tuple is presumably a (start, end) row range — confirm.
rows_to_drop = (100000, 547483)
cleaner = DataCleaner(df)
df = cleaner.drop_rows(rows_to_drop)

In [4]:
# Show the last four rows to check where the trimmed frame ends.
df.tail(4)

Unnamed: 0,date,likeCount,replyCount,retweetCount,viewCount,followersCount,tweets
99996,2022-11-05 23:59:41+00:00,18.0,0.0,6,2077.480637,12529,live adam \n\n come watch \n
99997,2022-11-05 23:59:41+00:00,6.0,0.0,0,714.405037,615,franz score 31 point tonight pretty much layup...
99998,2022-11-05 23:59:41+00:00,20.0,4.0,6,1851.28308,203,officially one week away big charity stream hu...
99999,2022-11-05 23:59:41+00:00,1.0,0.0,0,191.910011,730,beautiful celebration long beach 7th annual ...


In [5]:
# Show the last hundred rows (the notebook renders a truncated view).
df.tail(100)

Unnamed: 0,date,likeCount,replyCount,retweetCount,viewCount,followersCount,tweets
99900,2022-11-05 23:59:43+00:00,186.0,12.0,23,13450.365050,1043,come across pic rob tumblr need moment
99901,2022-11-05 23:59:43+00:00,2.0,0.0,0,302.782800,10143,show fuck nonsense can not believe flash dri...
99902,2022-11-05 23:59:43+00:00,2.0,1.0,0,302.782800,44759,boy back
99903,2022-11-05 23:59:43+00:00,2.0,0.0,0,302.782800,660,date day much need today
99904,2022-11-05 23:59:43+00:00,16.0,0.0,1,1902.447461,92444,fire \n\n
...,...,...,...,...,...,...,...
99995,2022-11-05 23:59:41+00:00,4.0,0.0,0,462.916948,114,10 week
99996,2022-11-05 23:59:41+00:00,18.0,0.0,6,2077.480637,12529,live adam \n\n come watch \n
99997,2022-11-05 23:59:41+00:00,6.0,0.0,0,714.405037,615,franz score 31 point tonight pretty much layup...
99998,2022-11-05 23:59:41+00:00,20.0,4.0,6,1851.283080,203,officially one week away big charity stream hu...


# Preparing data and plotting

In [6]:
# Parse the timestamp strings into datetimes so the column can drive daily grouping later.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates

In [7]:
# Use the Apple Metal (MPS) backend when torch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

In [8]:
# Empty set reserved for brand names.
brands = set()

# Load the small English spaCy pipeline (the model package must already be installed).
nlp = spacy.load("en_core_web_sm")



In [9]:
# Brands whose mentions are counted and forecast in the rest of the notebook.
predefined_brands = [
    "nike",
    "samsung",
    "google",
    "microsoft",
    "amazon",
]

In [10]:
# Normalise to a lowercase set so duplicates collapse and matching is case-insensitive.
predefined_brands = set(map(str.lower, predefined_brands))


In [11]:
# Count, for every brand, how many tweets mention it (case-insensitive substring match).
#
# The earlier version also ran each tweet through spaCy and tested the brand against the
# ORG/PRODUCT entity texts, OR-ed with a plain substring test on the lowercased tweet.
# An entity's text is always a slice of the tweet itself, so the entity test could never
# succeed where the substring test failed: the NER pass cost a full pipeline run per tweet
# without affecting a single count. It also raised on missing (NaN) tweets.
#
# Missing tweets are treated as "no mention", matching the na=False handling used when the
# per-brand trend frames are built.
tweets_lower = df["tweets"].fillna("").astype(str).str.lower()

brand_counts = {
    # regex=False performs a literal substring test, i.e. `brand in tweet.lower()`.
    brand: int(tweets_lower.str.contains(brand, regex=False).sum())
    for brand in predefined_brands
}

In [12]:
# Report the number of matching tweets per brand, one line each.
for brand_name, mentions in brand_counts.items():
    print(f"{brand_name}: {mentions} mentions")

nike: 23 mentions
google: 110 mentions
amazon: 86 mentions
samsung: 17 mentions
microsoft: 26 mentions


In [13]:
# Build one Prophet-ready frame (columns: ds, y, brand) for every brand that is mentioned
# at least once, and collect them in `brand_trends`.
engagement_columns = ["likeCount", "retweetCount", "viewCount", "replyCount"]
brand_trends = []

for brand in predefined_brands:
    # Case-insensitive match on the tweet text; missing tweets count as "no mention".
    mentions_brand = df["tweets"].str.contains(brand, case=False, na=False)
    brand_df = df[mentions_brand]
    if brand_df.empty:
        continue

    # Sum every engagement metric per calendar day.
    daily_engagement = (
        brand_df
        .groupby(pd.Grouper(key="date", freq="D"))
        .agg({column: "sum" for column in engagement_columns})
        .reset_index()
    )

    # Composite score: likes x1, retweets x3, replies x2, views per thousand x0.5.
    daily_engagement["engagement_score"] = (
        daily_engagement["likeCount"] * 1
        + daily_engagement["retweetCount"] * 3
        + daily_engagement["replyCount"] * 2
        + (daily_engagement["viewCount"] / 1000) * 0.5
    )

    # Prophet expects the columns `ds` (timezone-naive datetime) and `y` (value).
    prophet_df = pd.DataFrame({
        "ds": daily_engagement["date"].dt.tz_localize(None),
        "y": daily_engagement["engagement_score"],
        "brand": brand,
    })

    brand_trends.append(prophet_df)

In [14]:
# Stack the per-brand frames row-wise and renumber the index from zero.
all_brand_trends = pd.concat(brand_trends, axis=0, ignore_index=True)

In [15]:
# Preview the stacked per-brand daily series (columns: ds, y, brand).
all_brand_trends

Unnamed: 0,ds,y,brand
0,2022-01-02,1.095955,nike
1,2022-01-03,0.0,nike
2,2022-01-04,44.464157,nike
3,2022-01-05,193.845795,nike
4,2022-01-06,0.0,nike
...,...,...,...
1529,2022-10-27,0.0,microsoft
1530,2022-10-28,0.0,microsoft
1531,2022-10-29,0.0,microsoft
1532,2022-10-30,0.0,microsoft


# Model 

In [16]:
# Fit one Prophet model per brand and keep its fitted + 30-day-ahead predictions.
# NOTE(review): the preview of all_brand_trends covers less than one year (2022-01 to
# 2022-10); confirm that yearly seasonality is really identifiable from that much history.
prophet_settings = dict(
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=False,
    changepoint_prior_scale=0.05,
    seasonality_prior_scale=10,
)

all_forecasts = []
# sort=False keeps the brands in order of first appearance.
for brand, brand_group in all_brand_trends.groupby("brand", sort=False):
    brand_data = brand_group[["ds", "y"]]

    model = Prophet(**prophet_settings)
    model.fit(brand_data)

    # History dates plus 30 additional daily steps.
    future_dates = model.make_future_dataframe(periods=30)
    forecast = model.predict(future_dates)
    forecast["brand"] = brand
    all_forecasts.append(forecast)

16:15:56 - cmdstanpy - INFO - Chain [1] start processing
16:15:57 - cmdstanpy - INFO - Chain [1] done processing
16:15:57 - cmdstanpy - INFO - Chain [1] start processing
16:15:57 - cmdstanpy - INFO - Chain [1] done processing
16:15:57 - cmdstanpy - INFO - Chain [1] start processing
16:15:57 - cmdstanpy - INFO - Chain [1] done processing
16:15:57 - cmdstanpy - INFO - Chain [1] start processing
16:15:57 - cmdstanpy - INFO - Chain [1] done processing
16:15:57 - cmdstanpy - INFO - Chain [1] start processing
16:15:57 - cmdstanpy - INFO - Chain [1] done processing


In [21]:
# Combine every brand's forecast frame into one long table with a fresh index.
all_forecast_data = pd.concat(all_forecasts, axis=0, ignore_index=True)

# Visualize trends for all brands

In [23]:
# Label every forecast row as covering observed history ("actual") or the future
# horizon ("forecasted").
#
# The cutoff has to be taken per brand. Testing `ds` against the dates of *all* brands
# mislabels the first forecast days of any brand whose history ends earlier than another
# brand's: those dates exist in the pooled history, so they were tagged "actual".
last_observed_by_brand = all_brand_trends.groupby("brand")["ds"].max()
brand_cutoff = all_forecast_data["brand"].map(last_observed_by_brand)

all_forecast_data["type"] = "forecasted"
all_forecast_data.loc[all_forecast_data["ds"] <= brand_cutoff, "type"] = "actual"

# Plot the model output (yhat) per brand; the dash style separates history from forecast.
fig = px.line(
    all_forecast_data,
    x="ds",
    y="yhat",
    color="brand",
    line_dash="type",  # Differentiates between actual and forecasted data
    title="Trend Forecast for All Brands",
    labels={
        "ds": "Date",
        "yhat": "Engagement",
        "brand": "Brand",
        "type": "Data Type",
    },
)

# Add quick-range buttons and a range slider to the date axis.
fig.update_layout(
    xaxis=dict(
        title="Date",
        rangeselector=dict(
            buttons=[
                {"count": 7, "label": "Last 7 Days", "step": "day", "stepmode": "backward"},
                {"count": 30, "label": "Last 30 Days", "step": "day", "stepmode": "backward"},
                {"count": 90, "label": "Last 90 Days", "step": "day", "stepmode": "backward"},
                {"step": "all", "label": "All Time"}
            ]
        ),
        rangeslider=dict(visible=True),  # Add a range slider below the graph
        type="date",
    ),
    yaxis_title="Engagement",
    legend_title="Brand",
    template="plotly_white",
)

fig.show()


In [24]:
print("\nTrending Brands:")

# For each brand, compare the mean prediction over its final 30 rows (the forecast horizon)
# with the mean over the whole series, expressed in standard deviations of yhat.
for brand, brand_forecast in all_forecast_data.groupby("brand", sort=False):
    predicted = brand_forecast["yhat"]

    last_30_days = brand_forecast.tail(30)
    overall_mean = predicted.mean()
    recent_mean = last_30_days["yhat"].mean()

    trend_strength = (recent_mean - overall_mean) / predicted.std()
    print(f"{brand.capitalize()}: {trend_strength:.2f}")


Trending Brands:
Nike: 0.84
Google: 2.86
Amazon: 1.44
Samsung: 2.67
Microsoft: -0.46
