# Initial Exploration

Here we will do some quick EDA to determine what the plan will be 
to tackle the questions on the assignment.

In [None]:
import pandas as pd

# Load the blog dataset from the Parquet file. The path is relative, so it is
# resolved against the directory the kernel was started in.
blog_df = pd.read_parquet('./data-science/blog/challenge-blog-00000.snappy.parquet')
# Preview the first rows.
blog_df.head()

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too).
blog_df.describe(include='all')

In [None]:
# Column dtypes and non-null counts.
blog_df.info()

In [None]:
# Number of posts per language code.
blog_df.language.value_counts()

# Create a Plan

The main set of tasks are the following:
- Analyze negative social media posts
  - Prioritize negative topics "Which fire to put out"
- Common topics between brand and competitors
  - Differentiating factors between brand and competitors
- Identify posts that implicitly reference brand
- Identify trends before becoming obvious

## ID trends before becoming obvious
- Use sentiment analysis on title, text
- Use NER, find most common brand
- For each brand, find sentiment/emotions over time 
  - For explicit brands
  - For implicit brands
- Graph of brand's time-range vs num_negative_posts   

## Some Considerations
- Translate non-en to english
- Use a model to run NER on the body and title
- Use a model to run Sentiment Analysis on the body and title
- Change date to datetime
- Do topic modeling to get most common words


# Limitations
- Only English data will be considered due to computation and time constraints
  - Translating the non-English posts would take too long: translating just the titles took more than 45 minutes with a small model.
- Sentiment Analysis and Emotion models have an input limit of 512 tokens. Normally I would chunk the text and average the scores over the chunks.
  - However, due to computation and time constraints, I will simply truncate each text to its first 512 characters.

---

# Experiment with Hugging Face Models

In [3]:
from __future__ import annotations
from dataclasses import dataclass

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM



@dataclass
class TranslationModel:
    """Namespace holding the tokenizer and seq2seq model for "alirezamsh/small100".

    Both attributes are un-annotated class attributes, so ``@dataclass`` creates
    no fields for them. They are loaded once, when the class body executes, and
    are used as ``TranslationModel.tokenizer`` / ``TranslationModel.model``.
    """
    # Tokenizer paired with the translation model below.
    tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
    # Sequence-to-sequence model used for generation.
    model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")


def translate_to_en(text):
    """Translate ``text`` to English using the shared ``TranslationModel``.

    Args:
        text: Input passed straight to the tokenizer.

    Returns:
        The result of ``batch_decode`` on the generated tokens, with special
        tokens skipped.
    """
    tokenizer = TranslationModel.tokenizer
    tokenizer.tgt_lang = "en"  # type: ignore
    model_inputs = tokenizer(text, return_tensors="pt")
    output_ids = TranslationModel.model.generate(**model_inputs)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Smoke test: translate one short non-English sentence.
sentence = "hola como estan"
translate_to_en(sentence)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


@dataclass
class NERModel:
    """Namespace holding a shared NER pipeline for "Babelscape/wikineural-multilingual-ner".

    ``nlp`` is an un-annotated class attribute, so ``@dataclass`` creates no
    field for it; the pipeline is built once when the class body executes and
    is used as ``NERModel.nlp(text)``.
    """
    nlp = pipeline(
        "ner",
        model=AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner"),
        tokenizer=AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner"),
        # `grouped_entities=True` is deprecated; the "simple" aggregation strategy
        # is its replacement and still yields `entity_group` / `word` keys.
        aggregation_strategy="simple",
    )
    


def get_ner_properties(text):
    """Run the shared NER pipeline on ``text`` and return its result.

    The model distinguishes four entity types: location (LOC), organization
    (ORG), person (PER) and miscellaneous (MISC).
    """
    return NERModel.nlp(text)


# Smoke test: run NER on a sentence containing a person and a location.
example = "My name is Wolfgang and I live in Berlin"
get_ner_properties(text=example)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline


@dataclass
class SentimentTask:
    """Namespace holding a shared "sentiment-analysis" pipeline (FinancialBERT, 3 labels).

    ``model`` is an un-annotated class attribute, so ``@dataclass`` creates no
    field for it; the pipeline is built once when the class body executes and
    is used as ``SentimentTask.model(text)``.
    """
    model = pipeline(
        "sentiment-analysis", 
        model=BertForSequenceClassification.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis",num_labels=3), # type: ignore
        tokenizer=BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis"),
    ) 


@dataclass
class EmotionTask:
    """Namespace holding a shared emotion text-classification pipeline (DistilBERT).

    ``model`` is an un-annotated class attribute built once when the class body
    executes; it is used as ``EmotionTask.model(text)``.
    """
    # return_all_scores=True asks the pipeline for a score per label instead of
    # only the top label.
    model = pipeline("text-classification", model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)
    
    
@dataclass
class EmotionTaskLarge:
    """Namespace holding a shared emotion pipeline based on the GoEmotions RoBERTa model.

    ``model`` is an un-annotated class attribute built once when the class body
    executes; it is used as ``EmotionTaskLarge.model(text)``.
    """
    # return_all_scores=True asks for a score per label. Code elsewhere in this
    # notebook indexes the result with [0], so the result shape produced by this
    # flag must be kept.
    model = pipeline("text-classification", model='SamLowe/roberta-base-go_emotions', return_all_scores=True)

# Add Columns for Sentiment and Emotion

In [None]:
# Extract English and non-English texts first

# Boolean mask: True for posts whose language code is 'en'.
en_blogs = (blog_df.language == 'en')
# Take explicit copies: columns are added to these frames later, and writing to
# a bare slice of `blog_df` triggers pandas' SettingWithCopyWarning.
en_blog_df = blog_df.loc[en_blogs].copy()
non_en_blog_df = blog_df.loc[~en_blogs].copy()

In [None]:
# Character cap applied to bodies before NER, mirroring the 512-character
# truncation this notebook uses for the sentiment and emotion models.
NER_BODY_MAX_CHARS = 512


def _ner_or_empty(text):
    """Run NER on ``text``; return an empty list for missing or empty values."""
    if not isinstance(text, str) or not text:
        return []
    return get_ner_properties(text)


ner_titles = en_blog_df.title.map(_ner_or_empty)
# Use the `body` column here (not `title`) so `ner_body` really describes the
# post body. Bodies are truncated first to keep the input within model limits.
ner_body = en_blog_df.body.map(
    lambda body: _ner_or_empty(body[:NER_BODY_MAX_CHARS] if isinstance(body, str) else body)
)

en_blog_df['ner_title'] = ner_titles
en_blog_df['ner_body'] = ner_body

In [None]:
# Interpret the publish timestamps as seconds since the epoch and store them as datetimes.
en_blog_df['publish_date'] = pd.to_datetime(en_blog_df['publish_date'], unit='s')

In [None]:
# WARNING: Takes around 25min to run
MAX_TOKENS = 512

def truncate(text, max_tokens):
    """Return the first ``max_tokens`` characters of ``text``.

    Despite the parameter name, the cut is made on characters, not on model
    tokens.

    Args:
        text: String to shorten. ``None`` or a float NaN is treated as missing.
        max_tokens: Maximum number of characters to keep.

    Returns:
        The shortened string, or ``''`` when ``text`` is missing.
    """
    # `text != text` is only true for NaN, which pandas uses for missing values.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return text[:max_tokens]
    
# Keep only posts that have a body. One null check is enough; `.copy()` gives an
# independent frame so columns can be added to it without a SettingWithCopyWarning.
en_blog_df = en_blog_df[en_blog_df.body.notna()].copy()

# Titles may be missing, hence the `or ''` fallback before truncation.
blog_title_sentiment = en_blog_df.title.map(lambda title: SentimentTask.model(truncate(title or '', max_tokens=MAX_TOKENS)))
blog_title_emotion = en_blog_df.title.map(lambda title: EmotionTask.model(truncate(title or '', max_tokens=MAX_TOKENS)))

# Bodies go through the same `truncate` helper as titles.
blog_body_sentiment = en_blog_df.body.map(lambda body: SentimentTask.model(truncate(body, max_tokens=MAX_TOKENS)))
blog_body_emotion = en_blog_df.body.map(lambda body: EmotionTask.model(truncate(body, max_tokens=MAX_TOKENS)))

In [None]:
# Attach the raw pipeline outputs to the frame, one column per (field, task) pair.
for column_name, column_values in (
    ('title_sentiment', blog_title_sentiment),
    ('title_emotion', blog_title_emotion),
    ('body_sentiment', blog_body_sentiment),
    ('body_emotion', blog_body_emotion),
):
    en_blog_df[column_name] = column_values

In [None]:
def _top_label(sentiment):
    """Return the label of the first prediction in a pipeline result.

    Values that are already plain label strings are returned unchanged, so this
    cell can be re-run without raising.
    """
    if isinstance(sentiment, str):
        return sentiment
    return sentiment[0]['label']


# Replace each raw pipeline result with just its predicted label.
en_blog_df['title_sentiment'] = en_blog_df['title_sentiment'].map(_top_label)
en_blog_df['body_sentiment'] = en_blog_df['body_sentiment'].map(_top_label)

# Finding Brands

The assignment doesn't give us the brand. However, we can look at the top brands that appear and keep the choice open, so that any
task can be recalculated for any subset of brands.

To do this, we will use a pretrained Named Entity Recognition (NER) model to give us the list of possible brands that appear in each text.

In [None]:
def filter_organizations(entities):
    """Return the ``word`` of every entity whose ``entity_group`` is ``'ORG'``.

    Args:
        entities: Iterable of dicts with ``'word'`` and ``'entity_group'`` keys.

    Returns:
        List of words, in the order the matching entities appear.
    """
    organization_words = []
    for entity in entities:
        if entity['entity_group'] == 'ORG':
            organization_words.append(entity['word'])
    return organization_words

# Organization names recognised in each post title (one list per row).
en_blog_organizations = en_blog_df['ner_title'].map(filter_organizations)
en_blog_organizations

In [None]:
# Tally of how many times each organization name appears across all titles.
all_organizations = {}


def add_organization(org, all_organizations):
    """Increment the count for ``org`` in ``all_organizations`` (mutated in place)."""
    all_organizations[org] = all_organizations.get(org, 0) + 1
    return None


# Walk every title's organization list, skipping empty names.
for title_orgs in en_blog_organizations:
    for org_name in title_orgs:
        if org_name:
            add_organization(org_name, all_organizations)

# Most frequent organizations first; ties keep first-seen order.
organizations_with_most_counts_titles = sorted(
    all_organizations.items(),
    key=lambda name_and_count: name_and_count[1],
    reverse=True,
)
organizations_with_most_counts_titles[:10]

# Get most common brands

From the competitor brands found in the data, we can create a list that will help us compare and contrast
in the future. We can just assume the brand and competitors for now.

In [None]:
# Brand under analysis. This is an assumption; the assignment does not name one.
BRAND = 'United Airlines'

# Organization names treated as competitors of BRAND.
# NOTE(review): 'United' is listed here although BRAND is 'United Airlines' —
# confirm whether it should be an alias of BRAND rather than a competitor.
COMPETITOR_BRANDS = [
    'Southwest Airlines',
    'Spirit Airlines',
    'American Airlines',
    'United',
    'Alaska Airlines',
    'JetBlue'
]

# Generic organization names kept as implicit (non-brand-specific) references.
IMPLICIT_IDENTIFIERS = [
    'Airline',
    'Airlines',
    'General Aviation',
    'FAA',
    'Boeing'
]

# Add Brand Sentiment

In [None]:
def check_brands(orgs):
    """Keep the entries of ``orgs`` that are the brand, a competitor, or an implicit identifier.

    Order and duplicates in ``orgs`` are preserved.
    """
    tracked_names = (*COMPETITOR_BRANDS, BRAND, *IMPLICIT_IDENTIFIERS)
    return [org for org in orgs if org in tracked_names]

# `en_blog_organizations` is a standalone Series derived from `en_blog_df.ner_title`,
# not a column of `en_blog_df`, so map over it directly. The column assignment
# below aligns on the index the two objects share.
brands = en_blog_organizations.map(check_brands)
en_blog_df['brands'] = brands

In [None]:
# True for rows whose `brands` list has a length other than zero.
filter_no_brands = en_blog_df['brands'].str.len().ne(0)
blog_brands = en_blog_df.loc[filter_no_brands]

# Negative Posts Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="darkgrid")


class SentimentAnalysis:
    """Static helpers that count sentiment labels over time and plot them."""

    @staticmethod
    def sentiment_graph(brands_df, feature, resample_by, strftime='%Y-%m-%d'):
        """Count posts per sentiment label in each resampled time bucket.

        Args:
            brands_df: Frame with a datetime ``publish_date`` column and the
                sentiment label column named by ``feature``.
            feature: Name of the column holding 'negative' / 'neutral' /
                'positive' labels.
            resample_by: Resampling frequency passed to ``resample``.
            strftime: Format used to turn the bucket timestamps into strings.

        Returns:
            A frame with columns ``publish_date`` (formatted string),
            ``feature`` (the count for that bucket) and ``label``; rows are
            ordered negative, neutral, positive.
        """
        indexed = brands_df.set_index('publish_date')

        per_label_counts = []
        for label in ('negative', 'neutral', 'positive'):
            label_mask = indexed[feature] == label
            counts = indexed.loc[label_mask, feature].resample(resample_by).count().reset_index()
            counts['label'] = label
            per_label_counts.append(counts)

        # Round-trip through the index so the repeated per-label row numbers
        # left by concat are replaced with a fresh 0..n-1 index.
        combined = pd.concat(per_label_counts).set_index('publish_date').reset_index()
        combined.publish_date = combined.publish_date.dt.strftime(strftime)

        return combined

    @staticmethod
    def plot(df, title, x, y):
        """Draw a point plot of ``y`` against ``x`` with one coloured series per label.

        Args:
            df: Frame containing ``x``, ``y`` and a ``label`` column.
            title: Title shown above the axes.
            x: Column used for the x axis.
            y: Column used for the y axis.
        """
        label_colors = {'negative': 'red', 'neutral': 'gray', 'positive': 'green'}

        _, axes = plt.subplots(figsize=(10, 4))
        axes = sns.pointplot(data=df, x=x, y=y, hue='label', palette=label_colors, ax=axes)
        axes.tick_params(axis='x', labelrotation=45)
        axes.set_title(title)

        plt.show()
        return None


# Hourly buckets need the hour in the x-axis label. With a date-only format
# every hour of a day shares one label, so the point plot merges them into a
# single aggregated point per day.
blog_title_sentiment_ts = SentimentAnalysis.sentiment_graph(brands_df=blog_brands, feature='title_sentiment', resample_by='H', strftime='%Y-%m-%d %H:00')
SentimentAnalysis.plot(blog_title_sentiment_ts, title='Title Sentiment Analysis Over Time', x='publish_date', y='title_sentiment')

In [None]:
# Title sentiment counts per day for posts that mention a tracked brand.
blog_title_sentiment_ts = SentimentAnalysis.sentiment_graph(brands_df=blog_brands, feature='title_sentiment', resample_by='D', strftime='%Y-%m-%d')
SentimentAnalysis.plot(blog_title_sentiment_ts, title='Title Sentiment Analysis Over Time', x='publish_date', y='title_sentiment')

In [None]:
# Body sentiment counts per day for posts that mention a tracked brand.
# Note: the variable keeps the name `blog_title_sentiment_ts` even though it
# holds body sentiment here.
blog_title_sentiment_ts = SentimentAnalysis.sentiment_graph(brands_df=blog_brands, feature='body_sentiment', resample_by='D', strftime='%Y-%m-%d')
SentimentAnalysis.plot(blog_title_sentiment_ts, title='Body Sentiment Analysis Over Time', x='publish_date', y='body_sentiment')

In [None]:
# Hourly buckets need the hour in the x-axis label. With a date-only format
# every hour of a day shares one label, so the point plot merges them into a
# single aggregated point per day.
# Note: the variable keeps the name `blog_title_sentiment_ts` even though it
# holds body sentiment here.
blog_title_sentiment_ts = SentimentAnalysis.sentiment_graph(brands_df=blog_brands, feature='body_sentiment', resample_by='H', strftime='%Y-%m-%d %H:00')
SentimentAnalysis.plot(blog_title_sentiment_ts, title='Body Sentiment Analysis Over Time', x='publish_date', y='body_sentiment')

## Negative Topics

- Within negative topics, get the emotions of title and body over time

In [None]:
# Keep only brand posts that have a body, then build one mask per sentiment label.
blog_brands = blog_brands[~blog_brands.body.isna()]
negative_sentiment = blog_brands.body_sentiment == 'negative'
positive_sentiment = blog_brands.body_sentiment == 'positive'


# Copy the negative posts: new columns are added to this frame afterwards, and
# writing to a bare slice triggers pandas' SettingWithCopyWarning.
negative_blogs = blog_brands[negative_sentiment].copy()
negative_blogs

In [None]:
def _large_emotion_scores(text):
    """Score ``text`` with the larger emotion model."""
    return EmotionTaskLarge.model(text)


# Titles are scored whole; bodies are cut to their first MAX_TOKENS characters.
negative_blogs['title_emotion_lg'] = negative_blogs['title'].map(lambda title: _large_emotion_scores(title))
negative_blogs['body_emotion_lg'] = negative_blogs['body'].map(lambda body: _large_emotion_scores(body[:MAX_TOKENS]))

In [None]:
def _scores_by_label(emotions):
    """Flatten a pipeline result into a ``{label: score}`` dict.

    Values that are already dicts are returned unchanged, so this cell can be
    re-run without raising.
    """
    if isinstance(emotions, dict):
        return emotions
    return {d['label']: d['score'] for d in emotions[0]}


negative_blogs['title_emotion_lg'] = negative_blogs['title_emotion_lg'].map(_scores_by_label)
negative_blogs['body_emotion_lg'] = negative_blogs['body_emotion_lg'].map(_scores_by_label)

In [None]:
# sorted(negative_blogs.title_emotion_lg[407].items(), key=lambda x: -x[1])
# Copy the two columns so the rewrite below acts on an independent frame
# instead of a slice of `negative_blogs` (avoids SettingWithCopyWarning).
negative_blogs_title_emotions = negative_blogs[['publish_date', 'title_emotion_lg']].copy()

# Turn each {label: score} dict into a concrete list of (label, score) pairs, so
# the column holds plain lists rather than dict views.
negative_blogs_title_emotions['title_emotion_lg'] = negative_blogs_title_emotions['title_emotion_lg'].map(lambda emotions: list(emotions.items()))


In [None]:
# Give every element of each `title_emotion_lg` entry its own row
# (the other columns are repeated for each of them).
negative_blogs_title_emotions = negative_blogs_title_emotions.explode('title_emotion_lg')
negative_blogs_title_emotions

In [None]:
# Split each pair stored in `title_emotion_lg` into its first and second element.
emotion_pairs = negative_blogs_title_emotions['title_emotion_lg']
hue = emotion_pairs.map(lambda pair: pair[0])
emotion_percents = emotion_pairs.map(lambda pair: pair[1])

negative_blogs_title_emotions['hue'] = hue
negative_blogs_title_emotions['emotion_percents'] = emotion_percents

# The combined column is no longer needed once it has been split.
negative_blogs_title_emotions = negative_blogs_title_emotions.drop('title_emotion_lg', axis=1)
negative_blogs_title_emotions

In [None]:
# Daily mean score per emotion; days with no posts get 0.
# `emotion_percents` is selected before resampling so that `mean()` only sees
# the numeric column. Averaging the string `hue` grouping column as well can
# fail on newer pandas versions.
negative_blogs_title_emotions_plt = (
    negative_blogs_title_emotions
    .set_index('publish_date')
    .groupby('hue')['emotion_percents']
    .resample('D')
    .mean()
    .fillna(0)
    .reset_index()
)
# Format the bucket timestamps as date strings for the x axis.
negative_blogs_title_emotions_plt['publish_date'] = negative_blogs_title_emotions_plt['publish_date'].dt.strftime('%Y-%m-%d')
negative_blogs_title_emotions_plt.head(20)

In [None]:
# Emotion labels treated as negative for the plot that follows.
negative_emotions = [
    'anger', 'annoyance', 'confusion', 'disappointment',
    'disapproval', 'disgust', 'embarrassment', 'fear',
    'grief', 'nervousness', 'remorse', 'sadness',
]

# Row mask selecting only those labels in the daily-mean frame.
emotions_filter = negative_blogs_title_emotions_plt.hue.isin(negative_emotions)

In [None]:
# Daily mean score of each negative emotion, one series per emotion label.
negative_emotion_trends = negative_blogs_title_emotions_plt[emotions_filter]

_, ax = plt.subplots(figsize=(20, 6))
ax = sns.pointplot(data=negative_emotion_trends, x='publish_date', y='emotion_percents', hue='hue', ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Emotions Over Time')

# Spread the legend across the top of the axes in three columns.
ax.legend(loc="upper left", mode="expand", ncol=3)
plt.show()

# Brand Sentiment Over Time
- group brands, counts of sentiment over time for titles and body
- match group with pos/neg sentiment

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="darkgrid")


class BrandSentiment:
    """Static helpers for per-brand sentiment counts over time."""

    @staticmethod
    def brand_filtering(blog_brands, brands, feature):
        """Select the posts whose first listed brand is one of ``brands``.

        Args:
            blog_brands: Frame with ``brands`` (non-empty list per row),
                ``publish_date`` and the sentiment column named by ``feature``.
            brands: Brand names to keep.
            feature: Name of the sentiment column; it is renamed to ``sentiment``.

        Returns:
            A frame with columns ``sentiment``, ``brands`` (a single name) and
            ``publish_date``, restricted to the requested brands.
        """
        selected = blog_brands[[feature, 'brands', 'publish_date']].rename(columns={feature: 'sentiment'})

        # Only the first entry of each post's brand list is considered.
        selected = selected.assign(brands=selected['brands'].map(lambda names: names[0]))
        return selected[selected['brands'].isin(brands)]

    @staticmethod
    def resample(brand_sentiment, by):
        """Count posts per (brand, sentiment) pair in each time bucket of size ``by``.

        Returns:
            A frame with ``brands``, ``sentiment``, ``publish_date`` (formatted
            as ``YYYY-MM-DD``) and ``count`` columns.
        """
        indexed = brand_sentiment[['publish_date', 'brands', 'sentiment']].set_index('publish_date')
        grouped = indexed.groupby(['brands', 'sentiment'])
        counts = grouped.resample(by).agg(count=('sentiment', 'count')).reset_index()

        counts.publish_date = counts.publish_date.dt.strftime('%Y-%m-%d')
        return counts

    @staticmethod
    def plot(df, title, figsize=(10, 4)):
        """Draw a point plot of ``count`` over ``publish_date``, one coloured series per sentiment.

        Args:
            df: Frame with ``publish_date``, ``count`` and ``sentiment`` columns.
            title: Title shown above the axes.
            figsize: Figure size passed to ``plt.subplots``.
        """
        sentiment_colors = {'negative': 'red', 'neutral': 'gray', 'positive': 'green'}

        _, axes = plt.subplots(figsize=figsize)
        axes = sns.pointplot(data=df, x='publish_date', y='count', hue='sentiment', palette=sentiment_colors, ax=axes)
        axes.tick_params(axis='x', labelrotation=45)
        axes.set_title(title)

        plt.show()
        return None


In [None]:
# Daily body sentiment for the brand and its competitors combined.
# `feature` may be 'title_sentiment' or 'body_sentiment'.
brand_sentiment = BrandSentiment.resample(
    BrandSentiment.brand_filtering(blog_brands, brands=COMPETITOR_BRANDS + [BRAND], feature='body_sentiment'),
    by='D',
)
BrandSentiment.plot(brand_sentiment, title='All Brands Body Sentiment Over Time')

In [None]:
# Daily body sentiment for the brand alone.
# `feature` may be 'title_sentiment' or 'body_sentiment'.
# NOTE(review): this filters on the literal 'United' rather than on BRAND
# ('United Airlines') — confirm which name(s) should identify the brand.
brand_sentiment = BrandSentiment.brand_filtering(blog_brands, brands=['United'], feature='body_sentiment')
brand_sentiment = BrandSentiment.resample(brand_sentiment, by='D')
BrandSentiment.plot(brand_sentiment, title='Brand Body Sentiment Over Time')


In [None]:
# Daily body sentiment for the competitor brands only.
# `feature` may be 'title_sentiment' or 'body_sentiment'.
brand_sentiment = BrandSentiment.resample(
    BrandSentiment.brand_filtering(blog_brands, brands=COMPETITOR_BRANDS, feature='body_sentiment'),
    by='D',
)
BrandSentiment.plot(brand_sentiment, title='Competitor Brands Body Sentiment Over Time')

In [None]:
# Daily body sentiment for posts matched through the implicit identifiers.
brand_sentiment = BrandSentiment.brand_filtering(blog_brands, brands=IMPLICIT_IDENTIFIERS, feature='body_sentiment')
brand_sentiment = BrandSentiment.resample(brand_sentiment, by='D')
# The title names the implicit identifiers; this plot does not show competitor brands.
BrandSentiment.plot(brand_sentiment, title='Implicit Identifiers Body Sentiment Over Time')