In [2]:
from dataclasses import dataclass


@dataclass
class Config:
    """Notebook-wide settings, read as class attributes (e.g. ``Config.MAX_TOKENS``)."""

    # None of the attributes below is annotated, so @dataclass generates no
    # fields for them; they are plain class-level constants.

    # Passed as `max_tokens` to the NER / sentiment / emotion helpers.
    MAX_TOKENS = 512

    # The brand under analysis.
    BRAND = 'United Airlines'

    # NOTE(review): 'United' appears in this list although BRAND is
    # 'United Airlines' -- confirm whether it is meant as a brand alias.
    COMPETITOR_BRANDS = [
        'Southwest Airlines',
        'Spirit Airlines',
        'American Airlines',
        'United',
        'Alaska Airlines',
        'JetBlue',
    ]

    # Generic industry terms included in brand tagging alongside the brand names.
    IMPLICIT_IDENTIFIERS = [
        'Airline',
        'Airlines',
        'General Aviation',
        'FAA',
        'Boeing',
    ]

# Data Cleaning

In [None]:
from utils.data_cleaning import DataCleaning


def data_cleaning(path):
    """Load the data at ``path`` and run it through the DataCleaning steps.

    The steps run in this order: ``read_data``, ``base_filtering``,
    ``add_ner_feature`` for 'title' then 'body', ``add_sentiment_feature`` for
    'title' then 'body', and finally ``add_brands`` on 'title'.

    Args:
        path: Location of the input data, handed unchanged to
            ``DataCleaning.read_data``.

    Returns:
        Whatever ``DataCleaning.add_brands`` returns (presumably the enriched
        DataFrame -- confirm against utils.data_cleaning).
    """
    frame = DataCleaning.base_filtering(df=DataCleaning.read_data(path))

    # All NER calls run first, then all sentiment calls; within each step the
    # title is processed before the body.
    for add_feature in (DataCleaning.add_ner_feature, DataCleaning.add_sentiment_feature):
        for text_feature in ('title', 'body'):
            frame = add_feature(df=frame, feature=text_feature, max_tokens=Config.MAX_TOKENS)

    # Brand tagging looks only at the title, using the brand itself, its
    # competitors, and the implicit identifiers as the terms to filter on.
    brand_terms = [Config.BRAND, *Config.COMPETITOR_BRANDS, *Config.IMPLICIT_IDENTIFIERS]
    return DataCleaning.add_brands(df=frame, feature='title', brands_to_filter=brand_terms)


# Run the cleaning pipeline on the social parquet file; the result is kept in
# `data_df`, which the analysis cells below read.
data_df = data_cleaning(
    path='data-science/social/challenge-social-00000.snappy.parquet',
)
# Bare last expression so the frame is rendered as the cell output.
data_df

# Sentiment Analysis Over Time

In [None]:
from utils.sentiment_analysis import SentimentAnalysis


# Hourly view of title sentiment.
# The timestamp format includes the hour: with hourly resampling, a date-only
# format ('%Y-%m-%d') would render every bucket of the same day identically.
# NOTE(review): assumes `strftime` is applied to the resampled timestamps --
# confirm in utils.sentiment_analysis.
blog_title_sentiment_ts = SentimentAnalysis.sentiment_graph(
    brands_df=data_df,
    feature='title_sentiment',
    resample_by='H',
    strftime='%Y-%m-%d %H:%M',
)
SentimentAnalysis.plot(blog_title_sentiment_ts, title='Title Sentiment Analysis Over Time', feature='title')

In [None]:
# Daily view of title sentiment: resampled by day ('D'), with a date-only
# timestamp format.
blog_title_sentiment_ts = SentimentAnalysis.sentiment_graph(
    brands_df=data_df,
    feature='title_sentiment',
    resample_by='D',
    strftime='%Y-%m-%d',
)
SentimentAnalysis.plot(
    blog_title_sentiment_ts,
    title='Title Sentiment Analysis Over Time',
    feature='title',
)

In [None]:
# Daily view of body sentiment.
# Note: the variable keeps the name `blog_title_sentiment_ts` even though it
# is built from 'body_sentiment' here.
blog_title_sentiment_ts = SentimentAnalysis.sentiment_graph(
    brands_df=data_df,
    feature='body_sentiment',
    resample_by='D',
    strftime='%Y-%m-%d',
)
SentimentAnalysis.plot(
    blog_title_sentiment_ts,
    title='Body Sentiment Analysis Over Time',
    feature='body',
)

# Most Common Topics

In [None]:
from utils.topics import Topics


# Step 1: restrict the cleaned data to negative sentiment on the body.
negative_body_sentiments_df = Topics.filter_sentiment(
    df=data_df,
    feature='body',
    sentiment='negative',
)

# Step 2: collect the topics from the body of those rows.
negative_body_topics = Topics.all_topics(
    df=negative_body_sentiments_df,
    feature='body',
)

# Step 3: derive the top topics, filtered by negative sentiment.
top_negative_body_topics = Topics.top_topics(
    topics=negative_body_topics,
    filter_by_sentiment='negative',
)

# Step 4: plot with top_k=20.
Topics.plot(
    df=top_negative_body_topics,
    title='Most Common Topics in Body Negative Sentiment',
    top_k=20,
)


# Emotion Analysis

In [None]:
from utils.emotion_analysis import EmotionsAnalysis


# Select rows by negative sentiment on the body.
negative_blogs = EmotionsAnalysis.get_sentiment(
    df=data_df,
    feature='body',
    sentiment='negative',
)

# Emotions are computed on the title, while the selection above used the body.
# NOTE(review): confirm the body/title mix is intentional.
negative_blogs = EmotionsAnalysis.add_lg_emotions(
    df=negative_blogs,
    feature='title',
    max_tokens=Config.MAX_TOKENS,
)

# Resample by day ('D') with a date-only timestamp format.
negative_blogs = EmotionsAnalysis.resample(
    df=negative_blogs,
    by='D',
    strftime='%Y-%m-%d',
)

# Keep only what `filter_negative_emotions` retains before plotting.
negative_blogs = EmotionsAnalysis.filter_negative_emotions(df=negative_blogs)

EmotionsAnalysis.plot(
    df=negative_blogs,
    title='Title Emotions Over Time',
)

In [None]:
from utils.brand_sentiment import BrandSentiment


# Candidate values for `feature`: 'title_sentiment', 'body_sentiment'.
# All brands: every competitor entry plus the brand itself, resampled by day.
brand_sentiment = BrandSentiment.resample(
    BrandSentiment.brand_filtering(
        data_df,
        brands=[*Config.COMPETITOR_BRANDS, Config.BRAND],
        feature='body_sentiment',
    ),
    by='D',
)
BrandSentiment.plot(
    brand_sentiment,
    title='All Brands Body Sentiment Over Time',
)

In [None]:
# Candidate values for `feature`: 'title_sentiment', 'body_sentiment'.
# Single-brand view, resampled by day.
# NOTE(review): this filters on the literal 'United' rather than Config.BRAND
# ('United Airlines') -- confirm which label the brand tagging produces.
brand_sentiment = BrandSentiment.resample(
    BrandSentiment.brand_filtering(
        data_df,
        brands=['United'],
        feature='body_sentiment',
    ),
    by='D',
)
BrandSentiment.plot(
    brand_sentiment,
    title='Brand Body Sentiment Over Time',
)

In [None]:
# Candidate values for `feature`: 'title_sentiment', 'body_sentiment'.
# Competitor view: only the entries of Config.COMPETITOR_BRANDS, resampled by day.
brand_sentiment = BrandSentiment.resample(
    BrandSentiment.brand_filtering(
        data_df,
        brands=Config.COMPETITOR_BRANDS,
        feature='body_sentiment',
    ),
    by='D',
)
BrandSentiment.plot(
    brand_sentiment,
    title='Competitor Brands Body Sentiment Over Time',
)

In [None]:
# Implicit-identifier view: filters on Config.IMPLICIT_IDENTIFIERS, resampled
# by day ('D').
# The plot title names the implicit identifiers; it previously repeated the
# competitor-brands title from the cell above, mislabelling this chart.
brand_sentiment = BrandSentiment.brand_filtering(
    data_df,
    brands=Config.IMPLICIT_IDENTIFIERS,
    feature='body_sentiment',
)
brand_sentiment = BrandSentiment.resample(brand_sentiment, by='D')
BrandSentiment.plot(brand_sentiment, title='Implicit Identifiers Body Sentiment Over Time')