# Initial Exploration

Here we do some quick exploratory data analysis (EDA) to decide how
to tackle the questions in the assignment.

In [None]:
import pandas as pd

# Read the blog parquet file into a DataFrame and preview its first rows.
BLOG_PARQUET_PATH = './data-science/blog/challenge-blog-00000.snappy.parquet'
blog_df = pd.read_parquet(BLOG_PARQUET_PATH)
blog_df.head()

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
blog_df.describe(include='all')

In [None]:
# Print column dtypes, non-null counts, and memory usage.
blog_df.info()

In [None]:
# Row counts per distinct value of the `language` column.
blog_df['language'].value_counts()

# Create a Plan

The main tasks are the following:
- Analyze negative social media posts
  - Prioritize negative topics "Which fire to put out"
- Common topics between brand and competitors
  - Differentiating factors between brand and competitors
- Identify posts that implicitly reference brand
- Identify trends before they become obvious

## Identify trends before they become obvious
- Use sentiment analysis on title, text
- Use NER to find the most common brands
- For each brand, find sentiment/emotions over time 
  - For explicit brands
  - For implicit brands
- Graph of brand's time-range vs num_negative_posts   

## Some Considerations
- Translate non-English text to English
- Use a model for NER on the body and title
- Use a model for sentiment analysis on the body and title
- Change date to datetime
- Do topic modeling to get most common words


# Limitations
- The original plan was to first translate the non-English data
  - However, translation would take too long: translating just the titles took > 45 min with a small model.
  - Therefore, I will keep only the English-language rows in this assessment

---

# Experiment with Hugging Face Models

In [3]:
from __future__ import annotations
from dataclasses import dataclass

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM



@dataclass
class TranslationModel:
    """Namespace for the ``alirezamsh/small100`` translation model and tokenizer.

    Both objects are un-annotated class attributes, so they are loaded once,
    when this class body executes, and are read as ``TranslationModel.model``
    and ``TranslationModel.tokenizer`` without creating an instance.
    """

    # Seq2seq checkpoint used for generation.
    model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
    # Tokenizer loaded from the same checkpoint name.
    tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")


def translate_to_en(text):
    """Translate ``text`` to English using ``TranslationModel``.

    Args:
        text: Input handed directly to the tokenizer call.

    Returns:
        The list returned by the tokenizer's ``batch_decode`` with special
        tokens skipped.
    """
    tokenizer = TranslationModel.tokenizer
    # The target language is selected only through this attribute.
    # NOTE(review): confirm the tokenizer AutoTokenizer resolves for this
    # checkpoint actually honors `tgt_lang`.
    tokenizer.tgt_lang = "en"  # type: ignore
    model_inputs = tokenizer(text, return_tensors="pt")
    output_ids = TranslationModel.model.generate(**model_inputs)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Smoke test of the translation helper on a short Spanish sentence.
sentence = "hola como estan"
translate_to_en(text=sentence)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


@dataclass
class NERModel:
    """Namespace for the ``Babelscape/wikineural-multilingual-ner`` NER pipeline.

    ``nlp`` is an un-annotated class attribute, so the pipeline is built once,
    when this class body executes, and is read as ``NERModel.nlp`` without
    creating an instance.
    """

    nlp = pipeline(
        "ner",
        model=AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner"),
        tokenizer=AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner"),
        # Replaces the deprecated `grouped_entities=True`; "simple" is the
        # strategy that flag maps to, so sub-word tokens are still merged
        # into whole entities.
        aggregation_strategy="simple",
    )
    


def get_ner_properties(text):
    """Run the shared ``NERModel.nlp`` pipeline on ``text``.

    Args:
        text: Input handed directly to the pipeline.

    Returns:
        Whatever the pipeline returns for ``text``, unmodified.
    """
    # The model tags four entity types: location (LOC), organization (ORG),
    # person (PER) and miscellaneous (MISC).
    return NERModel.nlp(text)


# Smoke test of the NER helper on a sentence with a person and a location.
example = "My name is Wolfgang and I live in Berlin"
get_ner_properties(example)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline


@dataclass
class SentimentTask:
    """Namespace for the ``ahmedrachid/FinancialBERT-Sentiment-Analysis`` pipeline.

    ``model`` is an un-annotated class attribute, so the pipeline is built
    once, when this class body executes, and is read as
    ``SentimentTask.model`` without creating an instance.
    """

    model = pipeline(
        "sentiment-analysis",
        # The classifier head is loaded with three output labels.
        model=BertForSequenceClassification.from_pretrained(
            "ahmedrachid/FinancialBERT-Sentiment-Analysis",
            num_labels=3,
        ),  # type: ignore
        tokenizer=BertTokenizer.from_pretrained(
            "ahmedrachid/FinancialBERT-Sentiment-Analysis"
        ),
    )


@dataclass
class EmotionTask:
    """Namespace for the ``bhadresh-savani/distilbert-base-uncased-emotion`` classifier.

    ``model`` is an un-annotated class attribute, built once when this class
    body executes and read as ``EmotionTask.model``.
    """

    # NOTE(review): newer transformers releases reportedly prefer `top_k=None`
    # over `return_all_scores=True`; result ordering may differ, so verify
    # downstream code before switching.
    model = pipeline(
        "text-classification",
        model='bhadresh-savani/distilbert-base-uncased-emotion',
        return_all_scores=True,
    )
    
    
@dataclass
class EmotionTaskLarge:
    """Namespace for the ``SamLowe/roberta-base-go_emotions`` classifier.

    ``model`` is an un-annotated class attribute, built once when this class
    body executes and read as ``EmotionTaskLarge.model``.
    """

    # NOTE(review): newer transformers releases reportedly prefer `top_k=None`
    # over `return_all_scores=True`; result ordering may differ, so verify
    # downstream code before switching.
    model = pipeline(
        "text-classification",
        model='SamLowe/roberta-base-go_emotions',
        return_all_scores=True,
    )

# Add Columns for Sentiment and Emotion

In [None]:
# Extract English and non-English texts first

# Boolean mask: True for rows whose `language` value is 'en'.
en_blogs = (blog_df.language == 'en')
# Take an explicit copy: later cells write new columns into en_blog_df, and
# writing into a bare .loc slice can raise pandas' SettingWithCopyWarning and
# leaves it ambiguous whether blog_df is affected.
en_blog_df = blog_df.loc[en_blogs].copy()
# Rows in every other language (including rows where the comparison is False
# because the value is missing).
non_en_blog_df = blog_df.loc[~en_blogs]

In [None]:
# Run NER over every English title (one pipeline call per row).
ner_titles = en_blog_df.title.map(get_ner_properties)
# NOTE(review): this also maps over `title`, so `ner_body` currently holds
# title entities, not body entities. It looks like a copy-paste slip; point it
# at the body column once its name is confirmed, and check that long bodies
# fit the NER model's input limit before doing so.
ner_body = en_blog_df.title.map(get_ner_properties)

# Build a new frame instead of writing columns into a slice-derived one; this
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
en_blog_df = en_blog_df.assign(ner_title=ner_titles, ner_body=ner_body)

In [None]:
# Interpret publish_date as seconds since the Unix epoch and convert it to
# datetimes. `.assign` returns a new frame, avoiding attribute-style column
# assignment on a slice-derived DataFrame (SettingWithCopyWarning-prone).
en_blog_df = en_blog_df.assign(
    publish_date=pd.to_datetime(en_blog_df['publish_date'], unit='s')
)

In [None]:
# Upper bound passed to `truncate` before sentiment scoring.
MAX_TOKENS = 512


def truncate(text, max_tokens):
    """Return the leading ``max_tokens`` items of ``text``.

    The cut is a plain slice, so for a string the limit counts characters,
    not model tokens, despite the parameter name.

    Args:
        text: Sliceable input (a string in this notebook).
        max_tokens: Slice end index.

    Returns:
        ``text`` unchanged when it is shorter than the limit, otherwise its
        prefix of length ``max_tokens``.
    """
    clipped = text[0:max_tokens]
    return clipped
    
# Score the sentiment of every English title; missing titles become ''.
# `truncate` only limits the character count, which does not guarantee the
# tokenized input fits the model, so tokenizer-level truncation is passed as
# well (these keyword arguments are forwarded to the tokenizer).
blog_title_sentiment = en_blog_df.title.map(
    lambda title: SentimentTask.model(
        truncate(title or '', max_tokens=MAX_TOKENS),
        truncation=True,
        max_length=MAX_TOKENS,
    )
)