<a href="https://colab.research.google.com/github/mortezaaghajanzadeh/Machine-learning-in-Finance/blob/main/Lecture%206/Transformer_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Install modules.**

In [None]:
!pip install transformers
!pip install wordcloud

#### **Import modules.**

In [None]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM

#### **Scrape example text.**

In [None]:
# Define URL of the page to scrape.
url = "https://www.federalreserve.gov/newsevents/speech/yellen20170926a.htm"

# Define user-agent string sent with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/90.0.4430.85"}

# Define contents of GET request.
req = Request(url, headers=headers)

# Send GET request. The context manager closes the HTTP response as soon as
# its body has been read, so the connection is not left open.
with urlopen(req) as html:
    # Parse HTML. Naming the parser explicitly keeps the parse result the same
    # no matter which optional parser libraries happen to be installed.
    soup = BeautifulSoup(html.read(), "html.parser")

# Collect the text of every <p> element on the page, skipping elements that
# contain only whitespace so that later cells are not fed empty strings.
speech = [p.text for p in soup.find_all("p") if p.text.strip()]

# Print list of speech paragraphs.
print(speech)

#### **Load transformer models and tokenizers.**

In [None]:
# Names of the pretrained models to load: one for sentiment classification,
# one for summarization.
classification_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
summarization_model_name = "t5-small"

# Tokenizer and sequence-classification model for the first name.
tokenizer_classification = AutoTokenizer.from_pretrained(
    classification_model_name
)
model_classification = AutoModelForSequenceClassification.from_pretrained(
    classification_model_name
)

# Tokenizer and sequence-to-sequence model for the second name.
tokenizer_summarization = AutoTokenizer.from_pretrained(
    summarization_model_name
)
model_summarization = AutoModelForSeq2SeqLM.from_pretrained(
    summarization_model_name
)

#### **Instantiate pipelines.**

In [None]:
# Sentiment pipeline, wired to the classification model and its tokenizer.
sentiment_classifier = pipeline(
    "sentiment-analysis",
    model=model_classification,
    tokenizer=tokenizer_classification,
)

# Summarization pipeline, wired to the seq2seq model and its tokenizer.
summarizer = pipeline(
    "summarization",
    model=model_summarization,
    tokenizer=tokenizer_summarization,
)

#### **Apply pipeline to classify and summarize text.**

In [None]:
# Define empty lists to hold results. They are rebuilt from scratch on every
# run of this cell, so re-running it does not duplicate entries.
summaries = []
sentiment_scores = []

# Iterate through speech paragraphs and apply sentiment classification and text summarization.
for idx, paragraph in enumerate(speech):
    # Sentiment classification. truncation=True clips an over-long paragraph to
    # the tokenizer's maximum length; DistilBERT cannot process inputs longer
    # than 512 tokens, so an untruncated long paragraph would abort the loop.
    sentiment_result = sentiment_classifier(paragraph, truncation=True)[0]

    # Recover label and score.
    sentiment_label = sentiment_result["label"]
    sentiment_score = sentiment_result["score"]

    # Change sign if negative, so the score's sign carries the label.
    if sentiment_label == "NEGATIVE":
        sentiment_score *= -1

    # Append score to list.
    sentiment_scores.append(sentiment_score)

    # Text summarization. The input is truncated in the same way so it stays
    # within the tokenizer's maximum length; do_sample=False keeps generation
    # deterministic.
    summary = summarizer(
        paragraph,
        max_length=50,
        min_length=25,
        do_sample=False,
        truncation=True,
    )[0]["summary_text"]
    summaries.append(summary)

    # Print results
    print(f"Paragraph {idx + 1}:")
    print(f"Sentiment: {sentiment_label} (Score: {sentiment_score:.2f})")
    print(f"Summary: {summary}\n")

#### **Visualize results.**

In [None]:
# Helper that builds sparse tick labels for the bar chart.
def custom_tick_labels(n, step):
    """Return ``n`` tick labels where only every ``step``-th one is filled in.

    Positions 0, step, 2*step, ... receive the label ``"P<position + 1>"``;
    every other position is an empty string.

    Args:
        n: Total number of labels to produce.
        step: Distance between two labelled positions.

    Returns:
        A list of ``n`` strings.
    """
    # Start with every slot blank, then overwrite the slots that get a label.
    tick_text = [""] * n
    for position in range(0, n, step):
        tick_text[position] = f"P{position+1}"
    return tick_text

# Label every (num_paragraphs // 10)-th bar when there are more than ten
# paragraphs; otherwise label every bar.
num_paragraphs = len(sentiment_scores)
if num_paragraphs > 10:
    tick_step = num_paragraphs // 10
else:
    tick_step = 1

# Bar positions are numbered from 1; labels come from the helper above.
bar_positions = range(1, num_paragraphs + 1)
bar_labels = custom_tick_labels(num_paragraphs, tick_step)

# Draw the bar chart on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(bar_positions, sentiment_scores, tick_label=bar_labels)
ax.set_xlabel("Paragraph")
ax.set_ylabel("Adjusted Sentiment Score")
ax.set_title("Adjusted Sentiment Scores for Each Paragraph")
# Red baseline at zero separates positive from negative scores.
ax.axhline(y=0, color='r', linestyle='-')
ax.tick_params(axis="x", labelrotation=45)
plt.show()

In [None]:
# Join all paragraph summaries into one string and build the word cloud from it.
summary_text = " ".join(summaries)
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color="white",
    colormap="viridis",
)
wordcloud.generate(summary_text)

# Show the word cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(16, 8))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()