# Assignment Techdome, Indore

### Install libraries
Run below cell to install required libraries

In [None]:
!pip install --upgrade pandas numpy openpyxl nltk bert-extractive-summarizer transformers langchain==0.1.4 openai==1.10.0 tiktoken python-dotenv scikit-learn langchain-openai
!python -m spacy download en_core_web_sm

### Import libraries
Important libraries overview
* `re`: modifying strings with regular expression 
* `nltk, spacy`: for text prerocessing and sentiment analysis
* `pandas, numpy`: efficient data manipulation
* `summarizer`: for extractive summarization
* `dotenv`: load environment variables from .env file. Import openai api key OPENAI_API_KEY to environment variable
* `langchain`: for prompt creation and interacting with openai api
* `langchain.embeddings`: Create document embedding using openai embeddings model
* `sklearn.cluster`: KMeans for clustering similar documents using document embeddings

In [196]:
import re
import nltk
import pandas as pd
import numpy as np
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from summarizer import TransformerSummarizer
import textwrap
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
#from langchain import LLMChain
from langchain import OpenAI
from langchain.prompts import PromptTemplate
from langchain.prompts import FewShotPromptTemplate
from sklearn.metrics.pairwise import cosine_similarity
from langchain.embeddings import OpenAIEmbeddings
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.llm import LLMChain
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from sklearn.cluster import KMeans
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
load_dotenv(".env")

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/patelankit706/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/patelankit706/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/patelankit706/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Preprocess data
Preprocess data using spacy, nltk. Removing stopwords, lowercasing, lemmatization, punctuation removal

In [262]:
## Load data
df = pd.read_excel("Assignment.xlsx")
articles = df['Article'].to_list()

In [252]:
## punctuation and custom stop words
stop_words = {'all', 'who', 'so', 'some', 'whom', 'have', 'any', 'did', 'be', 'me',
              'mine', 'a', 'this', 'i', 'at', 'an', 'between', 'below', 'was', 'why',
              'it', 'is', 'he', 'above', 'that', 'itself', 'or', 'does', 'on', 'here',
              'as', 'the', 'has', 'down', 'for', 'until', 'of', 'own', 'other', 'do', 'both',
              'same', 'if', 'while', 'she', 'to', 'were'}

punctuation = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~\n“”'

In [46]:
def decontracted(phrase):
    phrase = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', phrase)
    phrase = re.sub(r"\'t|’t", "not", phrase)
    phrase = re.sub(r"\'re|’re", " are", phrase)
    phrase = re.sub(r"\'s|’s|—", " ", phrase)
    phrase = re.sub(r"\'d|’d", " would", phrase)
    phrase = re.sub(r"\'ll|’ll", " will", phrase)
    phrase = re.sub(r"\'ve|’ve", " have", phrase)
    phrase = re.sub(r"\'m|’m", " am", phrase)
    return phrase

def preprocess_nltk(text, punctuation="", stopwords=set(), decontracted=None, lemmatizer=False, lowercase=False):
    if decontracted:
        text = decontracted(text)
    tokens = word_tokenize(text)    
    
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word not in punctuation]
    if lemmatizer:
        lemmatizer = WordNetLemmatizer()
        filtered_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    #sentence = re.sub(r" \.",".",' '.join(lemmatized_tokens))
    sentence = ' '.join(filtered_tokens)
    if lowercase: sentence = sentence.lower()
    return sentence
    

In [28]:
def preprocess_spacy(text, stop_words=set()):
    nlp = spacy.load("en_core_web_sm")
    # Process whole documents
    doc = nlp(text)
    
    # Tokenization, Lemmatization, Punctuation and Stop words removal
    tokens = [token.lemma_ for token in doc if not token.is_punct and token.text not in stop_words]
    
    sentence= re.sub(r"\n+",""," ".join(tokens))
    return sentence


In [32]:
#preprocessed_articles_nltk_stopwords = [preprocess(i, punctuation, STOP_WORDS, None, True, True) for i in articles]
preprocessed_articles_spacys = [preprocess_spacy(i, STOP_WORDS) for i in articles]

In [236]:
preprocessed_articles_spacys[2]

'Sept 14 Reuters Bristol Myers Squibb BMY.N say Thursday plan double number treatment test clinical trial focus cell therapy 18 month contend increase generic competition sell drug  the drugmaker currently candidate trial advance research pipeline include cell therapy target immune system disorder different type cancer  the New York base company pressure decline demand drug blood cancer treatment Revlimid blood thin Eliquis face generic competition  Bristol partner Pfizer pfe.n blood thin Eliquis list 10 drug subject price negotiation U.S. Medicare health program  the company recently receive regulatory approval new cell therapy manufacturing facility Devens Massachusetts Bristol say continue expand manufacturing capacity  Bristol approve cell therapy U.S. Breyanzi Abecma target different blood cancer indication say plan continue development treatment disease lupus erythematosus multiple sclerosis  the drugmaker host R&D day Thursday executive expect provide detail company research str

### VADER sentiment analysis

Valence aware dictionary for sentiment reasoning (VADER) is popular rule-based sentiment analyzer. 
<br>
It uses a list of lexical features (e.g. word) which are labeled as positive or negative according to their semantic orientation to calculate the text sentiment.   

Vader sentiment returns the probability of a given input sentence to be 
Positive, negative, and neutral. 

For our task Vader will not provide good accuracy as its not trained on data specific to our domain requirement. For initial results lets go with this model only. For getting better result we can fine tune already available models.

In [39]:
def getSentiment(score):
    if score>0.05:
        return "Positive"
    elif score>-0.05:
        return "Neutral"
    else:
        return "Negative"

In [42]:
analyzer = SentimentIntensityAnalyzer()
sentiment_vader = [getSentiment(analyzer.polarity_scores(i.lower())['compound']) for i in preprocessed_articles_spacys]

In [44]:
np.array(sentiment_vader)

array(['Positive', 'Negative', 'Negative', 'Negative', 'Negative',
       'Negative', 'Positive', 'Negative', 'Positive', 'Positive',
       'Negative', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Positive', 'Positive', 'Positive',
       'Positive', 'Negative', 'Positive', 'Positive', 'Positive'],
      dtype='<U8')

In [57]:
# Save sentiment in dataframe
df["vader_sentiment"]=sentiment_vader

### Summarization
Lets get the summaries of our articles.<br>
There can be following 3 approaches to get the summary:
1. Extractive
2. Abstractive
3. Hybrid
<br>
We will go with Abstractive approach. An example of extractive summarization is given below

#### Extractive summarization
Extractive summarization selects important sentences from the provided text in the summary without modification.<br>
For this task we gre using gpt-2 summarizer

In [47]:
#custom stop words
preprocessed_articles = [preprocess_nltk(i, punctuation, stop_words, decontracted) for i in articles]

In [48]:
GPT2_model = TransformerSummarizer(transformer_type="GPT2",transformer_model_key="gpt2-medium")



In [49]:
''.join(GPT2_model(preprocessed_articles[8], min_length=60))

'BRUKINSA first and only BTK inhibitor approved follicular lymphoma in European Union Approval based results from ROSEWOOD trial in which BRUKINSA plus anti-CD20 monoclonal antibody obinutuzumab achieved higher overall response rate compared obinutuzumab alone BeiGene Ltd. BGNE HKEX 06160 SSE 688235 global biotechnology company today announced European Commission EC granted marketing authorization BRUKINSA® zanubrutinib in combination with obinutuzumab treatment adult patients with relapsed refractory R/R follicular lymphoma FL received least two prior lines systemic therapy . `` With approval we are excited announce BRUKINSA will become available treatment option patients with follicular lymphoma in European Union . `` results from ROSEWOOD trial demonstrated significant clinical benefit BRUKINSA plus obinutuzumab patients with relapsed refractory follicular lymphoma . global BRUKINSA development program includes more than 5,000 subjects enrolled date in 29 countries and regions . FL 

#### Abstractive summarization
Abstractive summarization rephrases sentences for summary.
<br>
We will be using openai gpt-3 model api for summarization of the articles. Since some articles are of greater size than the context window of the llm. We can split documents into small chunks and find the summary for each chunk. We can then get final summary using the previous chunks combined summary. This method is called map reduce.

In [50]:
articles_llm_text = [re.sub(r"[\n]+", " ", i) for i in articles]

In [51]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=50, separators=[" ", ",", "\n"]
)

In [53]:
texts = text_splitter.split_text(articles_llm_text[0])

In [237]:
texts[1]

'to study this,” Michael Johnson, the chief executive of the nutritional supplement maker Herbalife, told investors. “And when we see an opportunity to capitalize on it, we will.” In theory, that opportunity — both for making profits and for losing fortunes — could be vast not only for the companies behind these drugs but also for some in completely different industries. Known as GLP-1 drugs, the medications are already driving big profits. Novo Nordisk makes both Ozempic, which has been approved only for Type 2 diabetes, and its close relative Wegovy, which has been approved for weight loss. They mimic a glucagon-like peptide that regulates appetite in the brain, leaving people feeling sated for hours. Together, they helped send Novo’s earnings rocketing up 32 percent in the first half of this year, and Novo’s market value is now larger than the entire Danish economy. Eli Lilly’s sales surged 28 percent in the second quarter, thanks to another diabetes drug, Mounjaro, which the Food'

In [54]:
docs = [Document(page_content=t) for t in texts]

In [55]:
llm = OpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0)

  warn_deprecated(


In [70]:
chain = load_summarize_chain(llm,chain_type="map_reduce")
output_summary = chain.run(docs)
wrapped_text = textwrap.fill(output_summary, width=100)
print(wrapped_text)

 The rise of diabetes and weight loss drugs like Ozempic has had an impact on retailers and food
manufacturers. Companies like Herbalife and Novo Nordisk have seen significant increases in earnings
and market value due to the success of these drugs. The FDA may approve a new weight loss drug,
Mounjaro, this year, leading to increased sales for retailers. However, these drugs come with side
effects and there is still room for other approaches to fighting obesity. The departure of Emily
Weiss as CEO of Glossier has sparked discussion about the "girlboss" archetype in media and the
challenges faced by female-led companies. Glossier has undergone changes and entered the retail
market in preparation for a potential exit.


In [61]:
def summarize_all_articles_docs(articles, chain, text_splitter):
    docs_list = [[Document(page_content=t) for t in text_splitter.split_text(article)] for article in articles_llm_text]
    summaries = [chain.run(i) for i in docs_list]
    return docs_list, summaries
    

In [64]:
docs_list, summaries = summarize_all_articles_docs(articles_llm_text, chain, text_splitter)

In [71]:
summaries[-2]

' "Get Smart About News is a free newsletter and e-learning platform that helps educators teach students about identifying credible information and understanding the First Amendment. A recent investigation found that registered dietitians on social media were not transparent about their partnerships with food and beverage companies. China has been using disinformation campaigns to spread false information about the Hawaii wildfires, highlighting the importance of recognizing and addressing false claims. News literacy skills, like reverse image searches, can prevent people from falling for conspiracy theories. The claim about airlines using odd seating is false and an example of slippery slope thinking used to spread misinformation."'

In [72]:
# Save abstractive summaries in dataframe
df["abstractive_summary"]=summaries

### Sentiment of the articles using Few Shot Learning
We will now use llm for sentiment analysis of the articles.<br>
In this case we are using few shot learning for providing context to llm model with some examples.

We are using the abstractive summaries(previously generated), for getting sentiments

In [74]:
# create our examples
examples = [
    {
        "query": "Scientists develop a new solar panel that's more efficient than ever before. This could help us fight climate change!",
        "answer": "Positive"
    }, {
        "query": "The economy is struggling, and many people are losing their jobs. This is a worrying trend.",
        "answer": "Negative"
    }, {
        "query": "A new report details the latest scientific findings on climate change. The report is informative and well-written.",
        "answer": "Neutral"
    }
]

# create an example template
example_template = """
User: {query}
AI: {answer}
"""

# create a prompt example from above template
example_prompt = PromptTemplate(
    input_variables=["query", "answer"],
    template=example_template
)

# now break our previous prompt into a prefix and suffix
# the prefix is our instructions
prefix = """The following are excerpts from conversations with an AI
assistant. The assistant is known for providing sentiment whether the user's input paragraph is Neutral, Positive or Negative. Here are some
examples:
"""
# and the suffix our user input and output indicator
suffix = """
User: {query}
AI: """

# now create the few-shot prompt template
few_shot_prompt_template = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["query"],
    example_separator="\n\n"
)

In [75]:
chain = LLMChain(llm=llm, prompt=few_shot_prompt_template)
chain.run(summaries[0])

'Neutral'

In [76]:
sentiments = [chain.run(i) for i in summaries]

In [77]:
print(sentiments)

['Neutral', 'Neutral', 'Positive', 'Positive', 'Positive', 'Negative', 'Positive', 'Neutral', 'Positive', 'Neutral', 'Neutral', 'Positive', 'Positive', 'Neutral', 'Neutral', 'Positive', 'Neutral', 'Positive', 'Positive', 'Neutral', 'Positive', 'Neutral', 'Positive', 'Neutral', 'Neutral']


In [78]:
# save the sentiment output of llm in dataframe
df["sentiment_gpt"] = sentiments

### Common theme between articles
There are various methods through with we can find common topics or themes between texts like LDA for unsupervised topic decomposition.

Here we are using embeddings, custering and LLM together to get the most representative common theme of the different clusters of article.

Methodology:
1. We will divide each documents or article into small chunk. Also keep track of the article to which particular chunk belongs.
2. Find the embeddings of all the chunks of all articles
3. Utilize kmeans clustering for grouping of chunks in clusters.
4. Find the articles index present in each cluster utilizing the article index tracker we created in step 1.
5. For each cluster utilize llm map reduce to find the common theme in each cluster of articles.

#### 1. We will divide each documents or article into small chunk. Also keep track of the article to which particular chunk belongs.

In [84]:
#divide each documents or article into small chunk
text_splitter_2 = RecursiveCharacterTextSplitter( chunk_size=500, chunk_overlap=50, separators=[" ", ",", "\n"])

In [85]:
#keep track of the article to which particular chunk belongs
index_text = []
splitted_texts = []
for i, article in enumerate(preprocessed_articles_spacys):
    for t in text_splitter_2.split_text(article):
        splitted_texts.append(t)
        index_text.append(i)

#### 2. Find the embeddings of all the chunks of all articles

We are using Open ai embedding

In [86]:
# Initialize the OpenAIEmbeddings instance
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

In [87]:
chunks_embedding = embeddings.embed_documents(splitted_texts)

#### 3. Utilize kmeans clustering for grouping of chunks in clusters.

In [159]:
kmeans_chunks = KMeans(n_clusters=6, random_state=45, n_init="auto").fit(chunks_embedding)

In [160]:
kmeans_chunks.labels_

array([2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2,
       2, 5, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 4, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

#### 4. Find the articles index present in each cluster utilizing the article index tracker we created in step 1 "index_text"
<br> One Article can be present in multiple clusters

In [161]:
cluster_to_article = {}
cluster_to_chunks = {}
for index, cluster in enumerate(kmeans_chunks.labels_):
    if cluster in cluster_to_article:
        cluster_to_article[cluster] = cluster_to_article[cluster] | {index_text[index],}
        cluster_to_chunks[cluster] = cluster_to_chunks[cluster]+[index]
    else:
        cluster_to_article[cluster] = {index_text[index],}
        cluster_to_chunks[cluster] = [index]

In [162]:
#Mapping of each cluster to the the respective article index
cluster_to_article

{2: {0, 1, 2, 7, 8, 10, 20, 21, 22, 23},
 4: {0, 15, 20},
 1: {2, 3, 4, 5, 6, 8},
 5: {9, 10, 11, 12, 13, 14, 15, 16, 17},
 3: {18, 19, 20},
 0: {24}}

In [167]:
one_hot_clusters = np.zeros((6,len(articles)))

In [168]:
for cluster_no, articles_indexes in cluster_to_article.items():
    for i in articles_indexes:
        one_hot_clusters[cluster_no, i] = 1

In [169]:
one_hot_clusters

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 1., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 1., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 0., 0., 0., 0., 0., 0., 0.]])

In [177]:
# Save in dataframe whether a particular article is present in a cluster or not
for i in range(6):
    df[f"cluster_{i}"] = one_hot_clusters[i]

#### 5. For each cluster utilize llm map reduce to find the common theme in each cluster of articles.

In [229]:
llm = ChatOpenAI(temperature=0)

# Map
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify the main themes 
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

In [230]:
# Reduce
reduce_template = """The following is set of summaries:
{docs}
Take these and find the common theme across these summaries. Do not provide description. There can be multiple common themes among those.
Focus on the main theme for example Advancements in AI, Digital transformation in healthcare, etc. Don't return description. 
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

In [231]:
# Run chain
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

In [232]:
# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

In [233]:
text_splitter_3 = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=50, separators=[" ", ",", "\n"])

In [234]:
def get_cluster_theme(cluster_index):
    cluster_summaries = [Document(page_content=summaries[i]) for i in cluster_to_article[cluster_index]]
    split_docs = text_splitter_3.split_documents(cluster_summaries)
    print(map_reduce_chain.run(split_docs))  

In [235]:
for i in range(6):
    print(f"Cluster_{i} Theme:\n")
    get_cluster_theme(i)
    print("\n")

Cluster_0 Theme:

Bank promotions and account requirements


Cluster_1 Theme:

Advancements in cancer treatment and therapies.


Cluster_2 Theme:

Healthcare advancements, Pharmaceutical industry developments, Legal issues in advertising, Brand management and marketing, Disinformation and misinformation.


Cluster_3 Theme:

Partnerships and collaborations in the fitness and wellness industry.


Cluster_4 Theme:

Financial performance and strategies of major companies in various industries.


Cluster_5 Theme:

Advancements in technology and innovation in the food industry.




In [238]:
theme_by_cluster_index = ["Bank promotions and account requirements",
                         "Advancements in cancer treatment and therapies",
                         "Healthcare advancements, Pharmaceutical industry developments, Legal issues in advertising, Brand management and marketing, Disinformation and misinformation",
                         "Partnerships and collaborations in the fitness and wellness industry",
                         "Financial performance and strategies of major companies in various industries",
                         "Advancements in technology and innovation in the food industry"
                         ]

Map each article to their common theme. There can be multiple themes for each article

In [253]:
article_themes = [""]*len(articles)
for i in range(len(articles)):
    for cluster, articles_set in cluster_to_article.items():
        if i in articles_set:
            if article_themes[i]=="":
                article_themes[i] = theme_by_cluster_index[cluster]
            else:
                article_themes[i] = article_themes[i]+"|"+theme_by_cluster_index[cluster]
            #article_themes[i] = article_themes[i]+"|"+theme_by_cluster_index[cluster]

In [254]:
article_themes

['Healthcare advancements, Pharmaceutical industry developments, Legal issues in advertising, Brand management and marketing, Disinformation and misinformation|Financial performance and strategies of major companies in various industries',
 'Healthcare advancements, Pharmaceutical industry developments, Legal issues in advertising, Brand management and marketing, Disinformation and misinformation',
 'Healthcare advancements, Pharmaceutical industry developments, Legal issues in advertising, Brand management and marketing, Disinformation and misinformation|Advancements in cancer treatment and therapies',
 'Advancements in cancer treatment and therapies',
 'Advancements in cancer treatment and therapies',
 'Advancements in cancer treatment and therapies',
 'Advancements in cancer treatment and therapies',
 'Healthcare advancements, Pharmaceutical industry developments, Legal issues in advertising, Brand management and marketing, Disinformation and misinformation',
 'Healthcare advancemen

In [264]:
# Save article themes in dataframe
df["common_theme"] = article_themes

### Final dataframe

In [265]:
df.tail()

Unnamed: 0,Article,vader_sentiment,abstractive_summary,sentiment_gpt,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,common_theme
20,Nike (NYSE:NKE) is the leader when it comes to...,Positive,Nike is a leading company in the sportswear i...,Positive,0,0,1,1,1,0,"Healthcare advancements, Pharmaceutical indust..."
21,Oct 23 (Reuters) - Unilever (ULVR.L) and a cha...,Negative,Unilever and a charity supporting teenage can...,Neutral,0,0,1,0,0,0,"Healthcare advancements, Pharmaceutical indust..."
22,"LONDON, Sept 19 (Reuters) - Nestle said on Tue...",Positive,Nestle has selected WPP Openmind as its exclu...,Positive,0,0,1,0,0,0,"Healthcare advancements, Pharmaceutical indust..."
23,"Get Smart About News, modeled on the Sift, is ...",Positive,"""Get Smart About News is a free newsletter an...",Neutral,0,0,1,0,0,0,"Healthcare advancements, Pharmaceutical indust..."
24,Editorial Note: Blueprint may earn a commissio...,Positive,Wells Fargo is currently offering promotions ...,Neutral,1,0,0,0,0,0,Bank promotions and account requirements


In [266]:
#Save data 
df.to_excel("Final.xlsx",index=False)