In [1]:

import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')


In [5]:
# optimizer
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in t5_model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
    {
        "params": [p for n, p in t5_model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-8)



In [6]:
import pandas as pd
df = pd.read_csv("table_pharma_40Topics_sentenceLevel_topic_analysis_user_defined.csv")

top_sentence_list_for_topics = []
for i in range(len(df.columns[45:])):
    top_sentence_list_for_topics.append(df[df[df.columns[45+i]] >0.8]["Sentences"])
    
import numpy as np
def calculate_deciles(list):
    #create data
    data = np.array(list)

    #calculate deciles of data
    deciles = np.percentile(data, np.arange(0, 100, 10))
    return deciles

topics_top_sentence_list = []
for i in range(len(df.columns[5:45])):
    column_name = df.columns[5+i]
    column_deciles = calculate_deciles(df[column_name].tolist())
    topics_top_sentence_list.append(df[df[df.columns[5+i]].between(column_deciles[-2],column_deciles[-1])]["Sentences"])
    
# env_data = []
# for i in top_sentence_list_for_topics[0]:
#     env_data.append((i,'environment'))
# env_data

In [7]:
train_data= []
for i in range(len(top_sentence_list_for_topics)):
    count = -1
    for j in top_sentence_list_for_topics[i]:
        if count == 3:
            break
        else:
            train_data.append((j,df.columns[45+i]))
            count = count+1
train_data

[('talENtED PEOPlE We depend on the skills and creativity of our employees to discover, develop and produce new medicines, and deliver them to patients.',
  'Environment'),
 ('Our environment We live in an era of amazing medical innovation, driven by better understanding of the genetic and biological roots of disease, and surging use of data analytics and digital technology in science and healthcare.',
  'Environment'),
 ('We aspire to be a leader on environmental, social and governance topics and to build trust with society.',
  'Environment'),
 ('Thank you for taking the time to learn more about how Novartis continues to make meaningful progress on environ- mental, social and governance (ESG) topics.',
  'Environment'),
 ('At the same time, we are taking steps to minimize our negative environmental impact, as measured by the carbon, other air emissions, water and waste impacts of our own opera- tions and supply chain, which were val- ued at USD 5.3 billion.',
  'GHG Emissions'),
 ('W

In [8]:
sentence_short_phrases = train_data

In [9]:
t5_model.train()

epochs = 10

for epoch in range(epochs):
  print ("epoch ",epoch)
  for input,output in sentence_short_phrases:
    input_sent = "shorten: "+input+ " </s>"
    ouput_sent = output+" </s>"

    tokenized_inp = tokenizer.encode_plus(input_sent,  max_length=96, pad_to_max_length=True,return_tensors="pt",truncation=True)
    tokenized_output = tokenizer.encode_plus(ouput_sent, max_length=96, pad_to_max_length=True,return_tensors="pt",truncation=True)


    input_ids  = tokenized_inp["input_ids"]
    attention_mask = tokenized_inp["attention_mask"]

    lm_labels= tokenized_output["input_ids"]
    decoder_attention_mask=  tokenized_output["attention_mask"]


    # the forward function automatically creates the correct decoder_input_ids
    output = t5_model(input_ids=input_ids, labels=lm_labels,decoder_attention_mask=decoder_attention_mask,attention_mask=attention_mask)
    loss = output[0]

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

epoch  0




epoch  1
epoch  2
epoch  3
epoch  4
epoch  5
epoch  6
epoch  7
epoch  8
epoch  9


In [10]:
topics_top_sentence_list

[8        We also aim to reward those who invest their m...
 19        tEChNOlOGY We use artificial intelligence, ge...
 29       We develop and produce innovative medicines to...
 42       In our pursuit of trans- formative treatments,...
 43       We are rethinking how we work, embracing agile...
                                ...                        
 12773     corroborate the qualitative information (acti...
 12780     for the key performance indicators and other ...
 12804     20-F: Item 4, B.6.1., Marketing and distribut...
 12806     20-F: Item 4, B.6.1., Marketing and distribut...
 12812     20-F: Item 4, B.1., Strategy 20-F: Item 4, B....
 Name: Sentences, Length: 1282, dtype: object,
 8        We also aim to reward those who invest their m...
 9        Novartis Pharmaceuticals Novartis Pharmaceutic...
 27       Our products Our products address most major d...
 35       The rise in the average yearly number of new d...
 37       The projected number of people in the world

In [11]:
test_data= []
for i in range(len(topics_top_sentence_list)):
    count = -1
    for j in topics_top_sentence_list[i]:
        if count == 3:
            break
        else:
            test_data.append(j)
            count = count+1
test_data

['We also aim to reward those who invest their money, time and ideas in our company.',
 ' tEChNOlOGY We use artificial intelligence, gene editing and other cutting-edge tech- nologies to spur innovation and increase efficiency.',
 'We develop and produce innovative medicines to address patient needs in disease areas where our experience and knowledge have the potential to produce transformative treatments.',
 'In our pursuit of trans- formative treatments, we challenge medical paradigms and explore possibilities to cure disease, intervene earlier in chronic illnesses, and find ways to dramatically improve quality of life.',
 'We also aim to reward those who invest their money, time and ideas in our company.',
 'Novartis Pharmaceuticals Novartis Pharmaceuticals focuses on patented treat - ments in multiple disease areas to enhance health outcomes for patients and offer solutions to healthcare providers.',
 'Our products Our products address most major disease areas and are sold in appro

In [12]:
with open('sentence_to_topic.txt', 'w', encoding="utf-8") as file: 
    for i in test_data:
        print(i)
        file.write("Sentence : "+i+"\n")
        file.write("Topic : \n")
        test_sent = 'shorten: '+i
        test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")

        test_input_ids  = test_tokenized["input_ids"]
        test_attention_mask = test_tokenized["attention_mask"]

        t5_model.eval()
        beam_outputs = t5_model.generate(
            input_ids=test_input_ids,attention_mask=test_attention_mask,
            max_length=64,
            early_stopping=True,
            num_beams=10,
            num_return_sequences=5,
            no_repeat_ngram_size=2
        )

        for beam_output in beam_outputs:
            sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
            file.write(sent+"\n")
            print (sent)
        file.write("\n")
        file.write("---------------------------------------------------------------------------------------------\n\n")
        print("_________________")
    file.close()

We also aim to reward those who invest their money, time and ideas in our company.
Customer Welfare
Customer Welfare and Community Relations
Customer Welfare and Welfare
Customer Welfare and Employee Welfare
Customer Welfare and Diversity
_________________
 tEChNOlOGY We use artificial intelligence, gene editing and other cutting-edge tech- nologies to spur innovation and increase efficiency.
Business Model and Innovation
Artificial Intelligence
Business Model Resilience
Environment
Systemic Risk Management
_________________
We develop and produce innovative medicines to address patient needs in disease areas where our experience and knowledge have the potential to produce transformative treatments.
Business Model and Innovation
Access and Affordability
Product Design and Lifecycle Management
Customer Welfare
Business Model Resilience
_________________
In our pursuit of trans- formative treatments, we challenge medical paradigms and explore possibilities to cure disease, intervene earl

Business Model and Innovation
Business Model Resilience
Business Model & Innovation
Business Model and Technology
Business Model and Product Design
_________________
We also aim to reward those who invest their money, time and ideas in our company.
Customer Welfare
Customer Welfare and Community Relations
Customer Welfare and Welfare
Customer Welfare and Employee Welfare
Customer Welfare and Diversity
_________________
CORPORATE FUNCTIONS support the enterprise in specific areas of expertise, including finance, human resources, legal and commu- nications.
Human Capital
Human Capital and Business Ethics
Leadership and Governance
HR & Human Capital
Human Capital and Community Relations
_________________
Photo A worker at Ziplines distribution center in Omenako, Ghana, prepares a medical order for drone delivery.
Business Model and Innovation
Business Model Resilience
Business Ethics
Customer Welfare
Business Modeling and Innovation
_________________
Novartis has partnered with Zipline, a

Selling Practices and Product Labeling
Product Design and Lifecycle Management
Management of the Legal Regulatory Environment
Business Model Resilience
Business Model and Innovation
_________________
The projected value of the global digital health market by 2023 (USD), a 60% increase from 2019, according to the Frost & Sullivan Global Digital Health Outlook 2020
Business Model and Innovation
Digital Health and Safety
Business Model Resilience
Digital Health and Human Capital
Digital Health and Wellness
_________________
We are rethinking how we work, embracing agile teams and building better productivity into our company to free resources that we can invest in innovation and help boost returns.
Business Model and Innovation
Business Model Resilience
Business Model, Innovation and Innovation
Business Model for Innovation and Innovation
Business Model of Innovation and Innovation
_________________
Our ESG-related commitments are not add-ons to our business they perme- ate Novartis to th

Access and Affordability
Access, and Affordability
Access and Affordability Management
Access & Affordability
Business Model and Innovation
_________________
Novartis Pharmaceuticals Novartis Pharmaceuticals focuses on patented treat - ments in multiple disease areas to enhance health outcomes for patients and offer solutions to healthcare providers.
Business Model and Innovation
Business Model Resilience
Management of the Legal Regulatory Environment
Selling Practices and Product Labeling
Access and Affordability
_________________
Photo A worker at Ziplines distribution center in Omenako, Ghana, prepares a medical order for drone delivery.
Business Model and Innovation
Business Model Resilience
Business Ethics
Customer Welfare
Business Modeling and Innovation
_________________
Novartis has partnered with Zipline, a US-based automated logistics company, to help deliver vital medicines to remote areas.
Business Model and Innovation
Business Model Resilience
Business Model & Innovation
B

Business Model and Innovation
Access and Affordability
Business Model and Innovation in Medicine
Product Design and Lifecycle Management
Systemic Risk Management
_________________
We also announced plans to shift from maximiz- ing profit to maximizing access in sub- Saharan Africa, which has the worlds most underserved patient population.
Business Model Resilience
Business Model and Innovation
Systemic Risk Management
Access and Accessibility
Business Model and Governance
_________________
We made significant progress in global health, including on our efforts to elim- inate leprosy and malaria.
Global Health and Safety
Leadership and Governance
Human Health and Safety
Business Ethics
Health and Safety
_________________
Our teams also took important steps to help ensure we meet our com- mitment to become carbon neutral in our own operations by 2025.
GHG Emissions
Physical Impacts of Climate Change
Air Quality
Business Ethics
Energy Management
_________________
We use innovative science

Business Model and Innovation
Business Model Resilience
Business Model & Innovation
Business Model and Technology
Business Model and Product Design
_________________
We use innovative science and technology to address some of societys most challenging healthcare issues.
Access and Affordability
Access, and Affordability
Access and Affordability Management
Access & Affordability
Business Model and Innovation
_________________
We also aim to reward those who invest their money, time and ideas in our company.
Customer Welfare
Customer Welfare and Community Relations
Customer Welfare and Welfare
Customer Welfare and Employee Welfare
Customer Welfare and Diversity
_________________
NOVARTIS TECHNICAL OPERATIONS (NTO) is responsible for making our innovative medicines, devices and Sandoz products and delivering them to our customers across the world.
Business Model and Innovation
Product Design and Lifecycle Management
Selling Practices and Product Labeling
Management of the Legal Regulatory