# BERT

## Deep learning project

## Marie Philippe & Claire Serraz - M2 D3S 

# 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import torch 
from transformers import BertTokenizer
from transformers import BertModel
from tqdm import tqdm 

# 2. Import data

In [2]:
# Choose the path

# Marie
os.chdir("/Users/marie/Desktop/Cours/S1/DL/Project/Data")

In [3]:
data=pd.read_csv("data.csv")

In [4]:
data.head()

Unnamed: 0,title,text,subject,date,class
0,donald trump sends embarrassing new year eve m...,donald trump wish american happy new year leav...,News,"December 31, 2017",fake
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,News,"December 31, 2017",fake
2,sheriff david clarke becomes internet joke thr...,friday revealed former milwaukee sheriff david...,News,"December 30, 2017",fake
3,trump obsessed even obama name coded website i...,christmas day donald trump announced would bac...,News,"December 29, 2017",fake
4,pope francis called donald trump christmas speech,pope francis used annual christmas day message...,News,"December 25, 2017",fake


In [5]:
data.shape

(44898, 5)

# 3. Prepare dataset for BERT

In [6]:
data = data.sample(5000,random_state=1)

In [7]:
data

Unnamed: 0,title,text,subject,date,class
4528,trump call racist policy forced every state,donald trump calling one controversial raciall...,News,"September 21, 2016",fake
31727,republican ex-defense secretary cohen back hil...,washington reuters former republican u defense...,politicsNews,"September 7, 2016",true
10937,teacher quits job 5th 6th grade muslim student...,never young commit jihad teacher primary schoo...,politics,"May 9, 2017",fake
13470,laura ingraham rip press crowd go wild job video,laura ingraham reminds never trump people bett...,politics,"Jul 21, 2016",fake
40814,germany merkel suffers state vote setback coal...,berlin/hanover reuters germany social democrat...,worldnews,"October 14, 2017",true
...,...,...,...,...,...
30035,trump leaf open possible taiwan meet question ...,palm beach fla reuters u president-elect donal...,politicsNews,"January 1, 2017",true
20551,leading n carolina newspaper girl need attempt...,leftist agenda action blurring line sexuality ...,left-news,"May 19, 2016",fake
20602,employee say facebook suppressing conservative...,know firsthand feel like victim facebook four ...,left-news,"May 9, 2016",fake
31301,u policy change cuba tough undo official,washington reuters difficult future u administ...,politicsNews,"October 14, 2016",true


In [8]:
df = data.reset_index(drop=True)
df

Unnamed: 0,title,text,subject,date,class
0,trump call racist policy forced every state,donald trump calling one controversial raciall...,News,"September 21, 2016",fake
1,republican ex-defense secretary cohen back hil...,washington reuters former republican u defense...,politicsNews,"September 7, 2016",true
2,teacher quits job 5th 6th grade muslim student...,never young commit jihad teacher primary schoo...,politics,"May 9, 2017",fake
3,laura ingraham rip press crowd go wild job video,laura ingraham reminds never trump people bett...,politics,"Jul 21, 2016",fake
4,germany merkel suffers state vote setback coal...,berlin/hanover reuters germany social democrat...,worldnews,"October 14, 2017",true
...,...,...,...,...,...
4995,trump leaf open possible taiwan meet question ...,palm beach fla reuters u president-elect donal...,politicsNews,"January 1, 2017",true
4996,leading n carolina newspaper girl need attempt...,leftist agenda action blurring line sexuality ...,left-news,"May 19, 2016",fake
4997,employee say facebook suppressing conservative...,know firsthand feel like victim facebook four ...,left-news,"May 9, 2016",fake
4998,u policy change cuba tough undo official,washington reuters difficult future u administ...,politicsNews,"October 14, 2016",true


In [9]:
df = df.astype({"text": str})

In [10]:
df.groupby(["class"]).size()

class
fake    2547
true    2453
dtype: int64

In [11]:
df = df[['text','class']]
df.head()

Unnamed: 0,text,class
0,donald trump calling one controversial raciall...,fake
1,washington reuters former republican u defense...,true
2,never young commit jihad teacher primary schoo...,fake
3,laura ingraham reminds never trump people bett...,fake
4,berlin/hanover reuters germany social democrat...,true


In [12]:
# Define the train, validation and test data
df_train, df_valid, df_test = np.split(df.sample(frac=1,random_state=10),[int(.8*len(df)), int(.9*len(df))])

In [13]:
# Get number of rows of each dataset
len(df_train)

4000

In [14]:
df_train.groupby(["class"]).size()

class
fake    2043
true    1957
dtype: int64

In [15]:
len(df_valid)

500

In [16]:
df_valid.groupby(["class"]).size()

class
fake    258
true    242
dtype: int64

In [17]:
len(df_test)

500

In [18]:
df_test.groupby(["class"]).size()

class
fake    246
true    254
dtype: int64

# 4. Custom dataset

In [19]:
# Define tokenizer and length
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
max_len   = tokenizer.max_model_input_sizes['bert-base-uncased']

In [20]:
# Dictionnary of the labels
labels = {
    'fake':0,
    'true':1,
}

In [21]:
# Custom dataset

class Dataset(torch.utils.data.Dataset): 
    
    def __init__(self,df): 
        '''
        Get labels and tokenization of the text
        '''
        self.labels = [labels[label] for label in df["class"]] 
        self.texts = [tokenizer(text, padding='max_length', max_length=max_len, 
                                truncation=True,return_tensors="pt") for text in df["text"]] 
    
    def classes(self):
        return self.labels
    
    def __len__(self): 
        return len(self.labels)
    
    def get_batch_labels(self,indx): 
        '''
        Batch of labels
        '''
        return np.array(self.labels[indx])

    def get_batch_texts(self,indx): 
        '''
        Batch of texts
        '''
        return self.texts[indx]

    def __getitem__(self,indx): 
        '''
        Item with the labels and texts
        '''
        batch_y = self.get_batch_labels(indx)
        batch_texts = self.get_batch_texts(indx)
        
        return batch_texts, batch_y

In [22]:
# Build the model

class BertClassifier(torch.nn.Module): 
    
    def __init__(self): 
        super(BertClassifier,self).__init__()
        
        self.bert=BertModel.from_pretrained("bert-base-cased")
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768,6) 
        
    def forward(self,input_id,mask): 
        
        _,pooler_output = self.bert(input_ids= input_id,attention_mask = mask,return_dict = False)
        dropout_output = self.dropout(pooler_output)
        linear_output  = self.linear(dropout_output)
        
        return linear_output

In [23]:
# Train the model

def train(model, train_data, valid_data, learning_rate, epochs=1):
    
    # Create custom data
    train, valid = Dataset(train_data), Dataset(valid_data)
    
    # Create dataloaders
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=1, shuffle=True)
    valid_dataloader = torch.utils.data.DataLoader(valid, batch_size=1)
    
    # Processor 
    device = torch.device("cpu")
    
    # Loss
    criterion = torch.nn.CrossEntropyLoss()
    
    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
   
    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask) # Prediction
                
                # Get loss
                batch_loss = criterion(output, train_label) 
                total_loss_train += batch_loss.item()
                
                # Get accuracry
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc
                
                # Update the model
                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
                
            # Same procedure on the validation data
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in valid_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(f'Epochs: {epoch_num + 1} \n\
Train loss: {total_loss_train / len(train_data):6f} \n\
Train accuracy: {total_acc_train / len(train_data):6f} \n\
Validation loss: {total_loss_val / len(valid_data):6f} \n\
Validation accuracy: {total_acc_val / len(valid_data):6f}')

In [24]:
train(model = BertClassifier(), train_data = df_train, valid_data = df_valid, learning_rate = 1e-6, epochs = 1)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 4000/4000 [26:36:00<00:00, 23.94s/it]        


Epochs: 1 
Train loss: 0.288873 
Train accuracy: 0.925750 
Validation loss: 0.029467 
Validation accuracy: 0.996000


In [25]:
# Evaluate model

def evaluate(model, test_data):
    
    # Create custom data
    test = Dataset(test_data)
    
    # Create dataloaders
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=1)

    # Processor 
    device = torch.device("cpu")

    total_acc_test = 0
    
    with torch.no_grad():
        
        # Prediction and accurary computation
        for test_input, test_label in tqdm(test_dataloader):

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data):6f}')

In [26]:
evaluate(model = BertClassifier(), test_data = df_test)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 500/500 [13:18<00:00,  1.60s/it]

Test Accuracy: 0.486000



