In [None]:
# !pip install transformers -q
# !pip install torch -q
# !pip install git+https://github.com/gmihaila/ml_things -q

In [None]:
import io
import os
import glob
import re
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import (AutoConfig,
                         AutoModelForSequenceClassification,
                         AutoTokenizer,AdamW,
                         get_linear_schedule_with_warmup,
                         set_seed)
from argparse import Namespace
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix

# Fix the random seed through transformers' `set_seed` so runs are repeatable.
set_seed(7)

# Hyper-parameters gathered on one namespace so later cells read `args.<NAME>`.
args = Namespace(
    MAX_LEN=512,
    MODEL="facebook/bart-large-mnli",
    EPOCHS=10,
    BATCH_SIZE=16,
)

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(DEVICE)

# Stance name -> class id, plus the inverse lookup derived from it.
LABEL_IDS = {'FAVOUR': 0, 'AGAINST': 1, 'NEUTRAL': 2}
ID2LABEL = {class_id: stance_name for stance_name, class_id in LABEL_IDS.items()}
N_LABELS = len(LABEL_IDS)

In [None]:
class loadTweetStanceDataset(Dataset):
    """Map-style dataset that tokenizes every tweet once, at construction time.

    Args:
        df: frame exposing a ``tweet`` column (the texts) and a ``stance``
            column (the labels, which are handed to ``torch.tensor``).
        tokenizer: callable invoked on the full list of tweets with
            ``return_tensors='pt'``; its result must support ``update`` and
            key lookup (a Hugging Face tokenizer output does).
        max_sequence_len: truncation length. When ``None`` the tokenizer's
            own ``model_max_length`` is used.

    Each item is a dict holding, for one example, every tensor returned by
    the tokenizer plus a ``labels`` entry.
    """

    def __init__(self, df, tokenizer, max_sequence_len=None):
        # The fallback previously read `use_tokenizer.max_len`; that name does
        # not exist in this scope, so omitting `max_sequence_len` raised
        # NameError. Use the tokenizer that was actually passed in.
        if max_sequence_len is None:
            max_sequence_len = tokenizer.model_max_length

        tweets = df.tweet.values.tolist()
        labels = df.stance.values.tolist()
        self.n_examples = len(labels)

        print('Using tokenizer on all texts. This can take a while...')
        # padding=True pads to the longest text in this call, so the final
        # width can be smaller than `max_sequence_len`.
        self.inputs = tokenizer(tweets, add_special_tokens=True, truncation=True,
                                padding=True, return_tensors='pt', max_length=max_sequence_len)
        self.sequence_len = self.inputs['input_ids'].shape[-1]
        print('Texts padded or truncated to %d length!' % self.sequence_len)

        # Store the labels next to the encoded inputs so __getitem__ can slice
        # everything with a single comprehension.
        self.inputs.update({'labels': torch.tensor(labels)})
        print('Finished! \n')

    def __len__(self):
        """Number of examples (rows of the source frame)."""
        return self.n_examples

    def __getitem__(self, item):
        """Return the ``item``-th slice of every stored tensor, keyed by name."""
        return {key: self.inputs[key][item] for key in self.inputs.keys()}
    
def train(dataLoader, optimizer_, scheduler_, device_, model):
    """Run one optimisation pass over ``dataLoader``.

    For every batch: forward pass, backward pass, gradient-norm clipping at
    1.0, then one optimizer step and one scheduler step.

    Args:
        dataLoader: sized iterable of dict batches; each batch must contain a
            ``labels`` tensor and is unpacked into ``model`` as keyword args.
        optimizer_: optimizer stepped once per batch.
        scheduler_: learning-rate scheduler stepped once per batch.
        device_: device every batch tensor is moved to.
        model: module whose output yields ``(loss, logits)`` as its first two
            elements.

    Returns:
        ``(true_labels, predicted_labels, avg_epoch_loss)`` where the labels
        are flat Python lists and the loss is averaged over batches.
    """
    model.train()

    gold, guessed = [], []
    running_loss = 0

    for batch in tqdm(dataLoader, total=len(dataLoader)):
        # Collect the reference labels before the batch leaves host memory.
        gold.extend(batch['labels'].numpy().flatten().tolist())

        batch = {name: tensor.type(torch.long).to(device_) for name, tensor in batch.items()}

        # Drop gradients left over from the previous step.
        model.zero_grad()

        outputs = model(**batch)
        loss, logits = outputs[:2]
        running_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer_.step()
        scheduler_.step()

        batch_logits = logits.detach().cpu().numpy()
        guessed.extend(batch_logits.argmax(axis=-1).flatten().tolist())

    return gold, guessed, running_loss / len(dataLoader)

def validation(dataloader, device_, model):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    The model is switched to eval mode and every forward pass runs under
    ``torch.no_grad()``.

    Args:
        dataloader: sized iterable of dict batches; each batch must contain a
            ``labels`` tensor and is unpacked into ``model`` as keyword args.
        device_: device every batch tensor is moved to.
        model: module whose output yields ``(loss, logits)`` as its first two
            elements.

    Returns:
        ``(true_labels, predicted_labels, avg_epoch_loss)`` where the labels
        are flat Python lists and the loss is averaged over batches.
    """
    model.eval()

    gold, guessed = [], []
    loss_sum = 0

    for batch in tqdm(dataloader, total=len(dataloader)):
        # Collect the reference labels before the batch leaves host memory.
        gold.extend(batch['labels'].numpy().flatten().tolist())

        batch = {name: tensor.type(torch.long).to(device_) for name, tensor in batch.items()}

        with torch.no_grad():
            outputs = model(**batch)

        loss, logits = outputs[:2]
        loss_sum += loss.item()

        batch_logits = logits.detach().cpu().numpy()
        guessed.extend(batch_logits.argmax(axis=-1).flatten().tolist())

    return gold, guessed, loss_sum / len(dataloader)

In [None]:
# Configuration of the checkpoint, with the classification head sized to N_LABELS.
print('Loading configuration...')
model_config = AutoConfig.from_pretrained(args.MODEL, num_labels=N_LABELS)

# Tokenizer belonging to the same checkpoint.
print('Loading tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(args.MODEL)

# Sequence-classification model built from the checkpoint and the config above.
print('Loading model...')
model = AutoModelForSequenceClassification.from_pretrained(args.MODEL, config=model_config)

# Place the model on the selected device before any batch is fed to it.
model = model.to(DEVICE)
print('Model loaded to `%s`' % DEVICE)

In [None]:
# Read the labelled tweets and swap each stance name for its integer class id
# (an unknown stance name raises KeyError).
df = pd.read_csv('labelled_stance_data.csv')
df['stance'] = [LABEL_IDS[stance_name] for stance_name in df['stance']]
print(df.shape)

# democrats_df = df[df['name'].isin(['Amy Klobuchar', 'Bernie Sanders', 'Elizabeth Warren', 'Joe Biden', 'Kamala Harris', 'Mike Bloomberg', 
#                                    'Pete Buttigieg', 'Tom Steyer', 'Tulsi Gabbard'])]
# democrats_df['stance'] = democrats_df['stance'].apply(lambda x: LABEL_IDS[x])

# republicans_df = df[df['name'].isin(['Donald J. Trump', 'Gov. Bill Weld', 'Joe Walsh', 'Mike Pence', 'Roque "Rocky" De La Fuente'])]
# republicans_df['stance'] = republicans_df['stance'].apply(lambda x: LABEL_IDS[x])

# print(democrats_df.shape, republicans_df.shape)

In [None]:
# Pin the split to an explicit seed. Without `random_state` the partition
# depends on the global RNG state, so re-running this cell on a live kernel
# produces a different split and can move rows the model was already trained
# on into the validation/test sets.
SPLIT_SEED = 7

# 80% of the rows go to training; the remaining 20% is divided 70/30 into
# test and validation.
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
test_df, val_df = train_test_split(holdout_df, test_size=0.3, random_state=SPLIT_SEED)
print(train_df.shape, test_df.shape, val_df.shape)

In [None]:
# NOTE(review): `dataset` / `dataloader` cover the whole labelled frame and are
# not referenced by any later cell visible in this notebook; they are kept so
# existing references keep working, but tokenizing everything twice is costly.
dataset = loadTweetStanceDataset(df, tokenizer, args.MAX_LEN)
dataloader = DataLoader(dataset, batch_size=args.BATCH_SIZE)

print('Dealing with Train...')
train_dataset = loadTweetStanceDataset(train_df, tokenizer, args.MAX_LEN)
print('Created `train_dataset` with %d examples!'%len(train_dataset))

# Shuffle the training examples every epoch: without it the model sees the
# batches in exactly the same order in each of the epochs.
train_dataloader = DataLoader(train_dataset, batch_size=args.BATCH_SIZE, shuffle=True)
print('Created `train_dataloader` with %d batches!'%len(train_dataloader))

print()
print('Dealing with Validation...')
val_dataset = loadTweetStanceDataset(val_df, tokenizer, args.MAX_LEN)
print('Created `val_dataset` with %d examples!'%len(val_dataset))

# Evaluation loaders keep their original order.
val_dataloader = DataLoader(val_dataset, batch_size=args.BATCH_SIZE)
print('Created `val_dataloader` with %d batches!'%len(val_dataloader))

print()
print('Dealing with Testing...')
test_dataset = loadTweetStanceDataset(test_df, tokenizer, args.MAX_LEN)
print('Created `test_dataset` with %d examples!'%len(test_dataset))

test_dataloader = DataLoader(test_dataset, batch_size=args.BATCH_SIZE)
print('Created `test_dataloader` with %d batches!'%len(test_dataloader))

In [None]:
# AdamW over every model parameter, with a fixed learning rate and epsilon.
OPTIMIZER = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# `train` steps the scheduler once per batch, so the linear schedule has to
# span (batches per epoch) x (epochs) steps; no warm-up phase is used.
total_steps = args.EPOCHS * len(train_dataloader)
SCHEDULER = get_linear_schedule_with_warmup(
    OPTIMIZER,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Per-epoch history, plotted once training has finished.
all_loss = dict(train_loss=[], val_loss=[])
all_acc = dict(train_acc=[], val_acc=[])

print('Epoch')
for epoch in tqdm(range(args.EPOCHS)):
    print()
    print('Training on batches...')
    train_labels, train_predicted, train_loss = train(train_dataloader, OPTIMIZER, SCHEDULER, DEVICE, model)
    train_acc = accuracy_score(train_labels, train_predicted)

    print('Validation on batches...')
    valid_labels, valid_predicted, val_loss = validation(val_dataloader, DEVICE, model)
    val_acc = accuracy_score(valid_labels, valid_predicted)

    print('Training loss: %.5f, \t Training Accuracy: %.5f;\nValidation Loss: %.5f \t Validation Accuracy: %.5f'
          % (train_loss, train_acc, val_loss, val_acc))
    print()

    # Record this epoch's metrics for the curves below.
    all_loss['train_loss'].append(train_loss)
    all_acc['train_acc'].append(train_acc)
    all_loss['val_loss'].append(val_loss)
    all_acc['val_acc'].append(val_acc)

# Loss and accuracy curves: training as a solid line, validation dashed.
plot_dict(all_loss, use_xlabel='Epochs', use_ylabel='Loss', use_linestyles=['-','--'])
plot_dict(all_acc, use_xlabel='Epochs', use_ylabel='Accuracy', use_linestyles=['-','--'])

In [None]:
# Score the test split with the same routine used for validation.
true_labels, predicted_labels, avg_epoch_loss = validation(test_dataloader, DEVICE, model)

# Per-class precision / recall / F1, reported in the order defined by LABEL_IDS.
evaluation_report = classification_report(
    true_labels,
    predicted_labels,
    labels=list(LABEL_IDS.values()),
    target_names=list(LABEL_IDS.keys()),
)
print(evaluation_report)

# Row-normalised confusion matrix over the same classes.
plot_confusion_matrix(
    y_true=true_labels,
    y_pred=predicted_labels,
    classes=list(LABEL_IDS),
    normalize=True,
    magnify=0.5,
)

In [None]:
# torch.save(model.state_dict(), '/content/results/model.pkl')
# torch.save(model.state_dict(), '/content/results/model.pth')

### Predict with model

In [None]:
# Tweets to analyse; `topicWiseStanceDetection` reads the `tweet` column of this frame.
user_df = pd.read_csv('HouseGOP.csv')
# One inner list per topic; each entry is a phrase that `isPhraseIn` searches
# for (whole-word, case-insensitive) inside a tweet. A tweet belongs to a topic
# as soon as it contains any one of the topic's phrases.
#
# Misspelled phrases can never match correctly spelled tweets, so the clear
# typos were corrected (barrett, sensitivity, independently, vacancies,
# sovereignty, entanglements, ukrainian, microaggression, fighting, reducing).
# NOTE(review): 'packing the cohort' (court?), 'obama plain power plan'
# (clean power plan?) and 'amtrack train tour' (amtrak?) also look unintended
# but were left unchanged -- confirm the wording that should be matched.
topicList = [
             ['recovering economy','dropped unemployment rate','v shaped recovery','k shaped recovery','country shutdown',
              'fewer jobs','covid crisis','heated trade war','great recession','buy american','eliminate Trump tax cuts',
              'student loan debt'],
             ['elections have consequences', 'fill US Supreme court seat','supreme court nominee','get rid of affordable care act',
              'amy coney barrett','supreme court appointments', 'ending the filibuster','packing the cohort'],
             ['covid crisis','deadly disease','trump panicked','save lives','economy shutdown','wear masks','vaccine','china plague',
              'reopening plan','big rallies','smaller rallies', 'dc lockdown'],
             ['race issues','equity in equality','decency','floyd murder','peaceful protest','black lives matter','generate racist hatred',
              '1994 crime bill super-predators','law enforcement','demand law and order','systemic injustice','violence is inappropriate',
              'end racial sensitivity training','increase in homicides','reimagining police','community police','prosecute violence', 'weapon ban'],
             ['vote','validate counting ballots','solicited ballot','irregularity in ballot','justice ballot by supreme court','testify votes','fair election',
              'manipulating ballots','pledge to not declare victory before ballots are independently certified','urge supporters to stay calm while counting',
              'transparency','voter fraud','voter suppression','swing vote','gerrymandering','interventionism'],
             ['impeachment hoax','con job','rebuilt military','judge vacancies','128 openings','more divided','more violent','caused recession','weaker','putins puppy',
              'hunter','fortune in moscow, ukraine, china','federal judges','veteran affairs','bronze star'],
             ['cut drug prices','public option to obamacare','government takeover of healthcare','socialize medicine','end obamacare'],
             ['fight pandemic','lower mortality rate','vaccine','operation warp speed','distributing ventilators','wear masks','rapid testing','national standards for reopening',
              'financial resources for reopening','increase in unemployment rate','committing suicide','depression','social distancing','plexi-glass separators','protect seniors',
              'trump panicked','sell short','remdesivir','self quarantine','sanitize','exercise','eat healthy','meditate'],
             ['healthcare','health insurance','affordable care act','terminated individual mandate','end obamacare','better healthcare','building on obamacare','bidencare',
              'obamacare with public option','eliminate private insurance','affordable healthcare','healthcare is right','socialized medicine','fracking','destroying medicare',
              'destroying social security','cut medicare','poverty','hunger','raise minimum wage','bail out small businesses','immigration','zero tolerance policy',
              'reunion of families'],
             ['institutional racism','free from violence','super predators','1994 crime bill','criminal justice reform','prison reform','eliminate minimum mandatories',
              'black lives matter movement is hate','climate of hate','banned muslims','bill on drug use','no jail for drug offense'],
             ['opportunity for jobs','best carbon emission standards','climate change','global warming','health and jobs are at stake','economic growth',
              'energy independent','fracking','zero emissions','sustainable','keep frontline communities safe','renewable energy','federal subsidy to oil industry',
              'forest fires in west','paris climate accord','climate change','lowest carbon','billion tree project','forest management','maintain forests','obama plain power plan',
              'green jobs','renewable energy','electric vehicles','net zero emission by 2035','global warming','green new deal'],
             ['security of elections','iran & russia influencing elections','iran sent messages to voters','intruders will pay a price','american sovereignty','russian pawn',
              'bounties to kill soldiers in afghanistan', 'interference from foreign adversaries','biden should lose','election security','business in china','pays tax in china',
              'tax returns of trump','corruption','big man','release tax return statements','foreign entanglements','bribe ukrainian','bank account in china','play by rules','denuclearization',
              'north korea was a mess','nationalism'],
             ['inauguration day','rebuilding america','best black unemployment numbers','road to success','cut taxes','new regulations','together with success','depression','401ks will go to hell',
              'inaugural address','hope over fear','chose to move forward','grow economy','deal with systemic racism','motivated by clean energy','create millions of jobs','character of the country',
              'everyone has an even chance'],
             ['black lives matter','george floyd','blm','justice for floyd','floyd protest','colorism','defunding the police','no justice no peace','solidarity','performative activism','microaggression',
              'black lives matter movement','police brutality','post-racial','racially motivated violence','george floyd protests','murder of George Floyd'],
             ['capitol hill','capitol riot','capitol attack','attempted violent overthrow','armed insurrection','assault on our democracy', 'mob riot','mob rule','insurrection','capitol storming',
              'capitol crowd','capitol hill attack','capitol incident','capitol hill incident','capitol','riot','peaceful protest'],
             ['US Elections','US Elections 2020','vote for biden','vote for trump','make america great again','promises made, promises kept','our best days still lie ahead',
              'building opportunity together','working people first','fighting for our future','we rise','win the era','a fair shot for everyone','one nation, one destiny',
              'lead with love','no more wars','sleepy joe','vote blue to save america','trump is losing','biden harris','debate 2020','election 2020','trump vs biden','voting',
              'voting rights','elections','voter fraud','super tuesday','referendum','silent majority','democratic socialist','interventionism','nationalism'],
             ['inauguration','biden harris inauguration','celebrate america','inauguration day','inauguration day 2021','celebrate with joe','vice president harris','unity','diversity','democracy'],
             ['medical marijuana','legalize marijuana','marijuana liberalization policies','marijuana decriminalization','marijuana liberalization','recreational marijuana','marijuana policy'],
             ['LGBTQ', 'community', 'equality rights', 'racial', 'ethnic', 'black', 'inequalities', 'dignity', 'reject', 'disabilities','lgbt rights','gender neutrality','inclusiveness','fair and equal treatment'],
             ['weapon ban','boycott','curtail','guns','violence', 'magazines', 'high capacity','gun control','gun violence','mass shootings','gun control laws','reduce gun violence','gun control act',
              'firearm owners protection act','assault weapons ban','march for our lives','international gun control','brady law','open carry','background checks'],
             ['express tour','trains','labour unions','build back express tour','backbone of america','amtrack train tour','dignity of work','union rights','i will fight for you','middle class built america'],
             ['maternal', 'healthcare', 'obamacare', 'medicare', 'affordable', 'clean', 'health', 'care','health coverage','medicaid eligibility','insurance coverage','US citizens','legal residents',
              'reform private insurance','MMA','medicare modernization act','quality of health care','merit based incentive payment system','reducing health care costs']]

In [None]:
def isPhraseIn(phrase, text):
    """Tell whether ``phrase`` occurs in ``text`` as a whole phrase, ignoring case.

    Args:
        phrase: literal text to look for; it is NOT interpreted as a regex.
        text: string to search in.

    Returns:
        ``True`` when ``phrase`` is found and is not glued to a word character
        on either side (so ``'vote'`` does not match inside ``'voter'``),
        ``False`` otherwise.
    """
    # Escape the phrase: it used to be interpolated verbatim, so characters
    # such as '.', '(' or '+' were treated as regex syntax and could match the
    # wrong text or raise re.error.
    # The look-arounds behave exactly like \b for phrases that start and end
    # with a word character, and additionally work when an edge of the phrase
    # is punctuation (where \b would demand a word character right next to it).
    pattern = r"(?<!\w){}(?!\w)".format(re.escape(phrase))
    return re.search(pattern, text, re.IGNORECASE) is not None

In [None]:
def topicWiseStanceDetection(user_df, topicList):
    """Print, for every topic, how the matching tweets split across stances.

    A tweet belongs to a topic when ``isPhraseIn`` finds at least one of the
    topic's phrases in it. Each matching tweet is classified with the
    module-level ``model`` / ``tokenizer`` on ``DEVICE`` and the predicted
    class id is mapped to a name through ``ID2LABEL``.

    Args:
        user_df: frame with a ``tweet`` column; it is not modified.
        topicList: iterable of phrase lists, one list per topic.

    Returns:
        None. For each topic the phrases, the shape of the matching frame,
        the (FAVOUR, AGAINST, NEUTRAL) counts, the same split in percent and a
        separator line are printed.

    Side effects:
        Switches ``model`` to eval mode.

    Fixes over the previous version:
        * a topic without any matching tweet no longer crashes (division by
          zero / missing ``stance`` column) and reports zeros instead;
        * the AGAINST tally compared the whole frame to ``'AGAINST'`` instead
          of its ``stance`` column;
        * ``drop_duplicates()`` was called without keeping its result; it is
          dropped because every row is selected at most once by construction;
        * inference runs in eval mode under ``torch.no_grad()`` and truncates
          over-long inputs instead of failing on them;
        * the topic frame is selected with one boolean mask instead of being
          grown row by row.

    NOTE(review): tweets are lower-cased before tokenization here, whereas
    ``loadTweetStanceDataset`` tokenizes the training tweets as-is -- confirm
    this mismatch is intended.
    """
    print(user_df.shape)

    # Stringify once so missing tweets (NaN floats) cannot break the search.
    tweets = [str(tweet) for tweet in user_df.tweet]
    stance_order = ('FAVOUR', 'AGAINST', 'NEUTRAL')

    # Make sure dropout is disabled even if the model was last used for training.
    model.eval()

    # Calculate stance over each topic
    for topic_keywords in topicList:
        print(topic_keywords)

        # True for every tweet mentioning at least one phrase of the topic.
        mask = np.array(
            [any(isPhraseIn(phrase, text) for phrase in topic_keywords) for text in tweets],
            dtype=bool,
        )
        topic_df = user_df.loc[mask].copy()
        topic_tweets = [text for text, hit in zip(tweets, mask) if hit]

        predictions = []
        with torch.no_grad():
            for text in topic_tweets:
                inputs = tokenizer(text.lower(), return_tensors='pt', truncation=True)
                inputs = inputs.to(DEVICE)
                logits = model(**inputs)[0]
                # argmax over the logits equals argmax over their softmax.
                predictions.append(ID2LABEL[int(logits.argmax(dim=1)[0])])
        topic_df['stance'] = predictions

        total = len(predictions)
        stanceCountTuple = tuple(predictions.count(stance) for stance in stance_order)
        # Percentage of tweets as per the stance (all zeros for an empty topic)
        stancePercentTuple = tuple(
            np.round(count / total * 100, 3) if total else np.round(0.0, 3)
            for count in stanceCountTuple
        )

        print(topic_df.shape, str(stanceCountTuple))
        print(stancePercentTuple)
        print('=============================================================================================================')

In [None]:
# Display the column names of the loaded frame (the stance routine needs `tweet`).
user_df.columns

In [None]:
# Classify the tweets topic by topic; the per-topic summary is printed by the function.
topicWiseStanceDetection(user_df, topicList)