In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
!pip install kaggle



In [13]:
# Create the kaggle directory and read the uploaded kaggle.json file
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [14]:
!chmod 600 /root/.kaggle/kaggle.json

In [15]:
# Download dataset
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

Downloading fake-and-real-news-dataset.zip to /content
 88% 36.0M/41.0M [00:00<00:00, 156MB/s] 
100% 41.0M/41.0M [00:00<00:00, 143MB/s]


In [16]:
# Unzip folder in Colab content folder
!unzip /content/fake-and-real-news-dataset.zip

Archive:  /content/fake-and-real-news-dataset.zip
  inflating: Fake.csv                
  inflating: True.csv                


In [17]:
!ls

 fake-and-real-news-dataset.zip   kaggle.json	       sample_data
 Fake.csv			 'news_posts(1).csv'   True.csv


In [18]:
df1 = pd.read_csv("Fake.csv")
df2 = pd.read_csv("True.csv")
df1["y"] = ["False"]*len(df1)
df2["y"] = ["True"]*len(df2)

In [19]:
frames = [df1, df2]
corpus = pd.concat(frames, ignore_index=True)

In [20]:
corpus

Unnamed: 0,title,text,subject,date,y
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",False
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",False
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",False
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",False
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",False
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",True
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",True
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",True
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",True


In [21]:
corpus.describe()

Unnamed: 0,title,text,subject,date,y
count,44898,44898.0,44898,44898,44898
unique,38729,38646.0,8,2397,2
top,Factbox: Trump fills top jobs for his administ...,,politicsNews,"December 20, 2017",False
freq,14,627.0,11272,182,23481


In [22]:
corpus["x"] = corpus["title"]

In [23]:
corpus

Unnamed: 0,title,text,subject,date,y,x
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",False,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",False,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",False,Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",False,Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",False,Pope Francis Just Called Out Donald Trump Dur...
...,...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",True,'Fully committed' NATO backs new U.S. approach...
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",True,LexisNexis withdrew two products from Chinese ...
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",True,Minsk cultural hub becomes haven from authorities
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",True,Vatican upbeat on possibility of Pope Francis ...


In [24]:
#Drop additional columns
corpus.drop(columns=["title","text", "subject", "date"], inplace=True)

#Drop blank rows
corpus['x'].dropna(inplace = True)

#Convert text to lowercase
corpus['x'] = [text.lower() for text in corpus['x']]

#Tokenization
corpus['x'] = [word_tokenize(text) for text in corpus['x']]

#WordNetLemmatizer
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [25]:
corpus

Unnamed: 0,y,x
0,False,"[donald, trump, sends, out, embarrassing, new,..."
1,False,"[drunk, bragging, trump, staffer, started, rus..."
2,False,"[sheriff, david, clarke, becomes, an, internet..."
3,False,"[trump, is, so, obsessed, he, even, has, obama..."
4,False,"[pope, francis, just, called, out, donald, tru..."
...,...,...
44893,True,"['fully, committed, ', nato, backs, new, u.s.,..."
44894,True,"[lexisnexis, withdrew, two, products, from, ch..."
44895,True,"[minsk, cultural, hub, becomes, haven, from, a..."
44896,True,"[vatican, upbeat, on, possibility, of, pope, f..."


In [26]:
membersProcessed = 0
for idx, text in enumerate(corpus['x']):
  finalWords = []
  word_net_lemmatizer = WordNetLemmatizer()
  set_stop = set(stopwords.words('english'))
  iterate = pos_tag(text)
  [finalWords.append(word_net_lemmatizer.lemmatize(word, tag_map[tag[0]])) for word, tag in iterate if word not in set_stop and word.isalpha()]
  corpus.loc[idx, 'finalText'] = str(finalWords)
  membersProcessed+=1
  print('Progress: {}/{} members processed'.format(membersProcessed, len(corpus)))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Progress: 39899/44898 members processed
Progress: 39900/44898 members processed
Progress: 39901/44898 members processed
Progress: 39902/44898 members processed
Progress: 39903/44898 members processed
Progress: 39904/44898 members processed
Progress: 39905/44898 members processed
Progress: 39906/44898 members processed
Progress: 39907/44898 members processed
Progress: 39908/44898 members processed
Progress: 39909/44898 members processed
Progress: 39910/44898 members processed
Progress: 39911/44898 members processed
Progress: 39912/44898 members processed
Progress: 39913/44898 members processed
Progress: 39914/44898 members processed
Progress: 39915/44898 members processed
Progress: 39916/44898 members processed
Progress: 39917/44898 members processed
Progress: 39918/44898 members processed
Progress: 39919/44898 members processed
Progress: 39920/44898 members processed
Progress: 39921/44898 members processed
Progress: 39922

In [27]:
y_encoder = LabelEncoder()
corpus['y'] = y_encoder.fit_transform(corpus['y'])

In [28]:
corpus

Unnamed: 0,y,x,finalText
0,0,"[donald, trump, sends, out, embarrassing, new,...","['donald', 'trump', 'send', 'embarrass', 'new'..."
1,0,"[drunk, bragging, trump, staffer, started, rus...","['drunk', 'brag', 'trump', 'staffer', 'start',..."
2,0,"[sheriff, david, clarke, becomes, an, internet...","['sheriff', 'david', 'clarke', 'become', 'inte..."
3,0,"[trump, is, so, obsessed, he, even, has, obama...","['trump', 'obsessed', 'even', 'obama', 'name',..."
4,0,"[pope, francis, just, called, out, donald, tru...","['pope', 'francis', 'call', 'donald', 'trump',..."
...,...,...,...
44893,1,"['fully, committed, ', nato, backs, new, u.s.,...","['commit', 'nato', 'back', 'new', 'approach', ..."
44894,1,"[lexisnexis, withdrew, two, products, from, ch...","['lexisnexis', 'withdraw', 'two', 'product', '..."
44895,1,"[minsk, cultural, hub, becomes, haven, from, a...","['minsk', 'cultural', 'hub', 'becomes', 'autho..."
44896,1,"[vatican, upbeat, on, possibility, of, pope, f...","['vatican', 'upbeat', 'possibility', 'pope', '..."


In [29]:
from sklearn.model_selection import KFold

In [30]:
kf = KFold(n_splits=10)

**Electra**

In [31]:
X = corpus['finalText']
y = corpus['y']

In [32]:
import random
import copy
import time
import pandas as pd
import numpy as np
import gc
import re
import torch as t

#import spacy
from tqdm import tqdm_notebook, tnrange
from tqdm.auto import tqdm

tqdm.pandas(desc='Progress')
from collections import Counter

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
import os 

# cross validation and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from sklearn.preprocessing import StandardScaler
from multiprocessing import  Pool
from functools import partial
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

In [33]:
!pip install -U torchtext==0.8.0



In [34]:
!pip install transformers



In [35]:
pip install simpletransformers



In [36]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [37]:
np.random.seed(500)

In [38]:
from sklearn import model_selection
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

**Reddit Dataset**

In [39]:
df = pd.read_csv('/content/news_posts(1).csv')
df

Unnamed: 0.1,Unnamed: 0,Title,Post Text,ID,Score,Total Comments,Post URL
0,0,People Are Accusing Robinhood Of Stealing From...,,l7afyx,181017,4408,https://www.buzzfeednews.com/article/clarissaj...
1,1,US Military Could Lose Space Force Trademark t...,,gyzw2p,129257,2844,https://www.cbr.com/us-military-lose-space-for...
2,2,White House threatens to fire anyone who tries...,,jrskag,126468,4141,https://americanindependent.com/white-house-th...
3,3,Meta's threat to close down Facebook and Insta...,,so0ree,126258,3786,https://www.cityam.com/metas-threat-to-close-d...
4,4,Don't eat or inject yourself with disinfectant...,,g6zci5,125437,7024,https://www.cnn.com/world/live-news/coronaviru...
...,...,...,...,...,...,...,...
4898,987,Colorado pizza delivery driver saves mans life...,,3b37gf,24221,898,http://www.postindependent.com/news/16943384-1...
4899,988,"Utah Woman Donates Over 1,200 Handmade Toys to...",,90pbng,24216,888,https://www.insideedition.com/utah-woman-donat...
4900,989,Year of the Tiger marks increase in tiger popu...,,sinyqg,24197,205,https://democratic-europe.eu/2022/02/01/%ef%bf...
4901,990,Florida cops deliver dresses made by a 99-year...,,8q8njr,24164,183,http://www.miamiherald.com/news/state/florida/...


In [40]:
df['X_test'] = df['Title']
df

Unnamed: 0.1,Unnamed: 0,Title,Post Text,ID,Score,Total Comments,Post URL,X_test
0,0,People Are Accusing Robinhood Of Stealing From...,,l7afyx,181017,4408,https://www.buzzfeednews.com/article/clarissaj...,People Are Accusing Robinhood Of Stealing From...
1,1,US Military Could Lose Space Force Trademark t...,,gyzw2p,129257,2844,https://www.cbr.com/us-military-lose-space-for...,US Military Could Lose Space Force Trademark t...
2,2,White House threatens to fire anyone who tries...,,jrskag,126468,4141,https://americanindependent.com/white-house-th...,White House threatens to fire anyone who tries...
3,3,Meta's threat to close down Facebook and Insta...,,so0ree,126258,3786,https://www.cityam.com/metas-threat-to-close-d...,Meta's threat to close down Facebook and Insta...
4,4,Don't eat or inject yourself with disinfectant...,,g6zci5,125437,7024,https://www.cnn.com/world/live-news/coronaviru...,Don't eat or inject yourself with disinfectant...
...,...,...,...,...,...,...,...,...
4898,987,Colorado pizza delivery driver saves mans life...,,3b37gf,24221,898,http://www.postindependent.com/news/16943384-1...,Colorado pizza delivery driver saves mans life...
4899,988,"Utah Woman Donates Over 1,200 Handmade Toys to...",,90pbng,24216,888,https://www.insideedition.com/utah-woman-donat...,"Utah Woman Donates Over 1,200 Handmade Toys to..."
4900,989,Year of the Tiger marks increase in tiger popu...,,sinyqg,24197,205,https://democratic-europe.eu/2022/02/01/%ef%bf...,Year of the Tiger marks increase in tiger popu...
4901,990,Florida cops deliver dresses made by a 99-year...,,8q8njr,24164,183,http://www.miamiherald.com/news/state/florida/...,Florida cops deliver dresses made by a 99-year...


In [41]:
#Drop additional columns
df.drop(columns=["Title","Post Text", "ID", "Score","Unnamed: 0","Post URL","Total Comments"], inplace=True)

#Drop blank rows
df['X_test'].dropna(inplace = True)

#Convert text to lowercase
df['X_test'] = [text.lower() for text in df['X_test']]

#Tokenization
df['X_test'] = [word_tokenize(text) for text in df['X_test']]

#WordNetLemmatizer
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [42]:
membersProcessed1 = 0
for idx, text in enumerate(df['X_test']):
  finalWords = []
  word_net_lemmatizer = WordNetLemmatizer()
  set_stop = set(stopwords.words('english'))
  iterate = pos_tag(text)
  [finalWords.append(word_net_lemmatizer.lemmatize(word, tag_map[tag[0]])) for word, tag in iterate if word not in set_stop and word.isalpha()]
  df.loc[idx, 'finalText'] = str(finalWords)
  membersProcessed1 +=1
  print('Progress: {}/{} members processed'.format(membersProcessed1, len(df)))

Progress: 1/4903 members processed
Progress: 2/4903 members processed
Progress: 3/4903 members processed
Progress: 4/4903 members processed
Progress: 5/4903 members processed
Progress: 6/4903 members processed
Progress: 7/4903 members processed
Progress: 8/4903 members processed
Progress: 9/4903 members processed
Progress: 10/4903 members processed
Progress: 11/4903 members processed
Progress: 12/4903 members processed
Progress: 13/4903 members processed
Progress: 14/4903 members processed
Progress: 15/4903 members processed
Progress: 16/4903 members processed
Progress: 17/4903 members processed
Progress: 18/4903 members processed
Progress: 19/4903 members processed
Progress: 20/4903 members processed
Progress: 21/4903 members processed
Progress: 22/4903 members processed
Progress: 23/4903 members processed
Progress: 24/4903 members processed
Progress: 25/4903 members processed
Progress: 26/4903 members processed
Progress: 27/4903 members processed
Progress: 28/4903 members processed
P

In [43]:
df_test = df['finalText']
df_test

0       ['people', 'accuse', 'robinhood', 'steal', 'po...
1       ['u', 'military', 'could', 'lose', 'space', 'f...
2       ['white', 'house', 'threaten', 'fire', 'anyone...
3       ['meta', 'threat', 'close', 'facebook', 'insta...
4       ['eat', 'inject', 'disinfectant', 'warn', 'fda...
                              ...                        
4898    ['colorado', 'pizza', 'delivery', 'driver', 's...
4899    ['utah', 'woman', 'donate', 'handmade', 'toy',...
4900    ['year', 'tiger', 'mark', 'increase', 'tiger',...
4901    ['florida', 'cop', 'deliver', 'dress', 'make',...
4902    ['mcdonald', 'worker', 'recieves', 'online', '...
Name: finalText, Length: 4903, dtype: object

In [44]:
Y_test1 = [True]*4903
df['Y_test1'] = Y_test1
df['Y_test1']

0       True
1       True
2       True
3       True
4       True
        ... 
4898    True
4899    True
4900    True
4901    True
4902    True
Name: Y_test1, Length: 4903, dtype: bool

In [45]:
y_encoder = LabelEncoder()
df['Y_test1'] = y_encoder.fit_transform(df['Y_test1'])

In [46]:
df_xtest = df['finalText']
df_ytest = df['Y_test1']

In [47]:
x1 = pd.concat([corpus['finalText'], df['finalText']])
y1 = pd.concat([corpus['y'], df['Y_test1']])

In [48]:
df3 = x1.to_frame()

In [49]:
df3

Unnamed: 0,finalText
0,"['donald', 'trump', 'send', 'embarrass', 'new'..."
1,"['drunk', 'brag', 'trump', 'staffer', 'start',..."
2,"['sheriff', 'david', 'clarke', 'become', 'inte..."
3,"['trump', 'obsessed', 'even', 'obama', 'name',..."
4,"['pope', 'francis', 'call', 'donald', 'trump',..."
...,...
4898,"['colorado', 'pizza', 'delivery', 'driver', 's..."
4899,"['utah', 'woman', 'donate', 'handmade', 'toy',..."
4900,"['year', 'tiger', 'mark', 'increase', 'tiger',..."
4901,"['florida', 'cop', 'deliver', 'dress', 'make',..."


In [50]:
df3['labels'] = y1

In [51]:
df3

Unnamed: 0,finalText,labels
0,"['donald', 'trump', 'send', 'embarrass', 'new'...",0
1,"['drunk', 'brag', 'trump', 'staffer', 'start',...",0
2,"['sheriff', 'david', 'clarke', 'become', 'inte...",0
3,"['trump', 'obsessed', 'even', 'obama', 'name',...",0
4,"['pope', 'francis', 'call', 'donald', 'trump',...",0
...,...,...
4898,"['colorado', 'pizza', 'delivery', 'driver', 's...",0
4899,"['utah', 'woman', 'donate', 'handmade', 'toy',...",0
4900,"['year', 'tiger', 'mark', 'increase', 'tiger',...",0
4901,"['florida', 'cop', 'deliver', 'dress', 'make',...",0


In [10]:
from simpletransformers.classification import MultiLabelClassificationModel


# Create a ClassificationModel
model = MultiLabelClassificationModel('electra', 'google/electra-small-discriminator', num_labels=2)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/51.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForMultiLabelSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForMultiLabelSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [54]:
# Create a TransformerModel with modified attributes
from simpletransformers.model import TransformerModel
model = TransformerModel('electra', 'google/electra-small-discriminator', num_labels=2, args={'learning_rate':1e-5, 'num_train_epochs': 3, 'reprocess_input_data': True, 'overwrite_output_dir': True})

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

In [55]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df3, test_size=0.2)


In [56]:
model.train_model(train)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/39840 [00:00<?, ?it/s]



Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 0 of 3:   0%|          | 0/4980 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/4980 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/4980 [00:00<?, ?it/s]

(14940, 0.2851439482745636)

In [57]:
from sklearn.metrics import f1_score, accuracy_score


def f1_multiclass(labels, preds):
    return f1_score(labels, preds, average='micro')
    
result, model_outputs, wrong_predictions = model.eval_model(test, f1=f1_multiclass, acc=accuracy_score)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/9961 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1246 [00:00<?, ?it/s]

In [58]:
print(result)
# print(model_outputs)
# print(wrong_predictions)

{'mcc': 0.8624325568166513, 'tp': 4115, 'tn': 5159, 'fp': 501, 'fn': 186, 'auroc': 0.9825721769035551, 'auprc': 0.9736898691200577, 'f1': 0.9310310209818291, 'acc': 0.9310310209818291, 'eval_loss': 0.2746645866580798}
