# Importing Libraries

In [None]:
!pip install langdetect
!pip install transformers



In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import re  
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import langdetect
from langdetect import detect # language detection

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Reading Data

In [None]:
# Authenticate the current Colab user.
from google.colab import auth
auth.authenticate_user()

# Build a PyDrive client whose credentials are the application-default
# Google credentials; `drive` is used by the download cell that follows.
from pydrive.drive import GoogleDrive
from pydrive.auth import GoogleAuth
from oauth2client.client import GoogleCredentials
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
link = 'https://drive.google.com/file/d/1v9xL-cWgvly83FwpNYoqY6k3LlLbA38Y/view?usp=sharing' # Shareable Drive link to the dataset
  
# The file id is the second-to-last "/"-separated segment of the link.
# NOTE(review): `id` shadows Python's built-in id() for the rest of the session.
id = link.split("/")[-2]

# Fetch the Drive file and store it locally as clean_data.csv
# (the next cell reads that path).
myfile = drive.CreateFile({'id':id})
myfile.GetContentFile('clean_data.csv')

In [None]:
# Load the downloaded CSV, using its first column as the DataFrame index,
# and display the frame.
df_model = pd.read_csv('clean_data.csv', index_col=0)
df_model

Unnamed: 0,id,ended,length,type,label,tidy_text,token_words
0,47018,True,102,train,real,The overarching quality the Bloomberg era was ...,"['The', 'overarching', 'quality', 'Bloomberg',..."
1,94338,True,142,train,real,This about bad morning commute gets all winter...,"['This', 'bad', 'morning', 'commute', 'gets', ..."
2,44507,True,631,train,real,More Saskatchewan Liquor Privatization latest ...,"['More', 'Saskatchewan', 'Liquor', 'Privatizat..."
3,163493,False,1024,train,real,Killen was here Patrick Killen helped define t...,"['Killen', 'Patrick', 'Killen', 'helped', 'def..."
4,91925,True,51,train,real,Mailbox Rental Mail Forwarding Services Delawa...,"['Mailbox', 'Rental', 'Mail', 'Forwarding', 'S..."
...,...,...,...,...,...,...,...
99995,88658,True,548,train,GPT-2,The official website for Donten Warau Maken an...,"['The', 'official', 'website', 'Donten', 'Wara..."
99996,7575,True,803,train,GPT-2,have long slow and somewhat unproductive relat...,"['long', 'slow', 'somewhat', 'unproductive', '..."
99997,38428,False,1024,train,GPT-2,Flexibility Not Option The Nourish School use ...,"['Flexibility', 'Not', 'Option', 'The', 'Nouri..."
99998,129005,True,137,train,GPT-2,Duck Dynasty Phil Robertson will have make due...,"['Duck', 'Dynasty', 'Phil', 'Robertson', 'make..."


In [None]:
# Sanity check: show the type of the loaded object.
type(df_model)

pandas.core.frame.DataFrame

# Word Probabilities

## GLTR

http://gltr.io/dist/index.html

https://github.com/HendrikStrobelt/detecting-fake-text


In [None]:
!git clone https://github.com/HendrikStrobelt/detecting-fake-text.git

fatal: destination path 'detecting-fake-text' already exists and is not an empty directory.


In [None]:
%cd /content/detecting-fake-text
!pip install -r requirements.txt

/content/detecting-fake-text


In [None]:
import numpy as np
import torch
import time
import nltk

from pytorch_pretrained_bert import (GPT2LMHeadModel, GPT2Tokenizer,
                                     BertTokenizer, BertForMaskedLM)

from matplotlib import pyplot as plt

class AbstractLanguageChecker:
    """
    Base class describing the backend API that GLTR talks to.

    A concrete backend inherits from this class and overrides
    `check_probabilities` and `postprocess`.
    """

    def __init__(self):
        """
        Select the torch device for the backend: CUDA when available,
        otherwise the CPU.

        Subclasses are expected to call this and then load whatever the
        other methods need -- typically a tokenizer and a model.
        """
        device_name = "cpu"
        if torch.cuda.is_available():
            device_name = "cuda"
        self.device = torch.device(device_name)

    def check_probabilities(self, in_text, topk=40):
        """
        Entry point GLTR uses to obtain per-token probabilities.

        Params:
        - in_text: str -- the text to analyse
        - topk: int -- how many head-of-distribution candidates to report

        Output:
        - payload: dict with the following entries

        Payload values
        ==============
        bpe_strings: list of str -- every individual token of the text
        real_topk: list of tuples -- (ranking, prob) of each token
        pred_topk: list of list of tuple -- (word, prob) for all topk

        Must be overridden; the base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def postprocess(self, token):
        """
        Clean a raw token of special characters and re-encode it: a
        leading space is marked with code point U+0120 and a line break
        with code point U+010A.

        :param token:  str -- raw token text
        :return: str -- cleaned and re-encoded token text

        Must be overridden; the base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError()


def top_k_logits(logits, k):
    '''
    Keep only the k largest logits along the last dimension.

    Every entry that is strictly smaller than the k-th largest value of its
    own row is replaced by -1e10, so a following softmax assigns it
    (effectively) zero probability. Entries tied with the k-th largest
    value are kept.

    Adapted from
    https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_gpt2.py

    Params:
    - logits: torch.Tensor -- scores with the candidates on the last axis
    - k: int -- number of entries to keep per row; 0 disables filtering

    Output:
    - torch.Tensor -- same shape and dtype as `logits` (the very same
      tensor object when k == 0)
    '''
    if k == 0:
        return logits
    values, _ = torch.topk(logits, k)
    # Keep a trailing singleton axis so each row is compared against its OWN
    # k-th largest value. Indexing with [:, -1] yields shape (batch,), which
    # broadcasts against the last axis of `logits` and is only correct for a
    # batch of one.
    min_values = values[..., -1, None]
    return torch.where(logits < min_values,
                       torch.full_like(logits, -1e10),
                       logits)



class LM(AbstractLanguageChecker):
    """
    GPT-2 backend for GLTR.

    Scores every token of a text by the probability (and rank) GPT-2
    assigns to it given the preceding tokens, and can also sample text
    unconditionally from the model.
    """

    def __init__(self, model_name_or_path="gpt2"):
        '''
        Load the GPT-2 tokenizer and LM-head model, move the model to
        self.device (chosen by the parent constructor) and put it in
        eval mode.

        Params:
        - model_name_or_path: str -- passed to `from_pretrained` of both
          the tokenizer and the model
        '''
        super(LM, self).__init__()
        self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
        self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
        self.model.to(self.device)
        self.model.eval()
        # Token prepended to every input so the first real token has a context.
        self.start_token = '<|endoftext|>'
        # print("Loaded GPT-2 model!")

    def check_probabilities(self, in_text, topk=40):
        '''
        Score each token of `in_text` under GPT-2.

        Params:
        - in_text: str -- the text to check
        - topk: int -- number of most likely candidates reported per position

        Output:
        - payload: dict with
          bpe_strings: list of str -- postprocessed tokens, start token first
          real_topk: list of (rank, prob) -- for each text token, its 0-based
            position in the model's descending probability ordering and its
            probability rounded to 5 decimals
          pred_topk: list of list of (token, prob) -- the `topk` most likely
            postprocessed tokens and their rounded probabilities per position
        '''
        # Process input
        start_t = torch.full((1, 1),
                             self.enc.encoder[self.start_token],
                             device=self.device,
                             dtype=torch.long)
        context = self.enc.encode(in_text)
        context = torch.tensor(context,
                               device=self.device,
                               dtype=torch.long).unsqueeze(0)
        context = torch.cat([start_t, context], dim=1)
        # Forward through the model. This is pure inference, so disable
        # autograd: otherwise a graph is built and kept for every scored text.
        with torch.no_grad():
            logits, _ = self.model(context)

        # construct target and pred: the distribution at position i is
        # compared with the token that actually follows at position i + 1
        yhat = torch.softmax(logits[0, :-1], dim=-1)
        y = context[0, 1:]
        # Sort the predictions for each timestep (most likely first)
        sorted_preds = np.argsort(-yhat.data.cpu().numpy())
        # [(pos, prob), ...]
        real_topk_pos = [
            int(np.where(sorted_preds[i] == y[i].item())[0][0])
            for i in range(y.shape[0])]
        real_topk_probs = yhat[np.arange(
            0, y.shape[0], 1), y].data.cpu().numpy().tolist()
        real_topk_probs = list(map(lambda x: round(x, 5), real_topk_probs))

        real_topk = list(zip(real_topk_pos, real_topk_probs))
        # [str, str, ...]
        bpe_strings = [self.enc.decoder[s.item()] for s in context[0]]

        bpe_strings = [self.postprocess(s) for s in bpe_strings]

        # [[(pos, prob), ...], [(pos, prob), ..], ...]
        pred_topk = [
            list(zip([self.enc.decoder[p] for p in sorted_preds[i][:topk]],
                     list(map(lambda x: round(x, 5),
                              yhat[i][sorted_preds[i][
                                      :topk]].data.cpu().numpy().tolist()))))
            for i in range(y.shape[0])]

        pred_topk = [[(self.postprocess(t[0]), t[1]) for t in pred] for pred in pred_topk]
        payload = {'bpe_strings': bpe_strings,
                   'real_topk': real_topk,
                   'pred_topk': pred_topk}
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return payload

    def sample_unconditional(self, length=100, topk=5, temperature=1.0):
        '''
        Sample `length` tokens from the model, starting from the start token.
        Code strongly inspired by
        https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_gpt2.py

        Params:
        - length: int -- number of sampling steps
        - topk: int -- candidates kept per step (see top_k_logits)
        - temperature: float -- divisor applied to the logits before filtering

        Output:
        - str -- the decoded sequence (start token followed by the samples)
        '''
        context = torch.full((1, 1),
                             self.enc.encoder[self.start_token],
                             device=self.device,
                             dtype=torch.long)
        prev = context
        output = context
        # State returned by the model and fed back on the next step
        # (presumably its cached attention keys/values -- confirm).
        past = None
        # Forward through the model
        with torch.no_grad():
            for i in range(length):
                logits, past = self.model(prev, past=past)
                logits = logits[:, -1, :] / temperature
                # Filter predictions to topk and softmax
                probs = torch.softmax(top_k_logits(logits, k=topk),
                                      dim=-1)
                # Sample
                prev = torch.multinomial(probs, num_samples=1)
                # Construct output
                output = torch.cat((output, prev), dim=1)

        output_text = self.enc.decode(output[0].tolist())
        return output_text

    def postprocess(self, token):
        '''
        Clean a raw vocabulary token for display.

        - A leading 'Ġ' (U+0120) is stripped, the remainder is cleaned, and
          the marker is put back in front.
        - A token starting with 'â' becomes a single space.
        - A token starting with 'Ċ' (U+010A) becomes the marker U+010A
          followed by a space.
        - After that, a remainder starting with 'â', 'ľ', 'Ŀ' or 'Ļ' is
          replaced by '-', '“', '”' or "'" respectively.

        :param token:  str -- raw token text
        :return: str -- cleaned and re-encoded token text
        '''
        with_space = False
        with_break = False
        if token.startswith('Ġ'):
            with_space = True
            token = token[1:]
            # print(token)
        elif token.startswith('â'):
            token = ' '
        elif token.startswith('Ċ'):
            token = ' '
            with_break = True

        # This 'â' check is only reachable for tokens that had a leading 'Ġ';
        # a bare leading 'â' was already turned into a space above.
        token = '-' if token.startswith('â') else token
        token = '“' if token.startswith('ľ') else token
        token = '”' if token.startswith('Ŀ') else token
        token = "'" if token.startswith('Ļ') else token

        if with_space:
            token = '\u0120' + token
        if with_break:
            token = '\u010A' + token

        return token

In [None]:
from statistics import mean

In [None]:
def plot_text(vals, what, name):
    """
    Save a line plot of per-token values to `name + ".png"`.

    Params:
    - vals: sequence -- vals[0] holds the probabilities, vals[1] the ranks
    - what: str -- "prob" plots vals[0] with the y-axis fixed to [0, 1];
      "rank" plots vals[1] with the y-axis fixed to [-1000, 50000];
      any other value makes the function return without plotting
    - name: str -- output path without the ".png" suffix

    Each call draws on its own figure and closes it after saving, so
    repeated calls do not overlay earlier curves in the saved image and
    do not accumulate open figures.
    """
    # plot kind -> (index into vals, y-axis limits)
    settings = {
        "prob": (0, (0, 1)),
        "rank": (1, (-1000, 50000)),
    }
    if what not in settings:
        return
    index, (y_low, y_high) = settings[what]
    ourvals = vals[index]
    x = list(range(1, len(ourvals) + 1))

    fig, ax = plt.subplots()
    try:
        ax.plot(x, ourvals, color='orange')
        ax.set_ylim(y_low, y_high)
        fig.savefig(name + ".png")
    finally:
        # Release the figure even when saving fails.
        plt.close(fig)
def main_code(raw_text, lm=None):
    """
    Return the mean probability GPT-2 assigns to the tokens that actually
    occur in `raw_text`.

    Params:
    - raw_text: str -- the text to score
    - lm: optional -- object exposing check_probabilities(text, topk=...);
      when omitted, one LM() is created on the first call and reused by
      later calls instead of reloading the model every time

    Output:
    - float -- mean of the per-token probabilities in payload["real_topk"]

    Raises:
    - statistics.StatisticsError -- when the payload contains no tokens
    """
    if lm is None:
        # Building LM() loads the GPT-2 weights, which is far too expensive
        # to repeat per text (e.g. inside DataFrame.apply); cache one instance
        # on the function object.
        lm = getattr(main_code, "_shared_lm", None)
        if lm is None:
            lm = LM()
            main_code._shared_lm = lm

    payload = lm.check_probabilities(raw_text, topk=5)
    # real_topk holds (rank, prob) pairs; only the probabilities are averaged.
    probs = [prob for _rank, prob in payload["real_topk"]]
    return mean(probs)

In [None]:
# Spot-check the scorer on one text: entry 75000 of the 'tidy_text' column.
main_code(df_model['tidy_text'][75000])

0.16807669404517453

In [None]:
# Work on the first 500 rows only, then display the subset.
test = df_model[:500]
test

Unnamed: 0,id,ended,length,type,label,tidy_text,token_words
0,47018,True,102,train,real,The overarching quality the Bloomberg era was ...,"['The', 'overarching', 'quality', 'Bloomberg',..."
1,94338,True,142,train,real,This about bad morning commute gets all winter...,"['This', 'bad', 'morning', 'commute', 'gets', ..."
2,44507,True,631,train,real,More Saskatchewan Liquor Privatization latest ...,"['More', 'Saskatchewan', 'Liquor', 'Privatizat..."
3,163493,False,1024,train,real,Killen was here Patrick Killen helped define t...,"['Killen', 'Patrick', 'Killen', 'helped', 'def..."
4,91925,True,51,train,real,Mailbox Rental Mail Forwarding Services Delawa...,"['Mailbox', 'Rental', 'Mail', 'Forwarding', 'S..."
...,...,...,...,...,...,...,...
495,73877,True,659,train,real,Old baseball equipment file photo Photo Jupite...,"['Old', 'baseball', 'equipment', 'file', 'phot..."
496,124016,True,422,train,real,Filming the untitled fourth Avengers film curr...,"['Filming', 'untitled', 'fourth', 'Avengers', ..."
497,108272,False,1024,train,real,Frank Tanana Careeer Stats What does this mean...,"['Frank', 'Tanana', 'Careeer', 'Stats', 'What'..."
498,117827,False,1024,train,real,Turns out wonky website and warp speed policy ...,"['Turns', 'wonky', 'website', 'warp', 'speed',..."


In [None]:
# Score every row of the subset: main_code is applied row-wise to the
# 'tidy_text' column, giving one mean token probability per row.
pred = test.apply(lambda row: main_code(row['tidy_text']), axis=1)
pred

0      0.029103
1      0.062097
2      0.080715
3      0.103187
4      0.082092
         ...   
495    0.099868
496    0.235695
497    0.150966
498    0.106068
499    0.080466
Length: 500, dtype: float64