In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from readability import Readability
from transformers import pipeline
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
real = pd.read_csv('data/True.csv')

In [3]:
real["label"] = 0

In [4]:
fake = pd.read_csv('data/Fake.csv')

In [5]:
fake["label"] = 1

In [6]:
data_df = pd.concat([real, fake], ignore_index=True).drop(['title','subject','date'],axis=1)

In [7]:
data_df.count()

text     44898
label    44898
dtype: int64

In [8]:
def preprocess(df):
    df['text'].astype(str)
    df['clean_text'] = df['text'].str.replace('http\S+|www.\S+', '', case=False) #Removing urls
    #df['clean_text'] = df['clean_text'].str.replace('[^' + string.printable + ']', '')
    df['clean_text'] = df['clean_text'].str.replace('[^A-Za-z0-9]+', ' ') # Removing Punctuations, Numbers, and Special Character 
    df['clean_text'] = df['clean_text'].str.replace('\s+', ' ')
    df['clean_text'] = df['clean_text'].map(lambda x: x if type(x)!=str else x.strip().lower()) #lowercase
    df['clean_text'].dropna(inplace=True) #drop NaN values
    return df


In [9]:
clean_df = preprocess(data_df)

  df['clean_text'] = df['text'].str.replace('http\S+|www.\S+', '', case=False) #Removing urls
  df['clean_text'] = df['clean_text'].str.replace('[^A-Za-z0-9]+', ' ') # Removing Punctuations, Numbers, and Special Character
  df['clean_text'] = df['clean_text'].str.replace('\s+', ' ')


In [10]:
stopwords = nltk.corpus.stopwords.words("english")

def remove_stopwords(text):
    token_text = nltk.word_tokenize(text)
    remove_stop = [word for word in token_text if word not in stopwords]
    join_text = ' '.join(remove_stop)
    return join_text

In [11]:
clean_df['clean_text'] = clean_df.clean_text.apply(lambda x : remove_stopwords(x))

In [12]:
clean_df.count()

text          44898
label         44898
clean_text    44898
dtype: int64

In [13]:
sentiment_pipeline = pipeline("sentiment-analysis")
block_size = 350

def analyze_sentiment(text_to_analyze):
    words = text_to_analyze.split()
    txt_blocks = []
    if len(words) > block_size:
        while len(words) > block_size:
            txt_blocks.append(" ".join(words[:block_size]))
            words = words[block_size:]
        # last block - just adding it if it has decent amount of tokens
        if len(words) > 250:
            txt_blocks.append(" ".join(words))
    else:
        txt_blocks.append(" ".join(words))
    
    first_sent_res = None
    overall_sent = 0
    for i, blk in enumerate(txt_blocks):
        try:
            sentiment_res = sentiment_pipeline([blk])
        except:
            print(len(blk.split()))
            continue
    
        sent = sentiment_res[0].get("label").lower()
        if sent == "positive":
            overall_sent += 1
            if first_sent_res is None:
                first_sent_res = 1
        elif sent == "negative":
            overall_sent -= 1
            if first_sent_res is None:
                first_sent_res = 0
        else:
            raise RuntimeError(f"unknown label from sentiment analyzer: {sent}")
    
    if overall_sent > 0:
        return 1
    elif overall_sent < 0:
        return 0
    else:
        return first_sent_res

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
No CUDA runtime is found, using CUDA_HOME='C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.0'
    PyTorch 2.0.1+cu118 with CUDA 1108 (you have 2.0.1+cpu)
    Python  3.10.11 (you have 3.10.4)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


In [14]:
def generate_feat(input_df):
    out_df = pd.DataFrame()
    skip_pos_tags = ['``',':', '.', "''", '$', 'NNPS', 'WDT', 'RBS', 'WP', 'POS', 'UH', 'WRB', 'EX', 'PRP$', 'TO', 'SYM', 'PDT', 'RP', 'CC', 'FW', 'PRP', 'WP$', 'NNP', 'DT', 'RBR']
    skip_pos_tags_dict = {}
    for tag in skip_pos_tags:
        skip_pos_tags_dict[tag] = True
    
    for i in range(len(input_df)):
        print(f"{i} / {len(input_df)}", end='\r')
        clean_txt = input_df.loc[i, "clean_text"]
        out_df.loc[i, "text"] = clean_txt
        out_df.loc[i, "label"] = input_df.loc[i, "label"]
        out_df.loc[i, "token_length"] = len(clean_txt.split())
        
        word_pos_tags = nltk.pos_tag(word_tokenize(clean_txt))
        for word in word_pos_tags:
            tag = "WTAG_" + word[1]
            if skip_pos_tags_dict.get(word[1]):
                continue
            try:
                if np.isnan(out_df.loc[i, tag]): 
                    out_df.loc[i, tag] = 1
                else:
                    out_df.loc[i, tag] += 1
            except KeyError:
                out_df.loc[i, tag] = 1
        
        if len(clean_txt.split()) > 100:
            r = Readability(clean_txt)
            out_df.loc[i, "fk_score"] = r.flesch_kincaid().score
            out_df.loc[i, "flesh_score"] = r.flesch().score
            out_df.loc[i, "ari_score"] = r.ari().score
            out_df.loc[i, "cl_score"] = r.coleman_liau().score
            out_df.loc[i, "gf_score"] = r.gunning_fog().score
        else:
            out_df.loc[i, "fk_score"] = \
            out_df.loc[i, "flesh_score"] = \
            out_df.loc[i, "ari_score"] = \
            out_df.loc[i, "cl_score"] = \
            out_df.loc[i, "gf_score"] = 0
        
        out_df.loc[i, "sentiment"] = analyze_sentiment(str(clean_txt))

    return out_df

In [15]:
feat_df = generate_feat(clean_df)

21487 / 44898

Token indices sequence length is longer than the specified maximum sequence length for this model (526 > 512). Running this sequence through the model will result in indexing errors


35088 / 44898
35096 / 44898
35004 / 44898
35088 / 44898
35005 / 44898
35018 / 44898
35046 / 44898
35068 / 44898
35049 / 44898
35093 / 44898
34613 / 44898
35022 / 44898
35062 / 44898
35045 / 44898
35078 / 44898
35003 / 44898
35081 / 44898
35093 / 44898
35096 / 44898
35009 / 44898
35088 / 44898
35036 / 44898
35007 / 44898
35013 / 44898
35062 / 44898
35074 / 44898
35003 / 44898
35020 / 44898
35021 / 44898
35059 / 44898
35073 / 44898
35007 / 44898
35016 / 44898
35042 / 44898
35045 / 44898
32946 / 44898
35056 / 44898
35074 / 44898
35023 / 44898
35095 / 44898
35041 / 44898
35076 / 44898
35099 / 44898
35019 / 44898
337
35030 / 44898
31533 / 44898
33138 / 44898
32245 / 44898
34274 / 44898
35006 / 44898
35013 / 44898
35016 / 44898
35023 / 44898
35033 / 44898
35040 / 44898
35079 / 44898
35085 / 44898
35098 / 44898
35002 / 44898
34724 / 44898
35033 / 44898
35036 / 44898
35051 / 44898
35067 / 44898
35070 / 44898
35082 / 44898
35093 / 44898
34198 / 44898
35093 / 44898
35094 / 44898
35087 / 44898
34

In [16]:
feat_df.to_csv('data/feat_new.csv', index=False)