In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import joblib

import spacy
import contractions
import string
import re
from bs4 import BeautifulSoup
from unidecode import unidecode
import nltk
import swifter
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('omw-1.4')

from sklearn.pipeline import Pipeline
from transformers import AutoTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix , classification_report, ConfusionMatrixDisplay
from multiprocessing import Pool
from sklearn.feature_extraction.text import HashingVectorizer


from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV

In [2]:
news_df = pd.read_csv('LIAR.csv')

In [3]:
news_df.head()

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Speaker Job Title,State Info,Party,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,context
0,2635.json,FALSE,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,FALSE,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [4]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12789 entries, 0 to 12788
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ID                    12789 non-null  object 
 1   Label                 12789 non-null  object 
 2   Statement             12789 non-null  object 
 3   Subject               12787 non-null  object 
 4   Speaker               12787 non-null  object 
 5   Speaker Job Title     9222 non-null   object 
 6   State Info            10040 non-null  object 
 7   Party                 12787 non-null  object 
 8   barely true counts    12787 non-null  float64
 9   false counts          12787 non-null  float64
 10  half true counts      12787 non-null  float64
 11  mostly true counts    12787 non-null  float64
 12  pants on fire counts  12787 non-null  float64
 13  context               12658 non-null  object 
dtypes: float64(5), object(9)
memory usage: 1.4+ MB


In [5]:
news_df['text'] = news_df['Subject'] + " " +news_df['Speaker'] + " " +news_df['Statement'] + " " +news_df['context'] #context being tested
news_df = news_df.drop(['Subject', 'Speaker', 'Statement', 'context','ID', 'Speaker Job Title', 'State Info', 'Party', 'barely true counts', 'false counts',
         'half true counts', 'mostly true counts', 'pants on fire counts'], axis=1)

In [6]:
news_df.info()
news_df = news_df.dropna()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12789 entries, 0 to 12788
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   12789 non-null  object
 1   text    12658 non-null  object
dtypes: object(2)
memory usage: 200.0+ KB


In [7]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12658 entries, 0 to 12788
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   12658 non-null  object
 1   text    12658 non-null  object
dtypes: object(2)
memory usage: 296.7+ KB


In [8]:
#Remove rows with blank space instead of text
blank_rows = news_df[news_df['text'].str.strip() == ''].index
news_df = news_df.drop(blank_rows)
news_df = news_df.reset_index(drop=True)

#Unicode to ascii if exists
def replace_foreign_chars(text):
    return unidecode(text)

news_df['text'] = news_df['text'].apply(replace_foreign_chars)

#Constractions handling
def expand_contractions(text):
    return contractions.fix(text)

news_df['text'] = news_df['text'].apply(expand_contractions)

#Convert text to lowercase text
def lowercase_text(text):
    return " ".join(x.lower() for x in text.split() )

news_df['text'] = news_df['text'].apply(lowercase_text)

#Remove URLs and special Characters
def remove_urls(text):
    cleaned_text = re.sub(r'http\S+', '', text)
    return cleaned_text

def remove_special_characters(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text

news_df['text'] = news_df['text'].apply(remove_urls)
news_df['text'] = news_df['text'].apply(remove_special_characters)

#Remove html tags func
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    cleaned_text = soup.get_text()
    return cleaned_text

news_df['text'] = news_df['text'].apply(remove_html_tags)

#Delete numbers from dataset
news_df["text"] = news_df["text"].str.replace('\d','')

#Punctuations hadnling
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    text_without_punctuation = text.translate(translator)
    return text_without_punctuation

news_df['text'] = news_df['text'].apply(remove_punctuation)

In [14]:
news_df['Label'].value_counts()

0    4457
1    3512
Name: Label, dtype: int64

In [10]:
values = ['half-true', 'barely-true']

news_df = news_df[news_df.Label.isin(values) == False]

In [12]:
news_df['Label'] = news_df['Label'].replace({'FALSE' : 1, 'pants-fire' : 1, 'mostly-true' : 0, 'TRUE' : 0})


In [15]:
news_df.to_csv('LIAR_preprocessed_nolemma.csv', index=False)

In [13]:
news_df.head()

Unnamed: 0,Label,text
0,1,abortion dwaynebohac says the annies list poli...
2,0,foreignpolicy barackobama hillary clinton agre...
3,1,healthcare blogposting health care reform legi...
5,0,education robinvos the chicago bears have had ...
9,0,energymessagemachinevotingrecord dueystroebel ...


In [16]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7969 entries, 0 to 12656
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   7969 non-null   int64 
 1   text    7969 non-null   object
dtypes: int64(1), object(1)
memory usage: 186.8+ KB
