In [0]:
import pandas as pd
import csv
import numpy as np
import nltk
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import seaborn as sb


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import nltk.corpus 
from nltk.tokenize import word_tokenize
from gensim.models.word2vec import Word2Vec
nltk.download('treebank')

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!unzip -uq "drive/My Drive/PFA/PFA.zip" -d "drive/My Drive/PFA"

In [0]:
test_filename = 'drive/My Drive/PFA/test.csv'
train_filename = 'drive/My Drive/PFA/train.csv'
valid_filename = 'drive/My Drive/PFA/valid.csv'

train_news = pd.read_csv(train_filename)
test_news = pd.read_csv(test_filename)
valid_news = pd.read_csv(valid_filename)

In [0]:
train_news.drop(train_news.filter(regex="Unname"),axis=1, inplace=True)
test_news.drop(test_news.filter(regex="Unname"),axis=1, inplace=True)
valid_news.drop(valid_news.filter(regex="Unname"),axis=1, inplace=True)

In [0]:
#data observation
def data_obs():
    print("training dataset size:")
    print(train_news.shape)
    print(train_news.head(10))

    #below dataset were used for testing and validation purposes
    print(test_news.shape)
    print(test_news.head(10))
    
    print(valid_news.shape)
    print(valid_news.head(10))

#check the data by calling below function
data_obs()

In [0]:
#distribution of classes for prediction
def create_distribution(dataFile):
    
    return sb.countplot(x='label', data=dataFile, palette='hls')
    

#by calling below we can see that training, test and valid data seems to be failry evenly distributed between the classes
create_distribution(train_news)
create_distribution(test_news)
create_distribution(valid_news)


In [0]:
def data_qualityCheck():
    
    print("Checking data qualitites...")
    train_news.isnull().sum()
    train_news.info()
        
    print("check finished.")

    #below datasets were used to 
    test_news.isnull().sum()
    test_news.info()

    valid_news.isnull().sum()
    valid_news.info()

#run the below function call to see the quality check results

data_qualityCheck()

In [0]:
import nltk
nltk.download('stopwords')

eng_stemmer = SnowballStemmer('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

#Stemming
def stem_tokens(tokens, stemmer):
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

#process the data
def process_data(data,exclude_stopword=True,stem=True):
    tokens = [w.lower() for w in data]
    tokens_stemmed = tokens
    tokens_stemmed = stem_tokens(tokens, eng_stemmer)
    tokens_stemmed = [w for w in tokens_stemmed if w not in stopwords ]
    return tokens_stemmed

In [0]:
#creating ngrams
#unigram 
def create_unigram(words):
    assert type(words) == list
    return words

#bigram
def create_bigrams(words):
    assert type(words) == list
    skip = 0
    join_str = " "
    Len = len(words)
    if Len > 1:
        lst = []
        for i in range(Len-1):
            for k in range(1,skip+2):
                if i+k < Len:
                    lst.append(join_str.join([words[i],words[i+k]]))
    else:
        #set it as unigram
        lst = create_unigram(words)
    return lst

In [0]:
porter = PorterStemmer()

def tokenizer(text):
    return text.split()


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [0]:
for i in range (len(train_news['label'])) :
  if (train_news['label'][i]=="real"):
    train_news['label'][i]="true"
  else:
    train_news['label'][i]="false"



In [0]:
for i in range (len(test_news['label'])) :
  if (test_news['label'][i]=="real"):
    test_news['label'][i]="true"
  else:
    test_news['label'][i]="false"

In [0]:
for i in range (len(valid_news['label'])) :
  if (valid_news['label'][i]=="real"):
    valid_news['label'][i]="true"
  else:
    valid_news['label'][i]="false"

In [0]:
#we will start with simple bag of words technique 
#creating feature vector - document term matrix
countV = CountVectorizer()
train_count = countV.fit_transform(train_news['text'].values)

print(countV)
print(train_count)

In [0]:
#print training doc term matrix
#we have matrix by calling below
def get_countVectorizer_stats():
    
    #vocab size
    train_count.shape

    #check vocabulary using below command
    print(countV.vocabulary_)

    #get feature names
    print(countV.get_feature_names()[:25])

get_countVectorizer_stats()

In [0]:
tfidfV = TfidfTransformer()
train_tfidf = tfidfV.fit_transform(train_count)



In [0]:
#bag of words - with n-grams

tfidf_ngram = TfidfVectorizer(stop_words='english',ngram_range=(1,4),use_idf=True,smooth_idf=True)

In [0]:

#POS Tagging
tagged_sentences = nltk.corpus.treebank.tagged_sents()

cutoff = int(.75 * len(tagged_sentences))
training_sentences = train_news['text']
 
print(training_sentences)

In [0]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import  LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

In [0]:
#building classifier using naive bayes 
nb_pipeline = Pipeline([
        ('NBCV',countV),
        ('nb_clf',MultinomialNB())])

nb_pipeline.fit(train_news['text'],train_news['label'])
predicted_nb = nb_pipeline.predict(test_news['text'])
np.mean(predicted_nb == test_news['label'])


In [0]:
#building Linear SVM classfier
svm_pipeline = Pipeline([
        ('svmCV',countV),
        ('svm_clf',svm.LinearSVC())
        ])

svm_pipeline.fit(train_news['text'],train_news['label'])
predicted_svm = svm_pipeline.predict(test_news['text'])
np.mean(predicted_svm == test_news['label'])

In [0]:
#logistic regression classifier
logR_pipeline_ngram = Pipeline([
        ('LogR_tfidf',tfidf_ngram),
        ('LogR_clf',LogisticRegression(penalty="l2",C=1))
        ])

logR_pipeline_ngram.fit(train_news['text'],train_news['label'])
predicted_LogR_ngram = logR_pipeline_ngram.predict(test_news['text'])
np.mean(predicted_LogR_ngram == test_news['label'])

In [0]:
#saving model to the disk
model_file = 'drive/My Drive/PFA/final_model.sav'
pickle.dump(logR_pipeline_ngram,open(model_file,'wb'))

In [4]:
import pickle

from google.colab import drive
drive.mount('/content/drive')

load_model = pickle.load(open('drive/My Drive/PFA/final_model.sav', 'rb'))
#function to run for prediction
def detecting_fake_news(var):    
#retrieving the best model for prediction call
    
    prediction = load_model.predict([var])
    prob = load_model.predict_proba([var])

    return (print("The given statement is ",prediction[0]),
        print("The truth probability score is ",prob[0][1]))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
var1 = "ABUJA (Reuters) - A major Nigerian oil union suspended a nationwide strike on Monday, the same day it began, after a dispute resolution ended with a domestic oil and gas company recalling laid off staff, the union s president said. "
detecting_fake_news(var1)


The given statement is  true
The truth probability score is  0.7750991349698357


(None, None)

In [7]:

var2 = "Republicans were just given a leg up over Democrats in this fall s presidential election in the battleground state of North Carolina, and they have a judge put in place by George W. Bush to thank for it.Federal judge Thomas D. Schroeder decided in favor of Republican legislators in court on Monday, letting a controversial voter ID law stay in place despite strong objections from civil rights groups.Research on Voter ID laws have shown that these laws are often a reliable way for Republican conservatives to cut down on voters that often vote for Democrats, especially minorities and young voters.The judge, Thomas D. Schroeder of Federal District Court in Winston-Salem, wrote near the end of his 485-page opinion that  North Carolina has provided legitimate state interests for its voter ID requirement and electoral system. North Carolina s voter identification law requires people to display one of six credentials, such as a driver s license or passport, before casting a ballot. Those who cannot may complete a  reasonable impediment declaration  and cast a provisional ballot.Schroeder was officially put in place on January 8, 2008, at the beginning of George W. Bush s last year in office.The North Carolina law also banned same-day registration, cut down on the amount of days available for early voting, and stops 16 and 17-year-olds from preregistering to vote.An expert testified at the trial that the law was designed in a way to put extra burden on black and Latino voters. Republican legislators and the state s GOP governor Pat McCrory deny the claim.In 2012, a Republican Pennsylvania State House leader bragged that that state s voter ID laws would  allow Governor Romney to win the state of Pennsylvania  (he didn t), while recently a Republican congressman from Wisconsin said voter ID would make the state   which has recently voted for Democrats   competitive in the fall for Republicans.President Obama won North Carolina in 2008 by 0.32% then lost it in 2012 by 2.04%. Polling in March showed the race in North Carolina effectively a toss-up between the Democratic and Republican presidential front runners.Featured image via Flickr "

detecting_fake_news(var2)

The given statement is  false
The truth probability score is  0.36148812168892314


(None, None)

In [17]:
var = input("Please enter the news text you want to verify: ")
print("You entered: " + str(var))

detecting_fake_news(var)

Please enter the news text you want to verify: LOS ANGELES — A precipitous drop in applications for green cards, citizenship and other programs has threatened the solvency of the federal agency that administers the country’s lawful immigration system, prompting it to seek a $1.2 billion cash infusion from Congress as well as fee hikes to stay afloat.  The United States Citizenship and Immigration Services, which relies on the fees that it charges applicants to fund its operations, said that it could run out of money by the summer because the coronavirus pandemic had resulted in far fewer people applying for visas and other benefits.
You entered: LOS ANGELES — A precipitous drop in applications for green cards, citizenship and other programs has threatened the solvency of the federal agency that administers the country’s lawful immigration system, prompting it to seek a $1.2 billion cash infusion from Congress as well as fee hikes to stay afloat.  The United States Citizenship and Immig

(None, None)