# Detection of Fake News

## Part 1: Understanding the data 

In [14]:
import numpy as np
import os
import pandas as pd 
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from string import digits
os.chdir(r"E:\Masaüstü\bbm409-assignment2") 

In [3]:
dft=pd.read_csv("fake_news_train.csv")
dft = dft.dropna(subset=['title'])

In [4]:
#counting most weighted words in the titles
tv = TfidfVectorizer(stop_words='english',smooth_idf=True)
cv=CountVectorizer(stop_words='english')
trueDocuments=[dft["title"][i] for i in dft.index if dft["label"][i]==0]
falseDocuments=[dft["title"][i] for i in dft.index if dft["label"][i]==1]
trueDocuments="".join(x for x in trueDocuments)
falseDocuments="".join(x for x in falseDocuments)
tv_score = tv.fit_transform([trueDocuments, falseDocuments]).toarray().tolist()
cv_score = cv.fit_transform([trueDocuments, falseDocuments]).toarray().tolist()
sortedTrue=sorted(range(len(tv_score[0])), key=lambda k: tv_score[0][k],reverse=True)
sortedFalse=sorted(range(len(tv_score[1])), key=lambda k: tv_score[1][k],reverse=True)

In [5]:
print("First 5 most weighted words for true titles")
print("true      false       word")
print("-"*41) 
for i in sortedTrue[0:5]:
    print("  {:.3f}    {:.3f}        {:}".format(math.log(tv_score[0][i]+1),
                                               math.log(tv_score[1][i]+1),cv.get_feature_names()[i]))

First 5 most weighted words for true titles
true      false       word
-----------------------------------------
  0.529    0.152        new
  0.511    0.011        york
  0.180    0.420        trump
  0.038    0.090        donald
  0.032    0.000        timestrump


new , york and timestrump words are the most repetitive words in true news titles

In [6]:
print("First 5 most weighted words for fake titles")
print("false      true       word")
print("-"*41) 
for i in sortedFalse[0:5]:
    print("  {:.3f}    {:.3f}        {:}".format(math.log(tv_score[1][i]+1),
                                               math.log(tv_score[0][i]+1),cv.get_feature_names()[i]))

First 5 most weighted words for fake titles
false      true       word
-----------------------------------------
  0.420    0.180        trump
  0.317    0.015        hillary
  0.303    0.029        clinton
  0.152    0.529        new
  0.148    0.012        election


trump , hillary clinton and election words are the most repetitive words in fake news titles

## Part 2: Implementing Naive Bayes

In [21]:
dft=pd.read_csv("fake_news_train.csv")
dft = dft.dropna(subset=['text'])

In [94]:
def naiveBayes(train,test,trainFeatures,testFeatures,target):
    array=[]
    arrayProb=[]
    y=len(target)
    target.index = pd.RangeIndex(len(target.index))
    V=len(trainFeatures)#number of unique words
    CT=sum([sum(train[i]) for i in target.index if target[i]==0])#true labelli dokümanlardaki kelime sayısı
    CF=sum([sum(train[i]) for i in target.index if target[i]==1])#fake labelli dokümanlardaki kelime sayısı
    numFalse=sum([target[i] for i in target.index if target[i]==1])#fake labelli döküman sayısı
    numTrue=len(target)-numFalse#real labelli döküman sayısı
    for i in range(len(trainFeatures)):
        sumF=0
        sumR=0
        for j in target.index:
            if(target[j]==1):
                sumF+=train[j][i]#kelimenin fake labelli dokumanlarda kaç kez geçtiği
            else:
                sumR+=train[j][i]#kelimenin true labelli dokumanlarda kaç kez geçtiği
            
        array.append([math.log(((sumR+1)/(CT+V))),math.log(((sumF+1)/(CF+V)))])
    test=csr_matrix(test)
    
    for i in set(csr_matrix.nonzero(test)[0]):
        l=0
        probForTrue=1
        probForFalse=1
        probForTrue=np.float64(probForTrue)
        probForFalse=np.float64(probForFalse)
        for y in csr_matrix.nonzero(test)[1]:
            if(l==i):
                if(testFeatures[y] in trainFeatures):
                        index=trainFeatures.index(testFeatures[y])
                        probForTrue+=(array[index][0])**test[i,y]
                        probForFalse+=(array[index][1])**test[i,y]
                else:  
                    probForTrue+=math.log(((1)/(CT+V)))
                    probForFalse+=math.log(((1)/(CF+V)))
            l+=1        
        probForTrue+=math.log((numTrue/y))
        probForFalse+=math.log((numFalse/y))
        if(probForTrue>probForFalse):
            arrayProb.append(0)
        else:
            arrayProb.append(1)                                
       
       
    return arrayProb

In [43]:
#removing digits and stop words,transform to data a feature matrix
#This method for unigram
def testDataConverter(train,test):
    remove_digits = str.maketrans('', '', digits)
    train=[(str(s)).translate(remove_digits) for s in train]
    test=[(str(s)).translate(remove_digits) for s in test]
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(train).toarray()
    trainFeatures=vectorizer.get_feature_names()
    y = vectorizer.fit_transform(test).toarray()
    testFeatures=vectorizer.get_feature_names()
    return X,y,trainFeatures,testFeatures

In [112]:
#This method for biagram
def testDataConverterBia(train,test):
    remove_digits = str.maketrans('', '', digits)
    train=[(str(s)).translate(remove_digits) for s in train]
    test=[(str(s)).translate(remove_digits) for s in test]
    vectorizer = CountVectorizer(stop_words='english',ngram_range=(2, 2))
    X = vectorizer.fit_transform(train).toarray()
    trainFeatures=vectorizer.get_feature_names()
    y = vectorizer.fit_transform(test).toarray()
    testFeatures=vectorizer.get_feature_names()
    return X,y,trainFeatures,testFeatures

In [124]:
def accuracy(target,predicted):
    cm=confusion_matrix(target, predicted)
    accuracyScore=100*((cm[0][0]+cm[1][1])/len(target))
    return accuracyScore

In [106]:
#NOTE:due to time and memory issues data is sliced
X = dft["text"][0:500]
y=dft["label"][0:500]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [107]:
X,y,trainFeatures,testFeatures=testDataConverter(X_train,X_test)

In [108]:
#unigram
s=naiveBayes(X,y,trainFeatures,testFeatures,y_train)

In [110]:
#example unigram's feature matrix
trainFeatures[0:20]

['_____',
 '________',
 '___krongard',
 '_alvin',
 '_native_life',
 'aa',
 'aaa',
 'aabenraa',
 'aaron',
 'aaronkleinshow',
 'ab',
 'aback',
 'abacus',
 'abadi',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abanico',
 'abans']

In [113]:
#biagram
X = dft["text"][0:500]
y=dft["label"][0:500]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
X,y,trainFeatures,testFeatures=testDataConverterBia(X_train,X_test)
b=naiveBayes(X,y,trainFeatures,testFeatures,y_train)


In [123]:
#example biagram's feature matrix
trainFeatures[50:70]

['abandon settlers',
 'abandon ship',
 'abandon troubled',
 'abandoned according',
 'abandoned base',
 'abandoned bases',
 'abandoned collapse',
 'abandoned farmhouses',
 'abandoned headquarters',
 'abandoned house',
 'abandoned ideals',
 'abandoned kunduz',
 'abandoned model',
 'abandoned plan',
 'abandoned plans',
 'abandoned staff',
 'abandoned substantive',
 'abandoning manipulated',
 'abandonment orders',
 'abanico cada']

##### NOTE:unigram and biagram model approach's accuracy will be tested at part 4

## Part 3:


### (a) Analyzing eﬀect of the words on prediction 

In [127]:
#counting most weighted words in the texts
tv = TfidfVectorizer(stop_words='english',smooth_idf=True)
cv=CountVectorizer(stop_words='english')
trueDocuments=[dft["text"][i] for i in dft.index if dft["label"][i]==0]
falseDocuments=[dft["text"][i] for i in dft.index if dft["label"][i]==1]
trueDocuments="".join(x for x in trueDocuments)
falseDocuments="".join(x for x in falseDocuments)
tv_score = tv.fit_transform([trueDocuments, falseDocuments]).toarray().tolist()
cv_score = cv.fit_transform([trueDocuments, falseDocuments]).toarray().tolist()
sortedTrue=sorted(range(len(tv_score[0])), key=lambda k: tv_score[0][k],reverse=True)
sortedFalse=sorted(range(len(tv_score[1])), key=lambda k: tv_score[1][k],reverse=True)

In [128]:
print("First 20 most weighted words in the true news")
print("true      false       word")
print("-"*41) 
for i in sortedTrue[0:20]:
    print("  {:.3f}    {:.3f}        {:}".format(math.log(tv_score[0][i]+1),
                                               math.log(tv_score[1][i]+1),cv.get_feature_names()[i]))

First 20 most weighted words in the true news
true      false       word
-----------------------------------------
  0.406    0.146        said
  0.387    0.022        mr
  0.233    0.237        trump
  0.137    0.199        people
  0.134    0.127        new
  0.116    0.106        president
  0.111    0.127        like
  0.087    0.003        ms
  0.084    0.116        time
  0.081    0.131        just
  0.080    0.082        years
  0.080    0.113        state
  0.077    0.084        states
  0.072    0.068        year
  0.072    0.063        united
  0.067    0.070        news
  0.066    0.062        did
  0.064    0.088        american
  0.061    0.103        government
  0.059    0.049        house


In [129]:
print("First 20 most weighted words for fake news")
print("false      true       word")
print("-"*41) 
for i in sortedFalse[0:20]:
    print("  {:.3f}    {:.3f}        {:}".format(math.log(tv_score[1][i]+1),
                                               math.log(tv_score[0][i]+1),cv.get_feature_names()[i]))

First 20 most weighted words for fake news
false      true       word
-----------------------------------------
  0.237    0.233        trump
  0.222    0.056        clinton
  0.199    0.137        people
  0.157    0.015        hillary
  0.146    0.406        said
  0.131    0.081        just
  0.127    0.134        new
  0.127    0.111        like
  0.117    0.049        world
  0.116    0.084        time
  0.113    0.080        state
  0.107    0.018        2016
  0.106    0.116        president
  0.103    0.061        government
  0.103    0.032        election
  0.096    0.052        obama
  0.096    0.025        war
  0.088    0.064        american
  0.084    0.077        states
  0.083    0.026        russia


• List some words whose presence most strongly predicts that the news is real and
whose absence most strongly predicts that the news is fake. -> said,mr,ms,house,new,president

• List some words whose absence most strongly predicts that the news is real and
whose presence most strongly predicts that the news is fake. -> russia,war,obama,election,2016,world,hillary clinton,government,time

### (b) Stopwords 

In [118]:
#above algoritm eliminate stop words below algoritm do not
tv = TfidfVectorizer(smooth_idf=True)
cv=CountVectorizer()
trueDocuments=[dft["text"][i] for i in dft.index if dft["label"][i]==0]
falseDocuments=[dft["text"][i] for i in dft.index if dft["label"][i]==1]
trueDocuments="".join(x for x in trueDocuments)
falseDocuments="".join(x for x in falseDocuments)
tv_score = tv.fit_transform([trueDocuments, falseDocuments]).toarray().tolist()
cv_score = cv.fit_transform([trueDocuments, falseDocuments]).toarray().tolist()
sortedTrue=sorted(range(len(tv_score[0])), key=lambda k: tv_score[0][k],reverse=True)
sortedFalse=sorted(range(len(tv_score[1])), key=lambda k: tv_score[1][k],reverse=True)

In [119]:
print("First 10 most weighted words in the true news (include stopwords)")
print("true      false       word")
print("-"*41) 
for i in sortedTrue[0:10]:
    print("  {:.3f}    {:.3f}        {:}".format(math.log(tv_score[0][i]+1),
                                               math.log(tv_score[1][i]+1),cv.get_feature_names()[i]))

First 10 most weighted words in the true news (include stopwords)
true      false       word
-----------------------------------------
  0.525    0.521        the
  0.274    0.281        to
  0.266    0.280        of
  0.245    0.262        and
  0.218    0.197        in
  0.152    0.150        that
  0.105    0.100        for
  0.101    0.083        on
  0.093    0.046        he
  0.091    0.137        is


In [120]:
print("First 10 most weighted words for fake news (include stopwords)")
print("false      true       word")
print("-"*41) 
for i in sortedFalse[0:10]:
    print("  {:.3f}    {:.3f}        {:}".format(math.log(tv_score[1][i]+1),
                                               math.log(tv_score[0][i]+1),cv.get_feature_names()[i]))

First 10 most weighted words for fake news (include stopwords)
false      true       word
-----------------------------------------
  0.521    0.525        the
  0.281    0.274        to
  0.280    0.266        of
  0.262    0.245        and
  0.197    0.218        in
  0.150    0.152        that
  0.137    0.091        is
  0.100    0.105        for
  0.093    0.087        it
  0.083    0.101        on


### (c) Analyzing eﬀect of the stopwords


As it is seen, eliminating stopwords from text is critic.Stopwords are meaningless to predict data and they are waste of time and memory for program due to repetiveness.

### Part 4: Calculation of Accuracy 

In [125]:
print("Accuracy of unigram : ",accuracy(y_test,s))
print("Accuracy of biagram : ",accuracy(y_test,b))

Accuracy of unigram :  49.0
Accuracy of biagram :  51.0


As it is seen biagram model is slightly more successfull to predict data

## Conclusion

According to analyse and test result preprocessing data is critic (e.g:eliminating stop words),
biagram approuch can be more proper for this dataset and bigger datasets can predict data more accurate
(due to the time and memory issues, a small amount of data is used in this report therefore accuracies are low.)
