In [None]:
# Mount Google Drive at /content/gdrive; the dataset cells below read their
# input files from /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import cmudict
from nltk.stem.porter import PorterStemmer

# Fetch the NLTK resources this notebook relies on (stop-word list, tokenizer
# and POS-tagger models, plus the tagset help data).
for nltk_resource in ('stopwords', 'tagsets', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Part-of-speech tags counted by feature_selection_text, one feature per tag.
# The order of this list fixes the column order of the feature vector, so it
# must not be reordered between training and prediction.
nltk_tags = [
    "''", '(', ')', ',', '--', '.', ':',
    'CC', 'CD', 'DT', 'EX', 'FW', 'IN',
    'JJ', 'JJR', 'JJS', 'LS', 'MD',
    'NN', 'NNP', 'NNPS', 'NNS',
    'PDT', 'POS', 'PRP', 'PRP$',
    'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    'WDT', 'WP', 'WP$', 'WRB',
]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
def clean_text(text):
    """Drop English stop words from ``text`` and stem the remaining words.

    Args:
        text: Value to clean. It is formatted to ``str`` first, so non-string
            values are accepted.

    Returns:
        The stemmed, non-stop-word tokens joined by single spaces, or ``''``
        when ``text`` is falsy (``None``, ``''``, ``0``).
    """
    if not text:
        # Return an empty string rather than None so every sample handed to
        # the vectorizer is a str.
        return ''

    # Load the stop-word list once per call; a set gives O(1) membership tests.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    # Test each *word* against the stop words. Comparing the whole token list
    # (as this function used to) never matches, so nothing was filtered.
    # Matching is case-sensitive, same as count_stopwords.
    kept = [ps.stem(word) for word in f'{text}'.split() if word not in stop_words]
    return ' '.join(kept)

In [None]:
def count_uppercase(text):
    """Return the number of uppercase characters in ``text``."""
    return sum(1 for symbol in text if symbol.isupper())

In [None]:
def count_stopwords(text):
    """Count the space-separated words of ``text`` that are English stop words.

    Args:
        text: String to inspect; it is split on single spaces.

    Returns:
        Number of words found in NLTK's English stop-word list. Matching is
        case-sensitive.
    """
    # Load the list once instead of once per word, and use a set for O(1) lookups.
    stop_words = set(stopwords.words('english'))
    return sum(1 for word in text.split(' ') if word in stop_words)

In [None]:
def tag_text(text):
    """Tokenize ``text`` with NLTK and return the result of ``nltk.pos_tag`` on the tokens."""
    tokens = nltk.word_tokenize(text)
    tagged_tokens = nltk.pos_tag(tokens)
    return tagged_tokens

In [None]:
def feature_selection_text(text):
    """Build the hand-crafted linguistic feature vector for one text.

    Args:
        text: The string to describe.

    Returns:
        A flat list, in this order:
          * 4 syntax features: character count, space-separated word count,
            stop-word count, uppercase-character count;
          * one count per entry of ``nltk_tags`` (in ``nltk_tags`` order);
          * 6 textstat readability scores: Flesch reading ease, SMOG,
            Flesch-Kincaid grade, Coleman-Liau, automated readability index,
            Linsear Write.
    """
    # Imported here rather than at module level: the cell that installs
    # textstat appears after this definition in the notebook.
    import textstat

    # Syntax-based features
    features = [
        len(text),               # count chars
        len(text.split(' ')),    # count words
        count_stopwords(text),   # count stopwords
        count_uppercase(text),   # count uppercase
    ]

    # Grammatical evidence: occurrences of each tracked POS tag.
    tag_counts = dict.fromkeys(nltk_tags, 0)
    for _, tag in tag_text(text):
        # Tags outside nltk_tags are ignored on purpose. An explicit membership
        # test is used instead of a bare `except` so real errors still surface.
        if tag in tag_counts:
            tag_counts[tag] += 1
    features.extend(tag_counts[tag] for tag in nltk_tags)

    # Readability features
    features.extend([
        textstat.flesch_reading_ease(text),
        textstat.smog_index(text),
        textstat.flesch_kincaid_grade(text),
        textstat.coleman_liau_index(text),
        textstat.automated_readability_index(text),
        textstat.linsear_write_formula(text),
    ])

    return features

In [None]:
def feature_selection_texts(arr):
    """Apply ``feature_selection_text`` to each item of ``arr`` and stack the results in a NumPy array."""
    rows = []
    for item in arr:
        rows.append(feature_selection_text(item))
    return np.array(rows)

In [None]:
#Import dataset 5 (185 samples)
X = []
y = []

# One file per class: fake news is labelled 0, real news 1.
for news_path, news_label in (
        ("/content/gdrive/MyDrive/special_fake_news.txt", 0),
        ("/content/gdrive/MyDrive/special_real_news.txt", 1)):
    with open(news_path,'r',encoding = 'latin-1') as file:
        texts = file.read().split('\n')
    # Each non-empty line is one sample; blank lines are skipped.
    samples = [text for text in texts if text]
    X.extend(samples)
    y.extend([news_label] * len(samples))

In [None]:
#Importing dataset 1 (First 5000 samples)
data = pd.read_csv("/content/gdrive/MyDrive/train.csv")

# Cleaned titles are the samples; labels are taken from the file unchanged.
row_ids = range(5000)
X = [clean_text(data['title'][i]) for i in row_ids]
y = [data['label'][i] for i in row_ids]

In [None]:
#Import dataset 2 (First 5000 samples)
data = pd.read_csv("/content/gdrive/MyDrive/fake_or_real_news.csv")

row_ids = range(5000)
X = [clean_text(data['title'][i]) for i in row_ids]
# 'FAKE' maps to 0; every other label value maps to 1.
y = [0 if data['label'][i] == 'FAKE' else 1 for i in row_ids]

In [None]:
#Import dataset 3
fake_data = pd.read_csv("/content/gdrive/MyDrive/Fake.csv",engine = 'python')
true_data = pd.read_csv("/content/gdrive/MyDrive/True.csv",engine = 'python')

#Take 4064 samples from fake data (label 0) , then 2500 samples from true data (label 1)
fake_titles = [clean_text(fake_data['title'][i]) for i in range(4064)]
true_titles = [clean_text(true_data['title'][i]) for i in range(2500)]

X = fake_titles + true_titles
y = [0] * len(fake_titles) + [1] * len(true_titles)

In [None]:
#Import dataset 4 (7950 samples)
import sys
import csv

# Raise the csv module's per-field size limit before the files are parsed.
csv.field_size_limit(sys.maxsize)
fake_data = pd.read_csv("/content/gdrive/MyDrive/Fake.csv",engine = 'python')
fake_data_2 = pd.read_csv("/content/gdrive/MyDrive/fake2.csv",engine = 'python')
true_data = pd.read_csv("/content/gdrive/MyDrive/True.csv",engine = 'python')

# 1620 + 2880 fake titles (label 0) followed by 3450 true titles (label 1).
fake_titles = [clean_text(fake_data['title'][i]) for i in range(1620)]
fake_titles_2 = [clean_text(fake_data_2['title'][i]) for i in range(2880)]
true_titles = [clean_text(true_data['title'][i]) for i in range(3450)]

X = fake_titles + fake_titles_2 + true_titles
y = [0] * (len(fake_titles) + len(fake_titles_2)) + [1] * len(true_titles)

In [None]:
#Import dataset 6 
data = pd.read_csv("/content/gdrive/MyDrive/data.csv")
# Labels stay a pandas Series; samples are the cleaned headlines of every row.
y = data['Label']
X = [clean_text(data['Headline'][i]) for i in range(len(data))]

In [None]:
#Import dataset 7 (a mix of samples from datasets 6, 5, 1 and 3)
X = []
y = []

#Get the first 2000 samples from dataset 6
# NOTE(review): labels are copied from the file as-is; confirm they follow the
# 0 = fake / 1 = real convention used for the other sources in this cell.
data = pd.read_csv("/content/gdrive/MyDrive/data.csv")
for i in range(2000):
    X.append(clean_text(data['Headline'][i]))
    y.append(data['Label'][i])

#Get all from dataset 5 (every non-empty line; fake file -> 0, real file -> 1)
with open("/content/gdrive/MyDrive/special_fake_news.txt",'r',encoding = 'latin-1') as file:
    texts = file.read().split('\n')
    for text in texts:
        if text:
            X.append(clean_text(text))
            y.append(0)

with open("/content/gdrive/MyDrive/special_real_news.txt",'r',encoding = 'latin-1') as file:
    texts = file.read().split('\n')
    for text in texts:
        if text:
            X.append(clean_text(text))
            y.append(1)

#Get the first 5000 samples from dataset 1
# NOTE(review): this comment used to say 2000, but the loop takes 5000 rows;
# confirm which size was intended.
# NOTE(review): labels are copied from the file as-is; confirm they follow the
# 0 = fake / 1 = real convention used for the other sources in this cell.
data = pd.read_csv("/content/gdrive/MyDrive/train.csv")
for i in range(5000):
  X.append(clean_text(data['title'][i]))
  y.append(data['label'][i])

#Get 3000 samples from dataset 3
fake_data = pd.read_csv("/content/gdrive/MyDrive/Fake.csv",engine = 'python')
true_data = pd.read_csv("/content/gdrive/MyDrive/True.csv",engine = 'python')

#Take 1100 samples from fake data , 1900 samples from true data
# NOTE(review): the previous comment had these two counts swapped relative to
# the loops below; confirm the intended split.
for i in range(1100):
    X.append(clean_text(fake_data['title'][i]))
    y.append(0)
for i in range(1900):
    X.append(clean_text(true_data['title'][i]))
    y.append(1)

In [None]:
#Split test set
from sklearn.model_selection import train_test_split 
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.30, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
pip install textstat

Collecting textstat
[?25l  Downloading https://files.pythonhosted.org/packages/ce/42/3e3691ff23d2f3e30ef18bd382d9450e992e2da7e01ca33d392b473eba05/textstat-0.7.1-py3-none-any.whl (99kB)
[K     |████████████████████████████████| 102kB 4.3MB/s 
[?25hCollecting pyphen
[?25l  Downloading https://files.pythonhosted.org/packages/7c/5a/5bc036e01389bc6a6667a932bac3e388de6e7fa5777a6ff50e652f60ec79/Pyphen-0.10.0-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 30.0MB/s 
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.10.0 textstat-0.7.1


In [None]:
#Use multinomial NB to fit model 
from sklearn.base import TransformerMixin
class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that turns a sparse feature matrix into a dense array."""

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Args:
            X: Feature matrix, sparse or already dense.
            y: Ignored; accepted for pipeline compatibility.
            **fit_params: Ignored.

        Returns:
            ``X.toarray()`` when ``X`` offers it, otherwise ``np.asarray(X)``.
        """
        # toarray() yields a plain ndarray, whereas todense() yields np.matrix,
        # which current scikit-learn estimators refuse as input.
        if hasattr(X, 'toarray'):
            return X.toarray()
        # Already-dense input is passed through as an ndarray.
        return np.asarray(X)

from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import MinMaxScaler,MaxAbsScaler
# Bag-of-words counts side by side with the hand-crafted linguistic features.
feature_union = FeatureUnion([
    ('count_vectorizer', Pipeline([
        ('vectorizer', CountVectorizer()),
    ])),
    ('more', Pipeline([
        ('linguistic_features', FunctionTransformer(feature_selection_texts, validate=False)),
    ])),
])

# Hard-voting ensemble; the commented-out members are left disabled.
voting_ensemble = VotingClassifier(
    estimators=[
        ('naive_bayes', MultinomialNB()),
        ('logistic_regression', LogisticRegression(n_jobs=-1, random_state=0)),
        #('knn',KNeighborsClassifier(n_neighbors=10,n_jobs = -1))
        #('svm',SVC(random_state = 0))
    ],
    voting='hard',
    n_jobs=-1,
)

# features -> dense array -> min-max scaling -> voting ensemble
classifier = Pipeline([
    ('features', feature_union),
    ('to_dense', DenseTransformer()),
    ('scale', MinMaxScaler()),
    ('nb', voting_ensemble),
])

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
#Making confusion matrix
from sklearn.metrics import confusion_matrix
# scikit-learn puts true labels on the rows and predicted labels on the columns,
# which is the layout calculate_accuracy reads (cm[0][1] as FP, cm[1][0] as FN).
cm = confusion_matrix(y_test , y_pred)

In [None]:
def calculate_accuracy(cm):
    """Derive summary metrics from a 2x2 confusion matrix.

    Args:
        cm: 2x2 indexable laid out as ``[[TN, FP], [FN, TP]]``.

    Returns:
        ``[accuracy, precision, recall, f1, average]`` where ``average`` is the
        plain mean of the other four. Any ratio whose denominator is zero
        (e.g. precision when the positive class is never predicted) is
        reported as ``0.0`` instead of raising or producing NaN.
    """
    def _ratio(numerator, denominator):
        # Guarded division: an undefined ratio counts as 0.0.
        return numerator / denominator if denominator else 0.0

    tn, fp = cm[0][0], cm[0][1]
    fn, tp = cm[1][0], cm[1][1]

    accuracy = _ratio(tn + tp, tn + tp + fp + fn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2 * (precision * recall), precision + recall)
    average = (accuracy + precision + recall + f1) / 4
    return [accuracy, precision, recall, f1, average]

In [None]:
# acc holds [accuracy, precision, recall, f1, average] computed from cm.
acc = calculate_accuracy(cm)

In [None]:
acc

[0.814680710994075,
 0.8635714285714285,
 0.7647058823529411,
 0.8111372022811137,
 0.8135238060498896]

In [None]:
len(X)

10125