In [1]:
import zipfile
import gzip
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt


%matplotlib inline
plt.rcParams['savefig.dpi'] = 144
sns.set()

In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS


nlp = spacy.load('en_core_web_md')
stopwords_ = STOP_WORDS

In [3]:
import nltk
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix


lemm = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')


In [4]:
import re
from string import punctuation

In [5]:
with zipfile.ZipFile('news.zip') as f:
    df = pd.read_csv(f.open('news.csv'))
#     pd.read_csv(f)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [7]:
df.title = list(nlp.pipe(df.title))

In [8]:
# df.text = list(nlp.pipe(df.text))

In [42]:
class FeatureExtraction(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        Xt = X.copy()
        
#         Xt.title = list(nlp.pipe(Xt.title))
        # Xt.text = list(nlp.pipe(Xt.text))
        
        Xt['title_PERSON'] = Xt['title'].apply(lambda x: ', '.join([ent.text for ent in x.ents if ent.label_ == 'PERSON']))
        Xt['title_GPE'] = Xt['title'].apply(lambda x: ' '.join([ent.text for ent in x.ents if ent.label_ == 'GPE']))
        
        return Xt
     

In [43]:
fx = FeatureExtraction().fit_transform(df)

In [44]:
fx.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,title_PERSON,title_GPE
0,8476,"(You, Can, Smell, Hillary, ’s, Fear)","Daniel Greenfield, a Shillman Journalism Fello...",FAKE,Hillary,
1,10294,"(Watch, The, Exact, Moment, Paul, Ryan, Commit...",Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,Paul Ryan,
2,3608,"(Kerry, to, go, to, Paris, in, gesture, of, sy...",U.S. Secretary of State John F. Kerry said Mon...,REAL,Kerry,Paris
3,10142,"(Bernie, supporters, on, Twitter, erupt, in, a...","— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,,
4,875,"(The, Battle, of, New, York, :, Why, This, Pri...",It's primary day in New York and front-runners...,REAL,,


In [41]:
fx.title_PER.nunique()

1257

In [None]:
df.info()

In [None]:
df.head()

In [7]:
real_pct = (len(df.loc[df.label == 'REAL'])/len(df))*100
fake_pct = (len(df.loc[df.label == 'FAKE'])/len(df))*100
print('Percentage of Real news in dataset = {}\nPercentage of Fake news in dataset = {}'.format(real_pct, fake_pct))

Percentage of Real news in dataset = 50.05524861878453
Percentage of Fake news in dataset = 49.94475138121547


In [10]:
class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        self.new_X = X.copy()
        
        if not isinstance(self.new_X, pd.DataFrame):
            self.new_X = pd.DataFrame(self.new_X)
        
#       Create a new feature mention, representing the names mentioned in the text denotede by any word which is preeeded by the @ symbol
        self.new_X['mention'] = self.new_X.apply(lambda x: re.findall(r'@[a-zA-Z0-9]+', x['text']), axis=1)
        self.new_X['title_mention'] = self.new_X.apply(lambda x: re.findall(r'@[a-zA-Z0-9]+', x['title']), axis=1)
    
#       Create a new feature to store the hash tags
        self.new_X['hashtag'] = self.new_X.apply(lambda x: re.findall(r'#\w+', x['text']), axis=1)
        self.new_X['hashtag'] = self.new_X.apply(lambda x: ' '.join(x['hashtag']), axis=1)
        
        self.new_X['title_hashtag'] = self.new_X.apply(lambda x: re.findall(r'#\w+', x['title']), axis=1)
        self.new_X['title_hashtag'] = self.new_X.apply(lambda x: ' '.join(x['title_hashtag']), axis=1)
        
#       Remove all hash tags from the main text data
        self.new_X['text'] = self.new_X.apply(lambda x: re.sub(r'#\w+','', x['text']), axis=1)
        self.new_X['title'] = self.new_X.apply(lambda x: re.sub(r'#\w+','', x['title']), axis=1)
    
#       Since mentions have already been collected in the mention colummc, mentions should be removed from the text data
        self.new_X['text'] = self.new_X.apply(lambda x: re.sub(r'@[a-zA-Z0-9]+','', x['text']), axis=1)
        self.new_X['title'] = self.new_X.apply(lambda x: re.sub(r'@[a-zA-Z0-9]+','', x['title']), axis=1)
    
#       remove all hyperlinks in the tweets
        self.new_X['text'] = self.new_X.apply(lambda x: re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','', x['text']), axis=1)
        self.new_X['title'] = self.new_X.apply(lambda x: re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','', x['title']), axis=1)
        
#         Remove punctuations from the title title and text columns
        self.new_X['title'] = self.new_X.apply(lambda x: "".join([word.lower() for word in x['title'] if word not in punctuation]), axis=1)
        self.new_X['text'] = self.new_X.apply(lambda x: "".join([word.lower() for word in x['text'] if word not in punctuation]), axis=1)
        
#       Tokenize the text data
        self.new_X['text'] = self.new_X.apply(lambda x: nltk.word_tokenize(x['text']), axis=1)
        self.new_X['title'] = self.new_X.apply(lambda x: nltk.word_tokenize(x['title']), axis=1)
    
#       Remove stop words from the text data
        self.new_X['text'] = self.new_X.apply(lambda x: [word for word in x['text'] if word not in stopwords], axis=1)
        self.new_X['title'] = self.new_X.apply(lambda x: [word for word in x['title'] if word not in stopwords], axis=1)
    
#       Lemmatize the text data
        self.new_X['text'] = self.new_X.apply(lambda x: [lemm.lemmatize(word) for word in x['text']], axis=1)
        self.new_X['text'] = self.new_X.apply(lambda x: ' '.join(x['text']), axis=1)
        
        self.new_X['title'] = self.new_X.apply(lambda x: [lemm.lemmatize(word) for word in x['title']], axis=1)
        self.new_X['title'] = self.new_X.apply(lambda x: ' '.join(x['title']), axis=1)
        
        
        return self.new_X

In [11]:
# class TTSplit(BaseEstimator, TransformerMixin):
#     def __init__(self, test_size, randon_state):
#         self.test_size = test_size
#         self.randon_state = randon_state
        
#     def fit(self, X, y):
#         self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=self.test_size, random_state=self.randon_state)
#         return self
    
#     def transform(self, X):
#         Xt = X.copy()
        
#         return self.X_train, self.X_test, self.y_train, self.y_test

In [33]:
tt = TextTransformer()
data = tt.fit_transform(df)

In [35]:
tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_text = tfidf.fit_transform(data.text)
X_title = tfidf.fit_transform(data.title)


In [36]:
y = df.label
# X = X.toarray()


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=42)

In [46]:
lr = LogisticRegression()

In [47]:
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [48]:
y_pred = lr.predict(X_test)

In [49]:
score = accuracy_score(y_test, y_pred)

In [50]:
print(score)

0.9060773480662984


In [43]:
print(score)

0.8089976322020521


In [44]:
confusion_matrix(y_test, y_pred, labels=y.unique())

array([[542,  86],
       [156, 483]], dtype=int64)

In [57]:
# lr.predict(tfidf.fit_transform(['I think i an gonna like it here']))

In [30]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

In [31]:
transform = TextTransformer()
tfidf = TfidfVectorizer(ngram_range=(1, 2))
model = LogisticRegression()

In [32]:
pipe = Pipeline(steps=[
    ('transformer', transform),
    ('vectorizer', tfidf),
    ('model', model)
])

In [33]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('transformer', TextTransformer()), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm=...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [34]:
pred = pipe.predict(X_test)

In [35]:
pipe.predict(['I think I am going to love it here!'])

array(['FAKE'], dtype=object)

In [37]:
accuracy_score(y_test, pred)

0.9171270718232044

In [64]:
abc = AdaBoostClassifier()
bag = BaggingClassifier()
gbc = GradientBoostingClassifier()
rfc = RandomForestClassifier()
lr = LogisticRegression()

In [65]:
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [68]:
lr_pred = lr.predict(X_test)

In [69]:
abc.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [70]:
abc_pred = abc.predict(X_test)

In [72]:
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [73]:
bag_pred = bag.predict(X_test)

In [74]:
gbc.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [75]:
gbc_pred = gbc.predict(X_test)

In [76]:
rfc.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [77]:
rfc_pred = rfc.predict(X_test)

In [79]:
(accuracy_score(y_test, lr_pred),
accuracy_score(y_test, abc_pred), 
 accuracy_score(y_test, bag_pred), 
 accuracy_score(y_test, rfc_pred))

(0.9171270718232044,
 0.8800315706393055,
 0.8721389108129439,
 0.8413575374901342)

In [82]:
print(' Logistic Regression : {} \n AdaBoostClassifier : {} \n BaggingClassifier : {} \n GradientBoostingClassifier : {} \n RandomForestClassifier : {}'.format(accuracy_score(y_test, lr_pred),accuracy_score(y_test, abc_pred), accuracy_score(y_test, bag_pred), accuracy_score(y_test, gbc_pred), accuracy_score(y_test, rfc_pred)))

 Logistic Regression : 0.9171270718232044 
 AdaBoostClassifier : 0.8800315706393055 
 BaggingClassifier : 0.8721389108129439 
 GradientBoostingClassifier : 0.9060773480662984 
 RandomForestClassifier : 0.8413575374901342
