In [22]:
import tweepy
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikla\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nikla\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Add your Bearer token here.
client = tweepy.Client(bearer_token='')

# Create Pandas dataframe with three columns
tweet_df = pd.DataFrame(columns=['text', 'language'])

# Query tweets with #news. Show only 100 tweets
query = '#news'
tweets = client.search_recent_tweets(query=query, tweet_fields=['lang'], max_results=10)

for tweet in tweets.data:
    text = tweet.text
    language = tweet.lang

    # Focus only to tweets in english
    if language == 'en':
        tmp = [text, language]
        tweet_df.loc[len(tweet_df)] = tmp

# Total amount of tweets in dataframe
print('Amount: ', len(tweet_df))

# Show first 5 lines of dataframe
tweet_df.head()

Amount:  6


Unnamed: 0,text,language
0,RT @Stephen03876276: Hard Work is The Key of S...,en
1,RT @DemonSlayerUSA: #NEWS Get ready to embark ...,en
2,RT @Stephen03876276: Hard Work is The Key of S...,en
3,RT @Stephen03876276: The #hero is here. #tshir...,en
4,RT @Stephen03876276: My #new #dog #tshirt\n\nM...,en


In [4]:
truenews = pd.read_csv("../files/True.csv")
fakenews = pd.read_csv("../files/Fake.csv")

truenews['label'] = 1
fakenews['label'] = 0

news_df = pd.concat([truenews, fakenews])
news_df = news_df.sample(frac = 1).reset_index(drop=True)
news_df.head()

Unnamed: 0,title,text,subject,date,label
0,Syria Strike Completely Backfires Against Tru...,"In case you just woke up, the United States is...",News,"April 7, 2017",0
1,Russia already preparing military withdrawal f...,MOSCOW (Reuters) - Russian Security Council se...,worldnews,"November 30, 2017",1
2,Breaking: President Trump Pardons Sheriff Joe ...,Reuters is reporting: U.S. President Donald T...,politics,"Aug 25, 2017",0
3,WOW! DONNA BRAZILE Tells Critics Of Her Book O...,Donna Brazile might want to watch her back! Sh...,politics,"Nov 5, 2017",0
4,Robert Reich Just Ruined A Trump Supporter’s ...,You ve got to love Robert Reich. The former Se...,News,"September 19, 2016",0


In [5]:
print("Does Twitter Dataframe contain null values: ", tweet_df.isnull().values.any())
print("Does News Dataframe contain null values: ", news_df.isnull().values.any(), "\n")

print("Twitter dataframe datatypes: \n", tweet_df.dtypes, "\n")
print("News dataframe datatypes: \n", news_df.dtypes)

Does Twitter Dataframe contain null values:  False
Does News Dataframe contain null values:  False 

Twitter dataframe datatypes: 
 text        object
language    object
dtype: object 

News dataframe datatypes: 
 title      object
text       object
subject    object
date       object
label       int64
dtype: object


In [6]:
def deEmojify(text):
    regex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regex_pattern.sub(r'', text)

def clean_text(tweet):
    # Lower case text
    tweet = tweet.lower()

    # Remove mentions
    tweet = re.sub("@[A-Za-z0-9]+","", tweet)

    # Remove links
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)

    # Remove hastags
    tweet = re.sub("#[A-Za-z0-9_]+","", tweet)

    # Remove punctuations
    tweet = re.sub('[()!?]', ' ', tweet)
    tweet = re.sub('\[.*?\]',' ', tweet)

    # Remove emoticons
    tweet = deEmojify(tweet)

    # Filter non-alphanumeric characters (double check...)
    tweet = re.sub("[^a-z0-9]"," ", tweet)

    # Tokenize tweet (split...)
    tokens = word_tokenize(tweet)

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word, pos='v') for word in tokens]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]

    cleaned_tweet = ' '.join(tokens)

    return cleaned_tweet

test_text = "The @SpaceX Dragon Endeavour with four @Axiom_Space astronauts is holding 20 meters away from the station as the station crew works a video routing issue. https://nasa.gov/live"
print("Before: ", test_text)
print("After: ", clean_text(test_text))


Before:  The @SpaceX Dragon Endeavour with four @Axiom_Space astronauts is holding 20 meters away from the station as the station crew works a video routing issue. https://nasa.gov/live
After:  dragon endeavour four space astronauts hold 20 meter away station station crew work video rout issue


In [16]:
news_df['clean_text'] = np.nan

news_df['clean_text'] = [clean_text(x) for x in news_df['title']]

news_df.dropna(subset=['clean_text'], inplace=True)
news_df.drop_duplicates(subset=['text', 'clean_text'], keep=False, inplace=True)

news_df.head()

Unnamed: 0,title,text,subject,date,label,clean_text
0,Syria Strike Completely Backfires Against Tru...,"In case you just woke up, the United States is...",News,"April 7, 2017",0,syria strike completely backfire trump even ha...
1,Russia already preparing military withdrawal f...,MOSCOW (Reuters) - Russian Security Council se...,worldnews,"November 30, 2017",1,russia already prepare military withdrawal syr...
2,Breaking: President Trump Pardons Sheriff Joe ...,Reuters is reporting: U.S. President Donald T...,politics,"Aug 25, 2017",0,break president trump pardon sheriff joe arpaio
3,WOW! DONNA BRAZILE Tells Critics Of Her Book O...,Donna Brazile might want to watch her back! Sh...,politics,"Nov 5, 2017",0,wow donna brazile tell critics book dnc go hel...
4,Robert Reich Just Ruined A Trump Supporter’s ...,You ve got to love Robert Reich. The former Se...,News,"September 19, 2016",0,robert reich ruin trump supporter week facts


In [15]:
tweet_df['clean_text'] = np.nan

tweet_df['clean_text'] = [clean_text(x) for x in tweet_df['text']]

tweet_df.dropna(subset=['clean_text'], inplace=True)
tweet_df.drop_duplicates(subset=['text', 'clean_text'], keep=False, inplace=True)

tweet_df.head()

Unnamed: 0,text,language,clean_text
1,RT @DemonSlayerUSA: #NEWS Get ready to embark ...,en,rt get ready embark new mission within swordsm...
3,RT @Stephen03876276: The #hero is here. #tshir...,en,rt detail
4,RT @Stephen03876276: My #new #dog #tshirt\n\nM...,en,rt detail
5,#Read the latest #blues #news on JAM #Radio #b...,en,latest jam update every day click


In [26]:
clean_text_combined = pd.concat([news_df['clean_text'], tweet_df['clean_text']])
clean_text_combined.head()
print("All clean texts combined size: ", clean_text_combined.size)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_text_combined)

def tokenize(text):
    tmp_text = tokenizer.texts_to_sequences(text)
    tmp_text = pad_sequences(tmp_text, padding='post', maxlen=232)
    return tmp_text

All clean texts combined size:  33799


In [30]:
tweet_df['sequence'] = np.nan
tweet_df['sequence'] = list(tokenize(tweet_df['clean_text']))

tweet_df.head()

Unnamed: 0,text,language,clean_text,sequence
1,RT @DemonSlayerUSA: #NEWS Get ready to embark ...,en,rt get ready embark new mission within swordsm...,"[5010, 14, 367, 14726, 7, 1758, 1340, 14727, 3..."
3,RT @Stephen03876276: The #hero is here. #tshir...,en,rt detail,"[5010, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
4,RT @Stephen03876276: My #new #dog #tshirt\n\nM...,en,rt detail,"[5010, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
5,#Read the latest #blues #news on JAM #Radio #b...,en,latest jam update every day click,"[364, 1591, 1494, 815, 213, 14729, 0, 0, 0, 0,..."


In [31]:
news_df['sequence'] = np.nan
news_df['sequence'] = list(tokenize(news_df['clean_text']))

news_df.head()

Unnamed: 0,title,text,subject,date,label,clean_text,sequence
0,Syria Strike Completely Backfires Against Tru...,"In case you just woke up, the United States is...",News,"April 7, 2017",0,syria strike completely backfire trump even ha...,"[68, 214, 1249, 1527, 1, 327, 661, 5034, 233, ..."
1,Russia already preparing military withdrawal f...,MOSCOW (Reuters) - Russian Security Council se...,worldnews,"November 30, 2017",1,russia already prepare military withdrawal syr...,"[11, 910, 625, 104, 1402, 68, 720, 0, 0, 0, 0,..."
2,Breaking: President Trump Pardons Sheriff Joe ...,Reuters is reporting: U.S. President Donald T...,politics,"Aug 25, 2017",0,break president trump pardon sheriff joe arpaio,"[51, 16, 1, 2217, 1092, 584, 2348, 0, 0, 0, 0,..."
3,WOW! DONNA BRAZILE Tells Critics Of Her Book O...,Donna Brazile might want to watch her back! Sh...,politics,"Nov 5, 2017",0,wow donna brazile tell critics book dnc go hel...,"[829, 5549, 5550, 46, 1444, 1025, 807, 34, 973..."
4,Robert Reich Just Ruined A Trump Supporter’s ...,You ve got to love Robert Reich. The former Se...,News,"September 19, 2016",0,robert reich ruin trump supporter week facts,"[1763, 3114, 1474, 1, 338, 299, 1184, 0, 0, 0,..."
