In [42]:
import tweepy
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikla\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nikla\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [43]:
# Add your Bearer token here.
client = tweepy.Client(bearer_token='')

# Create Pandas dataframe with three columns
tweet_df = pd.DataFrame(columns=['text', 'language'])

# Query tweets with #news. Show only 100 tweets
query = '#news'
tweets = client.search_recent_tweets(query=query, tweet_fields=['lang'], max_results=10)

for tweet in tweets.data:
    text = tweet.text
    language = tweet.lang

    # Focus only to tweets in english
    if language == 'en':
        tmp = [text, language]
        tweet_df.loc[len(tweet_df)] = tmp

# Total amount of tweets in dataframe
print('Amount: ', len(tweet_df))

# Show first 5 lines of dataframe
tweet_df.head()

Amount:  6


Unnamed: 0,text,language
0,Is Covid-19 more dangerous than driving? How s...,en
1,The iPhone 14 lineup does not include the Mini...,en
2,RT @vikeor: Check out this interview 😎\n\nLesz...,en
3,• #NTA has started the #NEET UG 2022 registrat...,en
4,"Ukraine says 6 people dead in ""powerful"" attac...",en


In [44]:
truenews = pd.read_csv("../files/True.csv")
fakenews = pd.read_csv("../files/Fake.csv")

truenews['label'] = 1
fakenews['label'] = 0

news_df = pd.concat([truenews, fakenews])
news_df = news_df.sample(frac = 1).reset_index(drop=True)
news_df.head()

Unnamed: 0,title,text,subject,date,label
0,"Tillerson to meet Myanmar general, stress need...",YANGON (Reuters) - U.S. Secretary of State Rex...,worldnews,"November 14, 2017",1
1,Commerce Secretary says Trump-Xi talks will ad...,BEIJING (Reuters) - Meetings between U.S. Pres...,worldnews,"November 8, 2017",1
2,Trump looks at retired general Flynn as possib...,WASHINGTON (Reuters) - Presumptive Republican ...,politicsNews,"July 9, 2016",1
3,Ex-congressman indicted on 24 counts in spendi...,CHICAGO (Reuters) - A former U.S. representati...,politicsNews,"November 10, 2016",1
4,TRUMP SAYS “YES” To Federal Funding For Planne...,"Sounds like a great deal, right? After all, Pl...",left-news,"Mar 7, 2017",0


In [45]:
print("Does Twitter Dataframe contain null values: ", tweet_df.isnull().values.any())
print("Does News Dataframe contain null values: ", news_df.isnull().values.any(), "\n")

print("Twitter dataframe datatypes: \n", tweet_df.dtypes, "\n")
print("News dataframe datatypes: \n", news_df.dtypes)

Does Twitter Dataframe contain null values:  False
Does News Dataframe contain null values:  False 

Twitter dataframe datatypes: 
 text        object
language    object
dtype: object 

News dataframe datatypes: 
 title      object
text       object
subject    object
date       object
label       int64
dtype: object


In [46]:
def deEmojify(text):
    regex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regex_pattern.sub(r'', text)

def clean_text(tweet):
    # Lower case text
    tweet = tweet.lower()

    # Remove mentions
    tweet = re.sub("@[A-Za-z0-9]+","", tweet)

    # Remove links
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)

    # Remove hastags
    tweet = re.sub("#[A-Za-z0-9_]+","", tweet)

    # Remove punctuations
    tweet = re.sub('[()!?]', ' ', tweet)
    tweet = re.sub('\[.*?\]',' ', tweet)

    # Remove emoticons
    tweet = deEmojify(tweet)

    # Filter non-alphanumeric characters (double check...)
    tweet = re.sub("[^a-z0-9]"," ", tweet)

    # Tokenize tweet (split...)
    tokens = word_tokenize(tweet)

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word, pos='v') for word in tokens]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]

    cleaned_tweet = ' '.join(tokens)

    return cleaned_tweet

test_text = "The @SpaceX Dragon Endeavour with four @Axiom_Space astronauts is holding 20 meters away from the station as the station crew works a video routing issue. https://nasa.gov/live"
print("Before: ", test_text)
print("After: ", clean_text(test_text))


Before:  The @SpaceX Dragon Endeavour with four @Axiom_Space astronauts is holding 20 meters away from the station as the station crew works a video routing issue. https://nasa.gov/live
After:  dragon endeavour four space astronauts hold 20 meter away station station crew work video rout issue


In [47]:
news_df['clean_text'] = np.nan

news_df['clean_text'] = [clean_text(x) for x in news_df['title']]

news_df.dropna(subset=['clean_text'], inplace=True)
news_df.drop_duplicates(subset=['text', 'clean_text'], keep=False, inplace=True)

news_df.head()

Unnamed: 0,title,text,subject,date,label,clean_text
0,"Tillerson to meet Myanmar general, stress need...",YANGON (Reuters) - U.S. Secretary of State Rex...,worldnews,"November 14, 2017",1,tillerson meet myanmar general stress need sto...
1,Commerce Secretary says Trump-Xi talks will ad...,BEIJING (Reuters) - Meetings between U.S. Pres...,worldnews,"November 8, 2017",1,commerce secretary say trump xi talk address t...
2,Trump looks at retired general Flynn as possib...,WASHINGTON (Reuters) - Presumptive Republican ...,politicsNews,"July 9, 2016",1,trump look retire general flynn possible run mate
3,Ex-congressman indicted on 24 counts in spendi...,CHICAGO (Reuters) - A former U.S. representati...,politicsNews,"November 10, 2016",1,ex congressman indict 24 count spend scandal
5,Obama will not block North Korea sanctions bil...,ABOARD AIR FORCE ONE (Reuters) - The White Hou...,politicsNews,"February 12, 2016",1,obama block north korea sanction bill white house


In [48]:
tweet_df['clean_text'] = np.nan

tweet_df['clean_text'] = [clean_text(x) for x in tweet_df['text']]

tweet_df.dropna(subset=['clean_text'], inplace=True)
tweet_df.drop_duplicates(subset=['text', 'clean_text'], keep=False, inplace=True)

tweet_df.head()

Unnamed: 0,text,language,clean_text
0,Is Covid-19 more dangerous than driving? How s...,en,covid 19 dangerous drive scientists parse covi...
1,The iPhone 14 lineup does not include the Mini...,en,iphone 14 lineup include mini larger dual came...
3,• #NTA has started the #NEET UG 2022 registrat...,en,start ug 2022 registrations 6 april candidates...
4,"Ukraine says 6 people dead in ""powerful"" attac...",en,ukraine say 6 people dead powerful attack west...


In [49]:
clean_text_combined = pd.concat([news_df['clean_text'], tweet_df['clean_text']])
clean_text_combined.head()
print("All clean texts combined size: ", clean_text_combined.size)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_text_combined)

def tokenize(text):
    tmp_text = tokenizer.texts_to_sequences(text)
    tmp_text = pad_sequences(tmp_text, padding='post', maxlen=232)
    return tmp_text

All clean texts combined size:  33799


In [50]:
tweet_df['sequence'] = np.nan
tweet_df['sequence'] = list(tokenize(tweet_df['clean_text']))

tweet_df.head()

Unnamed: 0,text,language,clean_text,sequence
0,Is Covid-19 more dangerous than driving? How s...,en,covid 19 dangerous drive scientists parse covi...,"[9514, 1616, 723, 627, 2266, 8293, 9514, 697, ..."
1,The iPhone 14 lineup does not include the Mini...,en,iphone 14 lineup include mini larger dual came...,"[5103, 1284, 8719, 1019, 6432, 7420, 3942, 188..."
3,• #NTA has started the #NEET UG 2022 registrat...,en,start ug 2022 registrations 6 april candidates...,"[274, 14727, 7448, 7639, 881, 1576, 772, 221, ..."
4,"Ukraine says 6 people dead in ""powerful"" attac...",en,ukraine say 6 people dead powerful attack west...,"[554, 3, 881, 87, 295, 889, 27, 1328, 367, 147..."


In [53]:
news_df['sequence'] = np.nan
news_df['sequence'] = list(tokenize(news_df['clean_text']))

news_df.head()

Unnamed: 0,title,text,subject,date,label,clean_text,sequence
0,"Tillerson to meet Myanmar general, stress need...",YANGON (Reuters) - U.S. Secretary of State Rex...,worldnews,"November 14, 2017",1,tillerson meet myanmar general stress need sto...,"[185, 35, 157, 217, 2075, 154, 180, 351, 13, 1..."
1,Commerce Secretary says Trump-Xi talks will ad...,BEIJING (Reuters) - Meetings between U.S. Pres...,worldnews,"November 8, 2017",1,commerce secretary say trump xi talk address t...,"[1375, 210, 3, 1, 411, 36, 734, 146, 7449, 0, ..."
2,Trump looks at retired general Flynn as possib...,WASHINGTON (Reuters) - Presumptive Republican ...,politicsNews,"July 9, 2016",1,trump look retire general flynn possible run mate,"[1, 290, 1603, 217, 418, 412, 152, 2428, 0, 0,..."
3,Ex-congressman indicted on 24 counts in spendi...,CHICAGO (Reuters) - A former U.S. representati...,politicsNews,"November 10, 2016",1,ex congressman indict 24 count spend scandal,"[135, 559, 2136, 1402, 1565, 352, 509, 0, 0, 0..."
5,Obama will not block North Korea sanctions bil...,ABOARD AIR FORCE ONE (Reuters) - The White Hou...,politicsNews,"February 12, 2016",1,obama block north korea sanction bill white house,"[6, 252, 20, 21, 111, 15, 8, 5, 0, 0, 0, 0, 0,..."
