In [1]:
!pip install emot
!pip install demoji
!pip install emoji

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import demoji
demoji.download_codes()

import emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emot
  Downloading emot-3.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 6.4 kB/s 
[?25hInstalling collected packages: emot
Successfully installed emot-3.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 980 kB/s 
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.0.0.tar.gz (197 kB)
[K     |████████████████████████████████| 197 kB 4.3 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.0.0-py3-none-any.whl si

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
  # This is added back by InteractiveShellApp.init_path()


In [2]:
import json
import pandas as pd
import numpy as np
import regex as re
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [3]:
# Connnect to google drive 
from google.colab import drive
drive.mount('/content/gdrive')

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

Mounted at /content/gdrive


#Load and organise twitter data

### Functions for loading and organising twitter data

In [4]:
def matching_items_in_directory(dir, inclusion_term):
    q =  dir + " in parents and trashed=false"
    file_list = drive.ListFile({'q':q}).GetList() 
    name_list = [file['title'] for file in file_list if inclusion_term in file['title']]

    return name_list

In [5]:
def clean_combine_subfolder_tweets(subfolder_list):
	df_list = []

	for s in subfolder_list:
		path = ('/content/gdrive/MyDrive/NLP-exam/twitterdata/' + s + '/tweets.json')
	
		with open(path, 'r') as f:
			data = json.load(f)
	 
		data_df = pd.DataFrame(data.items(), columns=['Url', 'Tweet'])
		data_df['Date'] = (s.split("demic",1)[1].split("-", 1)[0] + ".2021") #extract date to allow time analysis
		df_list.append(data_df)

	df_combined = pd.concat(df_list, ignore_index=True)
 
	df_combined[['A', 'B', 'C', 'User', 'Type', 'PostID']] = df_combined['Url'].str.split("/", expand=True) #retreive information from URL
	df_combined.drop(['A', 'B','C', 'Url'], axis=1, inplace=True)
	#df_combined['Date'] = (s.split("demic",1)[1].split("-", 1)[0] + ".2021") #extract date to allow time analysis
	df_combined.Date = pd.to_datetime(df_combined.Date, format='%d.%m.%Y')
	df_combined.PostID = df_combined.PostID.astype(int) #PostID as int for comparison and sorting
	df_combined['InclKey'] = re.sub(r'[^a-z]', '', s) #extracting context keyword
			
	return df_combined

### Running functions to load and organise twitter data

In [6]:
#create plandemic and scamdemic list of subdirectories
b = "\'1WwYhT1PPii8x-O4GLnMm54E9DnMrcTED\'"
plandemic_subfolders = matching_items_in_directory(b, 'plandemic')
scamdemic_subfolders = matching_items_in_directory(b, 'scamdemic')

#clean and create dataset for each context
df_plandemic = clean_combine_subfolder_tweets(plandemic_subfolders)
df_scamdemic = clean_combine_subfolder_tweets(scamdemic_subfolders)

#separate dataframes
print(df_plandemic.shape)
print(df_scamdemic.shape)

#combined dataframes
df = pd.concat([df_plandemic, df_scamdemic], ignore_index=True)

(105482, 6)
(107587, 6)


# Preprocessing twitter data

### FUnctions for handling preprocessing steps

In [None]:
def emojis_to_text(df):
  '''Translates emojis to text descriptions according to emoji library's dictionary. '''
  #convert emoticons/emojis to text description
  for index, select_row in df.iterrows():
    df.at[index, 'Tweet'] = emoji.demojize(select_row.Tweet, delimiters=("", ""))

  return df


def extract_tweet_features(df):
  '''Extract hashtags, user tags, links and reetweet indication from tweet texts. '''
  #extract features from text
  df['Hashtags'] = df.Tweet.apply(lambda x: re.findall(r"(#\w+)", x))
  df['User_tags'] = df.Tweet.apply(lambda x: re.findall(r"(@\w+)", x))
  df['Linked_to'] = df.Tweet.apply(lambda x: re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', x))
  df['RT'] = df.Tweet.str.startswith('RT', na=False)

  #insert nan in tweet features columns
  df.Hashtags = df.Hashtags.apply(lambda x: np.nan if len(x)==0 else x)
  df.User_tags = df.User_tags.apply(lambda y: np.nan if len(y)==0 else y)
  df.Linked_to = df.Linked_to.apply(lambda z: np.nan if len(z)==0 else z)

  return df


def clean_tweets(df):
  ''' Removing unncessary features in the tweet texts.'''

  #removing url's, user tags, special characters except punctuations, 
  regex_list = [r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', r'(@\w+)', r'[^_\-\w\s\ ]', r'\b[0-9\.]+\b', "[^a-zA-Z0-9]+", r'\bRT\b', r'\brt\b', r'\s\s+'] 
  replace_list = [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']

  for i in range(len(regex_list)):
    df.Tweet = df.Tweet.str.replace(regex_list[i], replace_list[i]) 

  #capitalisation
  df.Tweet = df.Tweet.str.lower()

  word_list, replace_word = ['scamdemic', 'plandemic'], [' ', ' ']

  for i in range(len(word_list)):
    df.Tweet = df.Tweet.str.replace(word_list[i], replace_word[i]) 

  df.Tweet = df.Tweet.str.encode('ascii', 'ignore').str.decode('ascii')

  return df


def handle_duplicates_nans(df):
  '''remove tweets with same text and username and very short tweets '''

  #removing duplicates from the same user
  df = df.drop_duplicates(subset=['Tweet', 'User'], keep='last')
  
  #removing retweets without added response 
  #df = df.drop_duplicates(subset=['Tweet', 'RT'], keep='last')
  df = df.loc[~(df.Tweet.duplicated(keep=False) & df.RT.ne(True))]

  #remove duplicate tweets
  df = df.drop_duplicates(subset ="Tweet", keep = False)

  df = df[df['Tweet'].str.len()>=10]

  return df


def concentrate_tweets(df):
  ''' remove frequent words without contextual meaning and word variations'''

  #lemmatization
  lemmatizer = WordNetLemmatizer()
  df.Tweet = df.Tweet.apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))

  #remove stop-words
  stop_words = stopwords.words('english')
  df.Tweet = df.Tweet.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

  #remove words shorter than 3 characters (cutoff here due to )  
  df.Tweet = df.Tweet.apply(lambda x: ' '.join([word for word in x.split() if len(word) > 3]))

  return df


def tokenize_tweets(df):
  '''add column to dataset with tweet tokens'''

  tokenizer = TweetTokenizer()
  tokenized = df.Tweet.apply(tokenizer.tokenize)

  df['Tokenized_tweets'] = tokenized
  
  return df


def preprocess_tweets(df):
  ''' Calling preprocessing steps to handle tweets before it is fed into model. '''
  print('Original shape:{0}'.format(df.shape))
  
  display(df)
  df = emojis_to_text(df)
  print(df.iloc[213067]['Tweet'])
  df = extract_tweet_features(df)
  df = clean_tweets(df)
  print(df.iloc[213067]['Tweet'])
  print('After cleaning tweets:{0}'.format(df.shape))

  df = handle_duplicates_nans(df)
  print('After handling dulicates and nans:{0}'.format(df.shape))
 
  df = concentrate_tweets(df)
  df = tokenize_tweets(df)

  df.reset_index(inplace=True)
  df.drop(['index'], axis=1, inplace=True)

  return df

### Running functions for preprocessing

In [None]:
#Run preprocessing of tweets
df = preprocess_tweets(df)

Original shape:(213069, 6)


Unnamed: 0,Tweet,Date,User,Type,PostID,InclKey
0,RT @libertytarian: 1st day of pastor's trial f...,2021-04-30,SyrianArchangle,status,1390250830956318721,plandemic
1,RT @sjaanie123: Kijk uit voor de gevaccineerde...,2021-04-30,SaskiaDev,status,1390250693534097408,plandemic
2,@Bameron2 You’re boyfriend must be proud of yo...,2021-04-30,Phoenix1783,status,1390250538026086400,plandemic
3,Kijk uit voor de gevaccineerden mensen! #COVID...,2021-04-30,sjaanie123,status,1390249938118905856,plandemic
4,"RT @BartosVl: Covid inženýři z celého světa, j...",2021-04-30,Lubomr12663547,status,1390249162176974848,plandemic
...,...,...,...,...,...,...
213064,"RT @SJJB55: Sept 2020, sat in our village (pop...",2021-04-21,mherbert617,status,1384048634195431432,scamdemic
213065,@BhupinderJaryal This is ..scamdemic,2021-04-21,SharmaSandeep41,status,1384047994165628932,scamdemic
213066,"RT @SJJB55: Sept 2020, sat in our village (pop...",2021-04-21,louisrossilevy,status,1384047719354753025,scamdemic
213067,"Guess what , nobody's gonna get the money 🤣😂\n...",2021-04-21,DanielScoland,status,1384047365724663810,scamdemic


Guess what , nobody's gonna get the money rolling_on_the_floor_laughingface_with_tears_of_joy
 #NoVaccinePassport #nwo #scamdemic #plandemic #NoVaccinePassports #hoax #psyop #maskOFF #freedom #NWO #Agenda21 #mindcontrol #MSM #tyrany #medicalgenocide #vaccine #truth https://t.co/9Wg5cPGalz




guess what nobody s gonna get the money rolling on the floor laughingface with tears of joy novaccinepassport nwo     novaccinepassports hoax psyop maskoff freedom nwo agenda21 mindcontrol msm tyrany medicalgenocide vaccine truth 
After cleaning tweets:(213069, 10)
After handling dulicates and nans:(78542, 10)


#Informal tests

In [None]:
display(df.iloc[5500:5551])
display(df.iloc[5500:5551].Tokenized_tweets)
display(df[:3])

#remove duplicates based on user tags
#do dates correclty

Unnamed: 0,Tweet,Date,User,Type,PostID,InclKey,Hashtags,User_tags,Linked_to,RT,Tokenized_tweets
5500,fact checking always biased misleading created...,2021-04-23,KidKool4U,status,1386384766665216001,plandemic,,[@FelixtheAXE],,True,"[fact, checking, always, biased, misleading, c..."
5501,nothing amorphous thing orchestrated happen re...,2021-04-23,georgemas0nn,status,1386383648254054400,plandemic,"[#amorphous, #orchestrated, #agenda, #organic]",[@WenMoonTho],,True,"[nothing, amorphous, thing, orchestrated, happ..."
5502,gotta research vaccine mask entire,2021-04-23,PowerSoula,status,1386382883254853636,plandemic,[#Plandemic],"[@Kareim30184380, @CryptoMessiah]",,False,"[gotta, research, vaccine, mask, entire]"
5503,bill gate practice genocide africa absorb vita...,2021-04-23,dsyunior,status,1386382415380180992,plandemic,"[#Africa, #Plandemic, #EndGame]",,[https://t.co/oV2To6KrEn],False,"[bill, gate, practice, genocide, africa, absor..."
5504,garbage click bait piece torched else tired fa...,2021-04-23,NVPatriotGirl,status,1386381145135144962,plandemic,"[#ufc, #UFC261, #fakenewsmedia, #propaganda, #...","[@reviewjournal, @danawhite]",[https://t.co/ILLtYkbBXC],False,"[garbage, click, bait, piece, torched, else, t..."
5505,wonder many oscar take home tonight,2021-04-23,ethan_douglas,status,1386378505236135937,plandemic,,,[https://t.co/SEopHdINWX],False,"[wonder, many, oscar, take, home, tonight]"
5506,covid used take away people right freedom powe...,2021-04-23,NoConclusion79,status,1386378282707390464,plandemic,,,[https://t.co/2ksHRgprP9],False,"[covid, used, take, away, people, right, freed..."
5507,look proof worldwide scam year test sold world,2021-04-23,JAZZYW257,status,1386378169549209601,plandemic,,,[https://t.co/xmS000Mwjj],False,"[look, proof, worldwide, scam, year, test, sol..."
5508,family member daycare working month child snif...,2021-04-23,ganais_anais,status,1386377427019051014,plandemic,"[#SlipperySlope, #COVID19, #Plandemic]","[@markjam88463456, @existentialrest, @shocking...",,False,"[family, member, daycare, working, month, chil..."
5509,hoax boris struggle sell longer,2021-04-23,Fifi_Borgia,status,1386377425358147584,plandemic,,[@darylpeagram],,True,"[hoax, boris, struggle, sell, longer]"


5500    [fact, checking, always, biased, misleading, c...
5501    [nothing, amorphous, thing, orchestrated, happ...
5502             [gotta, research, vaccine, mask, entire]
5503    [bill, gate, practice, genocide, africa, absor...
5504    [garbage, click, bait, piece, torched, else, t...
5505           [wonder, many, oscar, take, home, tonight]
5506    [covid, used, take, away, people, right, freed...
5507    [look, proof, worldwide, scam, year, test, sol...
5508    [family, member, daycare, working, month, chil...
5509                [hoax, boris, struggle, sell, longer]
5510    [really, throwing, state, hospital, medical, p...
5511     [medicine, work, malaria, work, covid, together]
5512                [hoax, boris, struggle, sell, longer]
5513    [sage, comprises, best, scientific, mind, expe...
5514    [pandemic, control, used, virus, expressionles...
5515    [always, evil, russian, hacker, complete, warm...
5516    [nothing, company, started, animal, testing, k...
5517    [londo

Unnamed: 0,Tweet,Date,User,Type,PostID,InclKey,Hashtags,User_tags,Linked_to,RT,Tokenized_tweets
0,shop class high school shop teacher walked for...,2021-04-30,GingrichVera,status,1390227188851556353,plandemic,,[@tr6_guy],,False,"[shop, class, high, school, shop, teacher, wal..."
1,smell meltdown biblical proportion wind leftwa...,2021-04-30,Emm_Down_Under,status,1390186145410031616,plandemic,,[@CailinGaeilge8],,True,"[smell, meltdown, biblical, proportion, wind, ..."
2,guess bandwagon tweeps pushing something blame...,2021-04-30,NICHONASRl,status,1390185828236861440,plandemic,[#LifeBeforeCovid],[@njenga_richard],,True,"[guess, bandwagon, tweeps, pushing, something,..."


In [None]:
print(df["Date"].value_counts())

2021-06-04    5992
2021-05-14    4656
2021-04-17    4427
2021-05-20    4262
2021-05-07    4112
2021-04-10    4100
2021-04-23    3876
2021-04-21    3874
2021-03-31    3828
2021-05-27    3633
2021-03-08    3362
2021-04-30    3261
2021-04-07    3243
2021-05-26    3102
2021-06-02    3087
2021-03-17    3073
2021-05-12    3024
2021-04-28    2996
2021-05-05    2919
2021-04-14    2813
2021-05-19    2684
2021-03-24    2218
Name: Date, dtype: int64


In [None]:
#check tht tweets don't contain words
searchfor = ['plandemic', 'scamdemic']
df.Tweet[df.Tweet.str.contains('|'.join(searchfor))]

Series([], Name: Tweet, dtype: object)

In [None]:
#print(df.loc[df.User == 'INSANLEAGUE'].Tweet)

#print(df.loc[df.User == 'jamesmcb44'].Tweet)
#print(df.loc[df.User == '____BB____1'].Tweet)

#print(df.Tweet[50888])
#print(df.Tweet[45064])

In [None]:
  #if blacklist:
  #  tokenized_blacklist = []
  #  for i, tokenized_item in enumerate(tokenized):
  #    tokenized_blacklist.append([t for t in tokenized_item if t not in blacklist])
  #  df['Tokenized_tweets'] = tokenized_blacklist
  #else:
  #  df['Tokenized_tweets'] = tokenized