# Clustering Social Media Data (Preprocessing Data)

Importing libraries

In [9]:
import numpy as np
import pandas as pd
import pickle
from os import path as Path
import re

Importing NLTK for preprocessing and defining the stopword list (stopwords are not used with transformers)

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources: 'wordnet' and 'stopwords'.
nltk.download('wordnet')
nltk.download('stopwords')

# English stopwords followed by a few extra filler/greeting words and the brand name.
stop_words = [
	*stopwords.words('english'),
	'from', 'mebank', 'um', 'umm', 'ummm', 'hi', 'hello', 'hey', 'heyyyyy', 'fyi',
]

In [14]:
#installing tweet-preprocessor
#!pip install tweet-preprocessor
import preprocessor as p

# Importing the data

In [11]:
# Read the tweet CSV into a DataFrame; the path is relative to the notebook's working directory.
input_data=pd.read_csv("../data/mebank_tweets_1_year_clean.csv")
# Display the last row as a quick check of the loaded columns and the final index label.
input_data.tail(1)

Unnamed: 0.1,Unnamed: 0,date,content,complaint,topic,content_type,user,url
861,573,2021-07-25 09:22:13,Making beautiful banking and helping Australi...,0,,twitter/mention,sandybeech4,https://twitter.com/sandybeech4/status/1419226...


# Preprocessing the data

In [16]:
def preprocess_text(text, flg_clean=True, flg_tweet=True, flg_stemm=False, flg_lemm=False, lst_stopwords=None):
    """Clean one tweet/text string and return it as a single normalised string.

    The steps run in this order; each is switched by its flag:

    1. ``flg_tweet``: run tweet-preprocessor (``preprocessor.clean``) configured to
       strip URLs, mentions, emojis, smileys and numbers. Hashtags are kept.
    2. ``flg_clean``: regex clean-up that removes e-mail addresses, @mentions,
       links, the '#' sign (the tag word itself is kept), tokens containing
       the characters ü, ò, ä or ô, collapses whitespace and lower-cases.
       Punctuation is left untouched.
    3. Whitespace tokenisation, then stopword removal if ``lst_stopwords`` is given.
    4. ``flg_stemm``: Porter stemming of every token.
    5. ``flg_lemm``: WordNet lemmatisation of every token.

    Parameters
    ----------
    text : str
        Text to clean. ``None`` and float NaN are treated as missing and give ``""``;
        any other non-string value is converted with ``str()``.
    flg_clean, flg_tweet, flg_stemm, flg_lemm : bool
        Enable the corresponding step described above.
    lst_stopwords : iterable of str, optional
        Tokens to drop. Matching is exact, so it is case-sensitive when
        ``flg_clean`` is False (no lower-casing has happened).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Missing values (None / NaN read from a CSV) have nothing to clean.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    ## Tweet preprocessor
    if flg_tweet:
        import preprocessor as p
        # remove url, mention, emoji, smiley, and numbers (keeping hashtags)
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)
        text = p.clean(text)

    ## Regex clean-up
    if flg_clean:
        # Remove e-mail addresses. This must happen before any '@' is stripped,
        # otherwise the address can no longer be recognised.
        text = re.sub(r'\S+@\S+\s?', '', text)

        # Remove @mentions (handle included), then any stray '@' that is left.
        text = re.sub(r'@\w+', '', text)
        text = text.replace('@', '')

        # Remove links
        text = re.sub(r'http\S*', '', text)

        # Clean hashtags: drop only the '#', keep the tag word.
        text = text.replace('#', '')

        # Remove whole tokens containing unacceptable characters
        # (presumably mojibake left behind by mis-decoded emojis).
        text = re.sub(r'\S*[üòäô]\S*\s?', '', text)

        # Collapse new lines and repeated whitespace into single spaces.
        text = re.sub(r'\s+', ' ', text)

        # Convert to lower case
        text = text.lower()

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## Remove stopwords (set lookup avoids rescanning the list for every token)
    if lst_stopwords is not None:
        stopword_set = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stopword_set]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        import nltk  # local import so this branch does not depend on another cell having run
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        import nltk  # local import so this branch does not depend on another cell having run
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## Back to string from list
    return " ".join(lst_text)

Exploring preprocessing output using text examples

In [23]:
# Configure tweet-preprocessor to strip URLs, mentions, emojis, smileys and numbers;
# hashtags are not in the option list, so they stay in the text.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)
# Run the library on its own to see what it removes before any regex clean-up is applied.
p.clean('yes system down. @marchall #mebank #route http:yes.com mebank Not noot playing umm we would look into these \n yes, 2nd line 1200')

'yes system down. #mebank #route mebank Not noot playing umm we would look into these yes, nd line'

In [24]:
# Push a sample tweet through the pipeline: tweet-preprocessor plus regex clean-up,
# with stemming, lemmatisation and stopword removal switched off.
preprocess_text(
	'yes @marchall. #mebank #route http:yes.com mebank not playing umm we would look into these \n yes, 2nd line 1200',
	flg_tweet=True,
	flg_clean=True,
	flg_stemm=False,
	flg_lemm=False,
	lst_stopwords=None,
)

'yes . mebank route mebank not playing umm we would look into these yes, nd line'

In [28]:
# Clean every tweet in a single pass over the 'content' column.
# Series.apply works on whatever index the frame has, whereas a positional
# `.loc[i, ...]` loop only works when the index is exactly 0..n-1.
# The keyword arguments are forwarded unchanged to preprocess_text.
input_data['content_clean'] = input_data['content'].apply(
	preprocess_text,
	flg_clean=True,
	flg_tweet=True,
	flg_stemm=False,
	flg_lemm=False,
	lst_stopwords=None,
)


In [30]:
# Keep only the columns used from here on; every other column of the frame is dropped.
input_data=input_data[['date','content','content_clean','complaint','topic']]
# Display the last row to check that 'content_clean' sits next to the raw 'content'.
input_data.tail(1)

Unnamed: 0,date,content,content_clean,complaint,topic
861,2021-07-25 09:22:13,Making beautiful banking and helping Australi...,making beautiful banking and helping australia...,0,


In [32]:
# Save the preprocessed data to the local data directory.
# index=False keeps the DataFrame index out of the CSV file.
input_data.to_csv("../data/mebank_tweets_1_year_preprocessed.csv", index=False)

# Alternative (disabled): save the preprocessed data to an S3 bucket.
# NOTE(review): s3_data_dir is not defined in the cells shown here; define it before enabling.
#input_data.to_csv(f"{s3_data_dir}/preprocessed/mebank_tweets_1_year_preprocessed.csv", index=False)