## This notebook cleans the data for customer feedback clustering and topic modeling
### Authors: Mahdi Rasouli, Amir Abdollahi, Christian Bonato

### Importing the necessary libraries

In [37]:
import numpy as np
import pandas as pd
from os import path as Path
import datetime
import re
import nltk
# Make sure the NLTK resources are available locally: 'stopwords' feeds the
# list built below; 'wordnet' is presumably needed by the WordNet lemmatiser
# used in preprocess_text.
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words plus project-specific filler tokens (greetings, the bank's
# own name, hesitation sounds).
stop_words = [
	*stopwords.words('english'),
	'from', 'mebank','um','umm','ummm','hi','hello','hey','heyyyyy','fyi',
]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amir.abdollahi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amir.abdollahi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Defining the functions

In [38]:
def preprocess_text(text, flg_clean=True, flg_tweet=True, flg_stemm=False, flg_lemm=False, lst_stopwords=None):
	"""Clean one tweet / feedback string and return it as a single string.

	Processing order:
	  1. (flg_tweet) run the `preprocessor` module's ``clean`` to drop URLs,
	     mentions, emojis, smileys and numbers (hashtags are kept).
	  2. (flg_clean) regex clean-up: e-mail addresses, mentions, links, the
	     "#" sign, tokens containing mis-encoded characters, repeated
	     whitespace; then lower-casing.
	  3. whitespace tokenisation, optional stop-word removal, optional
	     stemming, optional lemmatisation.
	  4. tokens are joined back with single spaces.

	Parameters
	----------
	text : str
		Raw text. ``None`` / NaN are treated as empty text; other non-string
		values are converted with ``str()``.
	flg_clean : bool
		Apply the regex clean-up and lower-casing.
	flg_tweet : bool
		Apply the `preprocessor` module (imported lazily, only when needed).
	flg_stemm : bool
		Apply NLTK's Porter stemmer to every token.
	flg_lemm : bool
		Apply NLTK's WordNet lemmatiser to every token.
	lst_stopwords : iterable of str or None
		Tokens to drop. Matching is exact and case-sensitive, and happens
		after lower-casing when ``flg_clean`` is set.

	Returns
	-------
	str
		The cleaned text; may be the empty string.
	"""
	# Missing values (None, or the float NaN pandas uses for empty cells)
	# would make every regex call below raise, so map them to empty text.
	if text is None or (isinstance(text, float) and text != text):
		return ""
	if not isinstance(text, str):
		text = str(text)

	## Tweet preprocessor
	if flg_tweet:
		import preprocessor as p
		# remove url, mention, emoji, smiley, and numbers (keeping hashtags)
		p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)
		text = p.clean(text)

	## Regex-based cleaning
	if flg_clean:

		# Remove e-mail addresses FIRST, while the "@" is still present
		# (stripping "@" beforehand made this step unreachable).
		text = re.sub(r'\S+@\S+\s?', '', text)

		# Remove mentions (@user), then any stray "@" left behind
		text = re.sub(r'@\w+', '', text)
		text = re.sub(r'@', '', text)

		# Remove links
		text = re.sub(r'http\S*', '', text)

		# Clean hashtags (only the "#" sign is removed, the word is kept)
		text = re.sub(r'#', '', text)

		# Remove whole tokens containing characters treated as encoding
		# debris (ü, ò, ä, ô), plus one following whitespace character
		text = re.sub(r'\S*[üòäô]\S*\s?', '', text)

		# Collapse new lines / repeated whitespace into single spaces
		text = re.sub(r'\s+', ' ', text)

		# Convert to lower case
		text = text.lower()

	## Tokenize (convert from string to list)
	lst_text = text.split()

	## Remove stop words
	if lst_stopwords is not None:
		# A set gives constant-time membership tests per token
		stopword_set = set(lst_stopwords)
		lst_text = [word for word in lst_text if word not in stopword_set]

	## Stemming (remove -ing, -ly, ...)
	if flg_stemm:
		ps = nltk.stem.porter.PorterStemmer()
		lst_text = [ps.stem(word) for word in lst_text]

	## Lemmatisation (convert the word into root word)
	if flg_lemm:
		lem = nltk.stem.wordnet.WordNetLemmatizer()
		lst_text = [lem.lemmatize(word) for word in lst_text]

	## Back to string from list
	return " ".join(lst_text)

### Reading the input data from AWS S3 


In [39]:
# S3 prefix shared by the read below and the two uploads later in this notebook.
s3_data_dir="s3://ds-rg271/data"
# Build the object URL with an explicit "/": os.path.join (imported above as
# `Path`) uses the host OS separator, which would corrupt the URL on Windows.
input_data_url = f"{s3_data_dir}/labelled/mebank_tweets_1_year_labelled.csv"
input_data=pd.read_csv(input_data_url)

In [40]:
# Report (rows, columns) of the raw frame exactly as read from the CSV.
print("Shape of the input data is:", input_data.shape)

Shape of the input data is: (897, 8)


### Removing duplicate records with the same content (tweet) and date 


In [41]:
# duplicated() flags every row that repeats an earlier (content, date) pair,
# so its sum is exactly the number of rows drop_duplicates() removes.
print("Number of removed duplicate records is:", int(input_data.duplicated(["content","date"]).sum()))
# NOTE(review): reset_index() without drop=True keeps the old row labels as an
# extra "index" column, which then travels with the frame downstream.
input_data = input_data.drop_duplicates(["content","date"]).reset_index()

Number of removed duplicate records is: 21


### Changing the format of the tweet dates to datetime

In [42]:
# Parse all timestamps in one vectorised pass instead of writing row by row
# through .loc[i], which only worked while the index was exactly 0..n-1.
# The "+00:00" UTC offset suffix is removed before parsing, as before.
# astype(str) keeps the cell re-runnable once the column is already datetime.
# The column ends up as datetime64 rather than object dtype.
input_data['date'] = pd.to_datetime(
	input_data['date'].astype(str).str.replace('+00:00', '', regex=False),
	format='%Y-%m-%d %H:%M:%S',
)

### Sorting the data based on date

In [43]:
# Order the tweets chronologically; ignore_index=True already relabels the
# rows 0..n-1, so no further index reset is needed.
input_data_sorted=input_data.sort_values(by=['date'],ignore_index=True)
print("Shape of the sorted data is:", input_data_sorted.shape)

Shape of the sorted data is: (876, 9)


### Keeping only complaint 1 and no-complaint 0 records (Removing 0.5 and -1 labels)

In [44]:
# Keep only rows whose complaint label is exactly 0 or 1, then renumber the
# surviving rows from 0.
input_data_01=input_data_sorted.loc[input_data_sorted["complaint"].isin([0, 1])].reset_index(drop=True)
input_data_01.shape

(842, 9)

### Ensure all labels are consistent by making them lower case and stripping leading and trailing whitespace


In [45]:
# Normalise topic labels: lower-case them and strip whitespace from both ends.
input_data_01 = input_data_01.assign(topic=input_data_01["topic"].str.lower().str.strip())

### Ensure that complaints are integer (0 or 1)

In [46]:
# Cast the complaint label to int; the earlier isin([0, 1]) filter leaves no
# missing or fractional values that would make the cast fail.
input_data_01 = input_data_01.astype({"complaint": int})

### Fixing the remaining "problem - other" labels (renamed to "problem/others")

In [47]:
# Rename the one remaining label variant; replace() matches whole values only,
# and the topics were lower-cased and stripped beforehand.
input_data_01["topic"] = input_data_01["topic"].replace("problem - other", "problem/others")

### Uploading the cleaned data to S3

In [48]:
# Write the cleaned, still un-preprocessed frame next to the labelled input on
# S3; index=False leaves the row labels out of the file.
input_data_01.to_csv(
	path_or_buf=f"{s3_data_dir}/labelled/mebank_tweets_1_year_cleaned.csv",
	index=False,
)

### Preprocessing the input text

In [49]:
# Work on a copy so input_data_01 stays untouched.
input_data_clean=input_data_01.copy()
# Clean every tweet in a single pass. Series.map aligns on the existing index,
# whereas the former .loc[i, ...] loop only worked with row labels 0..n-1 and
# never created the column at all for an empty frame.
input_data_clean['content_clean']=input_data_clean['content'].map(
	lambda tweet: preprocess_text(tweet, flg_clean=True, flg_tweet=True, flg_stemm=False, flg_lemm=False, lst_stopwords=None)
)
# Selecting the relevant features
input_data_clean=input_data_clean[['date','content','content_clean','complaint','topic']]

In [None]:
# (rows, columns) before records with an empty content_clean are dropped.
input_data_clean.shape

### Removing the empty records in "content_clean" 

In [None]:
# Turn empty strings into NaN (in every column), then drop the rows whose
# cleaned text is missing.
input_data_clean = (
	input_data_clean
	.replace("", float("NaN"))
	.dropna(subset=["content_clean"])
)

In [None]:
# (rows, columns) after the records with an empty content_clean were dropped.
input_data_clean.shape

### Uploading the preprocessed data to S3

In [50]:
# Write the preprocessed frame to the "preprocessed" folder on S3;
# index=False leaves the row labels out of the file.
input_data_clean.to_csv(
	path_or_buf=f"{s3_data_dir}/preprocessed/mebank_tweets_1_year_preprocessed.csv",
	index=False,
)