# Clustering Social Media Data - Cleaning Data

Importing libraries

In [9]:
import numpy as np
import pandas as pd
import pickle
from os import path as Path
import datetime
import re

# Importing Data

In [4]:
# Root S3 location under which the notebook looks up its input data.
s3_data_dir = "s3://ds-rg271/data"

In [6]:
# Input data file: the labelled tweets CSV under the S3 data directory.
# The URI is built with an explicit "/" rather than os.path.join, because
# os.path.join uses the host OS separator (a backslash on Windows) and S3 URIs
# always use forward slashes.
input_data_uri = f"{s3_data_dir}/labelled/mebank_tweets_1_year_labelled.csv"
print(input_data_uri)

s3://ds-rg271/data/labelled/mebank_tweets_1_year_labelled.csv


In [7]:
# Load the labelled tweets into a DataFrame and preview the last row.
input_data = pd.read_csv(filepath_or_buffer=input_data_uri)
input_data.tail(n=1)

Unnamed: 0.1,Unnamed: 0,date,content,complaint,topic,content_type,user,url
896,896,2020-09-21 02:30:03+00:00,"ME Bank looks to e-signatures, digital loan pr...",0.0,,twitter/mention,MrtgBusiness,https://twitter.com/MrtgBusiness/status/130786...


# Cleaning Data

In [10]:
# standardizing date-time format
# The raw 'date' column holds strings. The row previewed after loading looks
# like "2020-09-21 02:30:03+00:00", which a single ISO "...T...Z" pattern cannot
# parse, so both layouts are accepted and normalised to naive datetimes.
_DATE_FORMATS = (
    "%Y-%m-%dT%H:%M:%S.%fZ",  # e.g. 2020-09-21T02:30:03.000Z
    "%Y-%m-%d %H:%M:%S%z",    # e.g. 2020-09-21 02:30:03+00:00
)


def _parse_tweet_date(date_value):
    """Convert one raw 'date' entry to a timezone-naive datetime.

    Parameters
    ----------
    date_value : str or datetime.datetime
        A timestamp string in one of ``_DATE_FORMATS``, or an already parsed
        datetime (which happens when this cell is re-run).

    Returns
    -------
    datetime.datetime
        The parsed timestamp without tzinfo. Offset-aware inputs are converted
        to UTC before the offset is dropped, so all values stay comparable.

    Raises
    ------
    ValueError
        If a string matches none of the known formats.
    """
    if isinstance(date_value, datetime.datetime):
        # Already parsed: keep the value so re-running the cell is harmless.
        parsed = date_value
    else:
        parsed = None
        for date_format in _DATE_FORMATS:
            try:
                parsed = datetime.datetime.strptime(date_value, date_format)
            except ValueError:
                continue
            break
        if parsed is None:
            raise ValueError(f"Unrecognised date format: {date_value!r}")

    if parsed.tzinfo is not None:
        # Naive and aware datetimes cannot be compared when sorting later.
        parsed = parsed.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    return parsed


# Apply column-wise instead of writing row by row through .loc[i, ...], which
# relied on the index labels being exactly 0..n-1.
input_data['date'] = input_data['date'].map(_parse_tweet_date)



In [11]:
# Order the rows chronologically; ignore_index=True renumbers them 0..n-1.
input_data_sorted = input_data.sort_values("date", ignore_index=True)
input_data_sorted.shape

(897, 8)

In [15]:
# Keep only the rows whose complaint label is 0 or 1 (this removes the 0.5
# and -1 labels), then renumber the remaining rows.
input_data_01 = (
    input_data_sorted
    .loc[input_data_sorted["complaint"].isin([0, 1])]
    .reset_index(drop=True)
)
print(input_data_01.shape)
input_data_01.tail(1)

(862, 8)


Unnamed: 0.1,Unnamed: 0,date,content,complaint,topic,content_type,user,url
861,573,2021-07-25 09:22:13,Making beautiful banking and helping Australi...,0.0,,twitter/mention,sandybeech4,https://twitter.com/sandybeech4/status/1419226...


In [20]:
# Normalise topic labels: lower-case them, strip surrounding whitespace, and
# map the stray "problem - other" spelling onto "problem/others".
input_data_01["topic"] = (
    input_data_01["topic"]
    .str.lower()
    .str.strip()
    .mask(lambda topic: topic == "problem - other", "problem/others")
)

# Store the complaint label as an integer.
input_data_01["complaint"] = input_data_01["complaint"].astype(int)

In [17]:
# Preview the last row of the cleaned frame.
input_data_01.iloc[-1:]

Unnamed: 0.1,Unnamed: 0,date,content,complaint,topic,content_type,user,url
861,573,2021-07-25 09:22:13,Making beautiful banking and helping Australi...,0,,twitter/mention,sandybeech4,https://twitter.com/sandybeech4/status/1419226...


In [18]:
# Write the cleaned data to the local data directory, without the index column.
input_data_01.to_csv(
    path_or_buf="../data/mebank_tweets_1_year_clean.csv",
    index=False,
)

# Optional: write the cleaned data to the S3 bucket instead (disabled).
#input_data_01.to_csv(f"{s3_data_dir}/labelled/mebank_tweets_1_year_cleaned.csv", index=False)