In [None]:
import json
import time
import pandas as pd
import numpy as np
import re
from collections import Counter

In [None]:
# Load the suburb lookup. The context manager closes the file handle, which the
# bare open() call inside json.load() left open.
with open('sal.json') as sal_file:
    sal_data = json.load(sal_file)

# Snapshot the keys as a list: a concrete, ordered sequence is safe to iterate
# repeatedly and to hand to pandas as a column.
raw_suburbs = list(sal_data)

# Parallel accumulators, filled with one entry per raw suburb key.
clean_suburbs = []
suburb_extra_info_1 = []
suburb_extra_info_2 = []
suburb_gcc = []
states = []

# Maps a lower-case state/territory abbreviation to its lower-case full name.
# The full names are compared against lower-cased location strings, so the
# spelling must be exact.
state_names = {
    'nsw': 'new south wales',
    'vic': 'victoria',
    'qld': 'queensland',
    'tas': 'tasmania',
    'wa': 'western australia',
    'sa': 'south australia',
    'act': 'australian capital territory',
    'nt': 'northern territory',
}

# Patterns are written as raw strings so the backslashes reach the regex engine
# unchanged; in a normal string "\(" is an invalid escape sequence and triggers
# a warning on newer Python versions.
_bracket_info_re = re.compile(r"\([\w\-\ .]+\)")   # a "(...)" suffix in the name
_region_prefix_re = re.compile(r"[\w\ ]+\ \-")     # "<region> -" inside the brackets
_after_dash_re = re.compile(r"\-\ \w+")            # "- <word>" inside the brackets

# Split every raw suburb key into a clean name plus up to two pieces of
# bracketed extra information. Exactly one value is appended to each of the
# four accumulator lists per suburb, so they stay aligned.
for suburb in raw_suburbs:
    suburb_gcc.append(sal_data[suburb]['gcc'])

    extra_info = _bracket_info_re.search(suburb)

    # No bracketed info: keep the name as is, with no extra info.
    if extra_info is None:
        clean_suburbs.append(suburb)
        suburb_extra_info_1.append(None)
        suburb_extra_info_2.append(None)
        continue

    bracket_text = extra_info.group(0)
    clean_suburbs.append(suburb.replace(bracket_text, '').strip())

    region = _region_prefix_re.search(bracket_text)

    # Brackets hold a single piece of info: strip the brackets and dots.
    if region is None:
        suburb_extra_info_2.append(None)
        suburb_extra_info_1.append(
            bracket_text.replace(')', '').replace('(', '').replace('.', '')
        )
        continue

    # Brackets hold "<region> - <word>": the region goes to the second info
    # column, the word after the dash to the first.
    suburb_extra_info_2.append(region.group(0).replace('-', '').strip())
    after_dash = _after_dash_re.search(bracket_text)
    if after_dash is None:
        # Nothing usable follows the dash; record a gap instead of crashing
        # on a missing match object.
        suburb_extra_info_1.append(None)
    else:
        suburb_extra_info_1.append(
            after_dash.group(0).replace('-', '').replace('.', '').strip()
        )

# One row per raw suburb key. list() guarantees pandas receives an ordered
# sequence even when raw_suburbs is a dict key view.
suburbs_df = pd.DataFrame({
    'raw_suburb': list(raw_suburbs),
    'gcc': suburb_gcc,
    'clean_suburb': clean_suburbs,
    'info_1': suburb_extra_info_1,
    'info_2': suburb_extra_info_2,
})

# Raw suburb entries dropped before building the lookups.
# NOTE(review): the reason for excluding exactly these entries is not visible
# here - confirm.
excluded_raw_suburbs = [
    'belconnen (act)',
    'canberra (act)',
    'gungahlin (act)',
    'hall (act)',
    'perth',
]
suburbs_df = suburbs_df[~suburbs_df['raw_suburb'].isin(excluded_raw_suburbs)]

# Find clean suburb names that occur in more than one location: count the raw
# entries per clean name and attach that count to every row.
agg_df = (
    suburbs_df
    .groupby('clean_suburb')
    .count()
    .reset_index()
    .rename(columns={'raw_suburb': 'suburb_name_instance_count'})
)
suburbs_df = pd.merge(
    suburbs_df,
    agg_df[['clean_suburb', 'suburb_name_instance_count']],
    on='clean_suburb',
    how='left',
)

# Manual overrides: force the state abbreviation for these two raw names.
for raw_name in ('jerrabomberra', 'coree'):
    suburbs_df.loc[suburbs_df['raw_suburb'] == raw_name, 'info_1'] = 'nsw'

# Full state name where info_1 is a known abbreviation, NaN otherwise.
suburbs_df['state'] = suburbs_df['info_1'].map(state_names)

# clean suburb name -> gcc, only for names that occur exactly once.
is_unique_name = suburbs_df['suburb_name_instance_count'] == 1
unique_suburb_dict = (
    suburbs_df.loc[is_unique_name]
    .set_index('clean_suburb')['gcc']
    .to_dict()
)

# region name (second info column) -> gcc.
region_names_dict = (
    suburbs_df.loc[suburbs_df['info_2'].notnull()]
    .set_index('info_2')['gcc']
    .to_dict()
)

# (clean suburb name, full state name) -> gcc, for non-unique names that
# carry bracketed info.
non_unique_with_state = (
    suburbs_df.loc[suburbs_df['info_1'].notnull() & ~is_unique_name]
    .set_index(['clean_suburb', 'state'])['gcc']
    .to_dict()
)

In [None]:
# Path of the tweet dump that is opened and scanned line by line further down.
twitter_file = "twitter-huge.json"

In [None]:
# Search keywords grouped by the topic they are reported under. Keywords are
# matched as lower-case substrings of the tweet text, so leading spaces
# (as in ' rba') are significant.
_keywords_by_topic = {
    'interest rate': (
        'interest rate',
        ' rba',
        'rba decision',
        "rba's decision",
        'cash rate',
        'interest payment',
        'interest repayment',
        'interest re-payment',
        'repayment of interest',
        'variable interest',
        'fixed interest',
        'bank interest',
        'rate hike',
    ),
    'housing': (
        'mortgage',
        'rent payment',
        'house rent',
        'houserent',
        'house payment',
        'housing',
    ),
    'inflation': (
        'inflation',
        'cpi index',
        'cost of living',
        'shrinkflation',
    ),
    'social security': (
        'social security',
        'job seeker',
        'jobseeker',
        'youth allowance',
        'austudy',
        'centrelink',
        'centerlink',
    ),
}

# Flattened keyword -> topic lookup; insertion order follows the grouping above.
topic_map = {
    keyword: topic
    for topic, keywords in _keywords_by_topic.items()
    for keyword in keywords
}

In [None]:
def _lookup_gcc(location):
    """Resolve a place's full name to a gcc code, or None when it is unknown.

    `location` is a comma-separated string whose first field is treated as the
    suburb (or region) name. Unique suburb names and region names are resolved
    directly; otherwise the last field of a 2- or 3-field location is used as
    the state to disambiguate the suburb.
    """
    parts = location.lower().split(',')
    suburb = parts[0]

    if suburb in unique_suburb_dict:
        return unique_suburb_dict[suburb]
    if suburb in region_names_dict:
        return region_names_dict[suburb]

    # Suburb name is not unique: pick the state field by the number of fields.
    if len(parts) == 2:
        state = parts[1].strip()
    elif len(parts) == 3:
        state = parts[2].strip()
    else:
        return None
    return non_unique_with_state.get((suburb, state))


start_time = time.time()

kwd_list = topic_map.keys()
print(kwd_list)
print("Number of search tearms :", len(kwd_list))

# Parallel column accumulators: every matched (tweet, keyword) pair appends
# exactly one value to each list.
timestamp_data = []
text_data = []
location_data = []
gcc_data = []
search_term_data = []
tweet_ids_data = []
sentiment_data = []
author_id_data = []
coordinates_data = []

# The context manager closes the input file even if parsing fails midway.
with open(twitter_file, encoding="utf8") as f:
    for line in f:
        # Pull the text field out with a regex first, so only lines that
        # contain a keyword are parsed as JSON.
        text_re = re.search(r'},"text":"(.*)","sentiment"', line)
        if text_re is None:
            continue

        text = text_re.group(1).lower()
        matched_kwds = [kwd for kwd in kwd_list if kwd in text]
        if not matched_kwds:
            continue

        # Parse once per line rather than once per matching keyword.
        tweet = json.loads(line.replace(',\n', ''))
        tweet_data = tweet['doc']['data']

        # Coordinates are optional; a missing or null level yields None.
        try:
            coordinates = tweet_data['geo']['coordinates']['coordinates']
        except (KeyError, TypeError):
            coordinates = None

        # Place info is optional too. Location and gcc are resolved into
        # locals first, so each list gets exactly one append per row.
        try:
            location = tweet['doc']['includes']['places'][0]['full_name']
        except (KeyError, IndexError, TypeError):
            location = None
        gcc = _lookup_gcc(location) if isinstance(location, str) else None

        # One output row per keyword found in the tweet text.
        for kwd in matched_kwds:
            tweet_ids_data.append(tweet["id"])
            author_id_data.append(tweet_data['author_id'])
            timestamp_data.append(tweet_data['created_at'])
            text_data.append(tweet_data['text'])
            search_term_data.append(kwd)
            sentiment_data.append(tweet_data['sentiment'])
            coordinates_data.append(coordinates)
            location_data.append(location)
            gcc_data.append(gcc)

end_time = time.time()
print(f"The job took {round(end_time - start_time, 3)} seconds to complete")

In [None]:
# Assemble the collected columns into one frame; the lists are parallel, so
# each index position forms one row.
df = pd.DataFrame({
    'id': tweet_ids_data,
    'author_id': author_id_data,
    'timestamp': timestamp_data,
    'text': text_data,
    'search_term': search_term_data,
    'location': location_data,
    'coordinates': coordinates_data,
    'sentiment': sentiment_data,
    'gcc': gcc_data,
})

# Topic of the keyword that matched the tweet.
df['topic'] = df['search_term'].map(topic_map)

# Parse the timestamp strings and break them out into calendar parts.
df['timestamp'] = pd.to_datetime(df['timestamp'], format="%Y-%m-%dT%H:%M:%S.000Z")
timestamp_parts = df['timestamp'].dt
for part in ('year', 'month', 'day', 'hour'):
    df[part] = getattr(timestamp_parts, part)
df['week'] = timestamp_parts.isocalendar().week

# State abbreviation for locations that name a whole state or territory.
state_by_location = {
    'Victoria, Australia': 'vic',
    'New South Wales, Australia': 'nsw',
    'Western Australia, Australia': 'wa',
    'South Australia, Australia': 'sa',
    'Northern Territory, Australia': 'nt',
    'Tasmania, Australia': 'tas',
    'Australian Capital Territory, Australia': 'act',
    'Queensland, Australia': 'qld',
}

# State abbreviation implied by each gcc code.
state_by_gcc = {
    '1gsyd': 'nsw',
    '1rnsw': 'nsw',
    '2gmel': 'vic',
    '2rvic': 'vic',
    '3gbri': 'qld',
    '3rqld': 'qld',
    '4gade': 'sa',
    '4rsau': 'sa',
    '5gper': 'wa',
    '5rwau': 'wa',
    '6ghob': 'tas',
    '6rtas': 'tas',
    '7gdar': 'nt',
    '7rnte': 'nt',
    '8acte': 'act',
}

# Prefer the state read from the location; fall back to the one implied by
# the gcc code where the location did not name a state.
df['state'] = df['location'].map(state_by_location)
state_from_gcc = df['gcc'].map(state_by_gcc)
df['state'] = np.where(df['state'].isna(), state_from_gcc, df['state'])

In [None]:
# A tweet appears once per keyword it matched. Shuffle the rows before
# de-duplicating on id so the kept row is not always the first keyword in
# dictionary order. The fixed seed keeps the shuffle, and therefore the
# surviving search_term/topic per tweet, identical across re-runs.
df = df.sample(frac=1, random_state=42)
df = df.drop_duplicates(subset='id', keep="first")

In [None]:
#save as json
# Serialise the frame to records via pandas, wrap them under a top-level
# "docs" key, and write the file once. This avoids writing the bare record
# list to disk only to read it back and overwrite it.
data = json.loads(df.to_json(orient='records'))
data_dict = {'docs': data}
with open('twitter.json', 'w', encoding="utf8") as f:
    json.dump(data_dict, f)