# Code to Clean the Raw Instagram Dataset

## Import Packages and Files

In [1]:
#import necessary packages
import re

import pandas as pd

In [2]:
# Locations of the three input files: the raw Instagram export, the crime
# dataset (source of the known area names and their coordinates) and the
# keyword lists used to recognise each incident type
file_location = 'C:/Users/pkhu624/OneDrive - The University of Auckland/399/Insta+FB/Raw Instagram Data.csv'
crime_data_location = 'C:/Users/pkhu624/OneDrive - The University of Auckland/399/Crime Data/Crime_Data.csv'
Keyword_Extraction_Location = 'C:/Users/pkhu624/OneDrive - The University of Auckland/399/Insta+FB/Keywords Extraction Analysis.csv'

# Load the raw Instagram posts
Insta_df = pd.read_csv(file_location)

# Load the crime dataset, which supplies the area names to search for
location_df = pd.read_csv(crime_data_location)

# Show the raw Instagram data before any cleaning
display(Insta_df)

# Load the keyword extraction file (one column of keywords per incident type)
extraction_df = pd.read_csv(Keyword_Extraction_Location)

Unnamed: 0,alt,caption,childPosts/0/alt,childPosts/0/caption,childPosts/0/commentsCount,childPosts/0/dimensionsHeight,childPosts/0/dimensionsWidth,childPosts/0/displayUrl,childPosts/0/firstComment,childPosts/0/id,...,taggedUsers/11/is_verified,taggedUsers/11/profile_pic_url,taggedUsers/11/username,timestamp,type,url,videoDuration,videoPlayCount,videoUrl,videoViewCount
0,,Allow us to introduce you to #PaddyGowerHasIss...,,,0.0,1350.0,1080.0,https://scontent-bos5-1.cdninstagram.com/v/t51...,,3.107920e+18,...,,,,2023-05-21T23:49:40.000Z,Sidecar,https://www.instagram.com/p/CshjPSVPsDK/,,,,
1,,🌀There remains a lot of uncertainty about whet...,,,0.0,774.0,1080.0,https://instagram.fpsr2-1.fna.fbcdn.net/v/t51....,,3.102085e+18,...,,,,2023-05-13T22:35:53.000Z,Sidecar,https://www.instagram.com/p/CsM0b6QSNgf/,,,,
2,,Rugby coach Aaron Mauger is just looking forwa...,,,,,,,,,...,,,,2023-05-22T07:04:17.000Z,Video,https://www.instagram.com/p/CsiU84lhe61/,48.469,5758.0,https://instagram.fcgh14-1.fna.fbcdn.net/v/t66...,2715.0
3,,🚨⚡️A cluster of thunderstorms moving towards #...,,,0.0,691.0,1080.0,https://scontent-lax3-1.cdninstagram.com/v/t51...,,3.099495e+18,...,,,,2023-05-10T08:50:56.000Z,Sidecar,https://www.instagram.com/p/CsDnpFlrcCM/,,,,
4,,⚡️⚡️Scattered thunderstorms moving onshore ton...,,,,,,,,,...,,,,2023-05-10T07:20:30.000Z,Image,https://www.instagram.com/p/CsDdSxYrCEO/,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,,"Former Warriors player Manu Vatuvei, who was j...",,,,,,,,,...,,,,2023-05-17T04:37:46.000Z,Image,https://www.instagram.com/p/CsVMNTaM-Mu/,,,,
276,Lest we forget.,"Today, we remember and reflect. #anzacday2023",,,,,,,,,...,,,,2023-04-24T17:50:10.000Z,Image,https://www.instagram.com/p/CrbYoi-tOyb/,,,,
277,,"👋 As we part for the holidays, we're sharing w...",,,,,,,,,...,,,,2022-12-21T23:00:29.000Z,Video,https://www.instagram.com/p/CmcpjNrLZoa/,52.395,2562.0,https://instagram.famm6-1.fna.fbcdn.net/o1/v/t...,623.0
278,,"If summer is hot tap and winter is cold tap, t...",,,0.0,896.0,1080.0,https://instagram.fdnk3-2.fna.fbcdn.net/v/t51....,,3.087591e+18,...,,,,2023-04-23T22:39:55.000Z,Sidecar,https://www.instagram.com/p/CrZU_3PSWWr/,,,,


In [3]:
# Columns needed by the later cleaning steps; every other column is discarded
columns_to_keep = ['caption', 'hashtags/0', 'hashtags/1', 'hashtags/2', 'hashtags/3', 'likesCount', 'timestamp', 'videoViewCount', 'commentsCount']
is_unneeded = ~Insta_df.columns.isin(columns_to_keep)
Insta_df = Insta_df.drop(columns=Insta_df.columns[is_unneeded])

## Specify Functions used to process Dataframes

In [4]:
#Function to only keep rows of a dataframe that contain any of the target words
def recognise_text(targetwords, dataframe, columns):
    """Return the rows of ``dataframe`` that mention any of ``targetwords``.

    A row is kept when at least one of the given columns contains at least
    one target word as a case-insensitive substring. Each matching row is
    returned once, in its original order, even when several columns match.

    Parameters
    ----------
    targetwords : iterable of str
        Words or phrases to look for. They are matched literally (not as
        regular expressions); missing and empty entries are ignored.
    dataframe : pandas.DataFrame
        The frame to filter. It is not modified.
    columns : iterable of str
        Names of the text columns to search.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows with the same columns as ``dataframe``.
        Empty when there are no usable target words or nothing matches.
    """
    # Escape each word so characters such as '.', '+' or '(' are matched
    # literally instead of being interpreted as regular-expression syntax.
    words = [
        re.escape(str(word))
        for word in targetwords
        if pd.notna(word) and str(word) != ''
    ]

    # An empty pattern would match every non-null string, so with nothing to
    # search for there is nothing to keep.
    if not words:
        return dataframe.iloc[0:0].copy()
    pattern = '|'.join(words)

    row_matches = None
    for column in columns:
        values = dataframe[column]

        # A column holding only missing values is read as numeric and has no
        # .str accessor; it cannot contain any target word, so skip it.
        if pd.api.types.is_numeric_dtype(values):
            continue

        hits = values.str.contains(pattern, case=False, na=False).to_numpy(dtype=bool)
        row_matches = hits if row_matches is None else (row_matches | hits)

    if row_matches is None:
        return dataframe.iloc[0:0].copy()

    # Copy so callers can add columns to the result without touching the input
    return dataframe[row_matches].copy()

In [5]:
#function to detect and add which Area the row in the dataframe refers to
def add_area_unit_column(locations, dataframe, columns):
    """Tag each row of ``dataframe`` with the location name found in its text.

    An ``'Area Unit'`` column is added to ``dataframe`` in place (initialised
    to the empty string) and then filled in: for every searched column and
    every location, rows whose text contains the location name as a
    case-insensitive substring get that location written to ``'Area Unit'``.
    When a row mentions several locations, the last match in iteration order
    (columns outermost, locations innermost) is the one that remains.

    Parameters
    ----------
    locations : iterable of str
        Location names to look for. They are matched literally (not as
        regular expressions); missing and empty entries are ignored.
    dataframe : pandas.DataFrame
        The frame to annotate. It is modified in place.
    columns : iterable of str
        Names of the text columns to search.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, now carrying the ``'Area Unit'`` column.
    """
    #add an empty Area Unit column to dataframe
    dataframe['Area Unit'] = ''

    for column in columns:
        values = dataframe[column]

        # A column holding only missing values is read as numeric and has no
        # .str accessor; it cannot mention any location, so skip it.
        if pd.api.types.is_numeric_dtype(values):
            continue

        for location in locations:
            # A missing name cannot be searched for, and an empty name would
            # match every row and wipe out earlier matches.
            if pd.isna(location) or str(location) == '':
                continue

            # Escape the name so characters such as '.' or '(' are literal
            mask = values.str.contains(re.escape(str(location)), case=False, na=False)

            # Use a positional boolean array so duplicate index labels are safe
            dataframe.loc[mask.to_numpy(dtype=bool), 'Area Unit'] = location

    return dataframe

In [6]:
#function to map the Area of each row to a longitude and latitude
def map_coordinates(df1, df2, area_unit_col):
    """Copy ``Longitude``/``Latitude`` from ``df2`` onto ``df1`` by area name.

    For every row of ``df1`` whose ``area_unit_col`` value also appears in
    ``df2[area_unit_col]``, the coordinates of the first such row in ``df2``
    are written to ``df1``'s ``'Longitude'`` and ``'Latitude'`` columns.
    Rows without a match keep whatever coordinates they already had; if the
    coordinate columns do not exist yet they are created, with missing values
    for unmatched rows.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame to annotate. It is modified in place.
    df2 : pandas.DataFrame
        Reference frame holding ``area_unit_col``, ``'Longitude'`` and
        ``'Latitude'`` columns.
    area_unit_col : str
        Name of the column, present in both frames, that holds the area name.

    Returns
    -------
    None
    """
    # Build the area -> coordinates lookup once instead of filtering df2 for
    # every row of df1. Keeping the first occurrence mirrors taking the first
    # matching row; rows with a missing area name can never be matched.
    lookup = (
        df2.dropna(subset=[area_unit_col])
        .drop_duplicates(subset=area_unit_col, keep='first')
        .set_index(area_unit_col)
    )

    areas = df1[area_unit_col]
    matched = areas.isin(lookup.index).to_numpy(dtype=bool)

    for coordinate in ('Longitude', 'Latitude'):
        mapped = areas.map(lookup[coordinate]).to_numpy()

        if coordinate in df1.columns:
            # Only overwrite matched rows; assign by position so rows that
            # share an index label each receive their own value.
            df1.loc[matched, coordinate] = mapped[matched]
        else:
            df1[coordinate] = mapped

## Use Functions to Filter Locations

In [7]:
#arguments to be passed into functions
# Drop missing area names first: a NaN entry cannot be used as a search term
# and would break the text matching performed with this list.
locations_to_search = location_df['Area Unit'].dropna().unique()
columns_to_search = ['caption', 'hashtags/0', 'hashtags/1', 'hashtags/2', 'hashtags/3']

In [8]:
# Step 1: keep only the posts whose caption or hashtags mention a known area
Location_rows = recognise_text(
    targetwords=locations_to_search,
    dataframe=Insta_df,
    columns=columns_to_search,
)

# Step 2: record which area each remaining post refers to
Insta_Location = add_area_unit_column(
    locations=locations_to_search,
    dataframe=Location_rows,
    columns=columns_to_search,
)

# Step 3: attach the longitude/latitude of that area from the crime dataset
map_coordinates(df1=Insta_Location, df2=location_df, area_unit_col='Area Unit')

## Use Functions to Filter Incident Type

In [9]:
# Text columns searched for incident keywords
columns_to_search = ['caption', 'hashtags/0', 'hashtags/1', 'hashtags/2', 'hashtags/3']

# Lower-case the captions
Insta_Location['caption'] = Insta_Location['caption'].str.lower()

# Keyword list per incident type: the first ten columns of the keyword
# extraction file, taken by position, with the blank cells removed
(
    fire,
    flood,
    storm,
    landslide,
    harass,
    battery,
    theft,
    burglary,
    robbery,
    sexual_assault,
) = (
    extraction_df.iloc[:, position].dropna().tolist()
    for position in range(10)
)

In [10]:
def _rows_for_incident(keywords, incident_label):
    """Return the located posts that mention any keyword, tagged with the incident name."""
    matching_rows = recognise_text(keywords, Insta_Location, columns_to_search)
    return matching_rows.assign(Incident=incident_label)


# One labelled dataframe per incident type
filtered_fire = _rows_for_incident(fire, 'Fire')
filtered_flood = _rows_for_incident(flood, 'Flood')
filtered_storm = _rows_for_incident(storm, 'Storm')
filtered_landslide = _rows_for_incident(landslide, 'Landslide')
filtered_harass = _rows_for_incident(harass, 'Harassment')
filtered_battery = _rows_for_incident(battery, 'Battery')
filtered_theft = _rows_for_incident(theft, 'Theft')
filtered_burglary = _rows_for_incident(burglary, 'Burglary')
filtered_robbery = _rows_for_incident(robbery, 'Robbery')
filtered_sexual_assault = _rows_for_incident(sexual_assault, 'Sexual Assault')

# Concatenate the per-incident rows into one combined dataframe. The order of
# this list decides which incident comes first for a post that matched
# several incident types.
incident_frames = [
    filtered_fire,
    filtered_landslide,
    filtered_flood,
    filtered_storm,
    filtered_harass,
    filtered_battery,
    filtered_sexual_assault,
    filtered_theft,
    filtered_burglary,
    filtered_robbery,
]
filtered_df = pd.concat(incident_frames)


## Formatting Dataset

In [11]:
# Number of distinct captions among the labelled posts
unique_count = filtered_df['caption'].nunique()

# Keep a single row per caption (the first occurrence wins)
deduplicated = filtered_df.drop_duplicates(subset='caption')

# Parse the post timestamp once; the date, hour and weekday derive from it.
# NOTE(review): the raw timestamps shown in the input carry a 'Z' suffix, so
# the Date/Hour values appear to be UTC rather than local time -- confirm
# that this is intended.
posted_at = pd.to_datetime(deduplicated['timestamp'])
posted_date = posted_at.dt.date

# Engagement counts as integers, with missing values treated as 0
engagement_counts = {
    count_column: deduplicated[count_column].fillna(0).astype(int)
    for count_column in ('likesCount', 'videoViewCount', 'commentsCount')
}

# Columns that are not part of the cleaned output
cols_to_drop = ['caption', 'timestamp', 'hashtags/0', 'hashtags/1', 'hashtags/2', 'hashtags/3', 'commentsCount', 'likesCount', 'videoViewCount']

filtered_df = (
    deduplicated
    .assign(
        timestamp=posted_at,
        Date=posted_date,
        # hour of day rendered as a 12-hour clock label such as '06 AM'
        Hour=pd.to_datetime(posted_at.dt.hour, format='%H').dt.strftime('%I %p'),
        # three-letter weekday abbreviation such as 'Tue'
        **{'Occurrence Day Of Week': pd.to_datetime(posted_date).dt.day_name().str[:3]},
        **engagement_counts,
    )
    .drop(columns=cols_to_drop)
)

## Export Clean Dataset to a file

In [12]:
# Write the cleaned dataset next to the notebook (the row index is not
# written to the file), then show the frame that was saved
clean_output_file = 'Clean Instagram Data.csv'
filtered_df.to_csv(clean_output_file, index=False)

display(filtered_df)

Unnamed: 0,Area Unit,Longitude,Latitude,Incident,Date,Hour,Occurrence Day Of Week
111,Newmarket,174.774419,-36.868019,Fire,2023-04-25,06 AM,Tue
134,Newton,174.758496,-36.858819,Fire,2023-05-15,09 PM,Mon
240,St Lukes,174.734919,-36.876196,Fire,2023-05-17,02 AM,Wed
274,Freemans Bay,174.75403,-36.852258,Fire,2023-05-17,03 AM,Wed
116,Waima,174.64088,-36.939234,Flood,2023-05-19,03 AM,Fri
6,Hunua,174.965634,-37.149413,Storm,2023-05-09,02 AM,Tue
28,Warkworth,174.659543,-36.405593,Storm,2023-05-09,04 AM,Tue
144,Chelsea,174.710926,-36.812326,Storm,2023-04-18,08 PM,Tue
209,Albany,174.70898,-36.723639,Theft,2023-04-20,03 AM,Thu
224,Albany,174.70898,-36.723639,Theft,2023-04-05,05 AM,Wed
