## **Scraping data about mass shootings in the US from 1 January 2023 to 3 December 2023**

## Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import seaborn as sns
from selenium.webdriver.support import expected_conditions as EC

## Scraping code

In [67]:
# Raw string so the backslashes in the Windows path are taken literally
# ("\w" and "\c" are not valid escape sequences in a regular string literal).
s = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=s)

scraped_data = []

# Output keys in table order: the key at position i is read from the row's td[i].
# NOTE(review): "city_or_country" may be intended as "city_or_county" — confirm
# before renaming, since the key becomes a DataFrame/CSV column name.
columns = [
    "incident_id",
    "incident_date",
    "state",
    "city_or_country",
    "address",
    "victims_killed",
    "victims_injured",
    "suspects_killed",
    "suspects_injured",
    "suspects_arrested",
]

rows_xpath = '//table[@class="responsive sticky-enabled tableheader-processed sticky-table"]//tbody//tr'

try:
    # The implicit wait is a driver-level setting that applies to every later
    # element lookup, so it is configured once instead of on every page.
    driver.implicitly_wait(5)

    for page in range(0, 25):
        page_url = f"https://www.gunviolencearchive.org/reports/mass-shooting?page={page}"
        driver.get(page_url)
        print(f'Getting data from page {page}. Awaiting...')

        data_rows = driver.find_elements(By.XPATH, rows_xpath)
        for data_row in data_rows:
            # One record per table row; each value is the stripped text of the
            # cell at the column's 1-based position.
            scraped_data.append({
                column: data_row.find_element(By.XPATH, f'.//td[{position}]').text.strip()
                for position, column in enumerate(columns, start=1)
            })

        # Pause between page requests.
        time.sleep(1)
finally:
    # Always close the browser, even if a page load or element lookup raises.
    driver.quit()

Getting data from page 0. Awaiting...
Getting data from page 1. Awaiting...
Getting data from page 2. Awaiting...
Getting data from page 3. Awaiting...
Getting data from page 4. Awaiting...
Getting data from page 5. Awaiting...
Getting data from page 6. Awaiting...
Getting data from page 7. Awaiting...
Getting data from page 8. Awaiting...
Getting data from page 9. Awaiting...
Getting data from page 10. Awaiting...
Getting data from page 11. Awaiting...
Getting data from page 12. Awaiting...
Getting data from page 13. Awaiting...
Getting data from page 14. Awaiting...
Getting data from page 15. Awaiting...
Getting data from page 16. Awaiting...
Getting data from page 17. Awaiting...
Getting data from page 18. Awaiting...
Getting data from page 19. Awaiting...
Getting data from page 20. Awaiting...
Getting data from page 21. Awaiting...
Getting data from page 22. Awaiting...
Getting data from page 23. Awaiting...
Getting data from page 24. Awaiting...


## Cleaning the data

In [68]:
# Build the working frame from the list of per-row dicts collected by the scraper.
df = pd.DataFrame(data=scraped_data)

In [69]:
# Discard the "incident_id" column; errors="ignore" keeps this a no-op when the
# column is absent, so the cell can be re-run safely.
df = df.drop(columns=["incident_id"], errors="ignore")

In [70]:
# Show the column dtypes, a blank gap, then the frame's (rows, columns) shape.
print(df.dtypes, '\n', f'shape: {df.shape}', sep='\n')

incident_date        object
state                object
city_or_country      object
address              object
victims_killed       object
victims_injured      object
suspects_killed      object
suspects_injured     object
suspects_arrested    object
dtype: object


shape: (621, 9)


In [71]:
# Preview the first five scraped rows (values are still strings at this point).
df.head(n=5)

Unnamed: 0,incident_date,state,city_or_country,address,victims_killed,victims_injured,suspects_killed,suspects_injured,suspects_arrested
0,"December 1, 2023",Nevada,Las Vegas,E Charleston Blvd and N Honolulu St,1,4,0,0,0
1,"November 29, 2023",Illinois,Chicago,3600 block of West Flournoy St,1,3,0,0,0
2,"November 29, 2023",North Carolina,Lexington,7000 block of NC-8,0,5,0,0,0
3,"November 26, 2023",North Carolina,Winston Salem (Winston-salem),3533 N Glenn Ave,0,4,0,1,1
4,"November 25, 2023",California,Fontana,15500 block of Eastwind Ave,3,1,0,0,1


In [72]:
def _to_numeric_if_possible(column):
    """Return ``column`` converted with ``pd.to_numeric``, or unchanged if it cannot be parsed.

    Explicit replacement for ``pd.to_numeric(..., errors='ignore')``, which is
    deprecated in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Column holds non-numeric values (e.g. dates or addresses): keep it as-is.
        return column


# Convert every column that parses cleanly as numbers; leave the others untouched.
df = df.apply(_to_numeric_if_possible)

In [73]:
# Display the per-column dtypes of df to check the result of the numeric conversion.
df.dtypes

incident_date        object
state                object
city_or_country      object
address              object
victims_killed        int64
victims_injured       int64
suspects_killed       int64
suspects_injured      int64
suspects_arrested     int64
dtype: object

In [74]:
# Parse the textual incident dates into pandas datetime values.
parsed_dates = df['incident_date'].pipe(pd.to_datetime)
df['incident_date'] = parsed_dates

In [76]:
# Preview the first five rows after the type conversions.
df.head(n=5)

Unnamed: 0,incident_date,state,city_or_country,address,victims_killed,victims_injured,suspects_killed,suspects_injured,suspects_arrested
0,2023-12-01,Nevada,Las Vegas,E Charleston Blvd and N Honolulu St,1,4,0,0,0
1,2023-11-29,Illinois,Chicago,3600 block of West Flournoy St,1,3,0,0,0
2,2023-11-29,North Carolina,Lexington,7000 block of NC-8,0,5,0,0,0
3,2023-11-26,North Carolina,Winston Salem (Winston-salem),3533 N Glenn Ave,0,4,0,1,1
4,2023-11-25,California,Fontana,15500 block of Eastwind Ave,3,1,0,0,1


In [77]:
# Persist the cleaned frame to CSV in the working directory.
# The row index is written too, since `index` is left at its default.
output_csv = 'mass_shootings_usa.csv'
df.to_csv(output_csv)