In [1]:
# Import Libraries
import datetime
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Create DataFrame from CSV file
# The raw-data folder lives in one constant so the notebook can be pointed at a
# different copy of the export by editing a single line.
DATA_DIR = Path('/Users/noah/Desktop/Border Crossing Project/CSV')
df = pd.read_csv(DATA_DIR / 'Border_Crossing_Entry_Data.csv')

In [3]:
# Preview the first five rows of the freshly loaded data
df.head(n=5)

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,Latitude,Longitude,Point
0,International Falls,Minnesota,3604,US-Canada Border,Oct 2023,Trucks,1372,48.608,-93.401,POINT (-93.401355 48.6078)
1,Sumas,Washington,3009,US-Canada Border,Oct 2023,Train Passengers,57,49.002,-122.265,POINT (-122.264805 49.002388)
2,Naco,Arizona,2603,US-Mexico Border,Sep 2023,Trucks,270,31.334,-109.948,POINT (-109.948413 31.334084)
3,Wildhorse,Montana,3323,US-Canada Border,Sep 2023,Trucks,42,48.999,-110.215,POINT (-110.215083 48.999361)
4,Calais,Maine,115,US-Canada Border,Sep 2023,Trains,15,45.189,-67.275,POINT (-67.275381 45.188548)


In [4]:
# Understand how many rows and columns
# .shape is a (row_count, column_count) tuple.
df.shape

(388823, 10)

In [5]:
# Count the missing values in each column
# (isnull is pandas' alias for isna)
df.isnull().sum()

Port Name    0
State        0
Port Code    0
Border       0
Date         0
Measure      0
Value        0
Latitude     1
Longitude    1
Point        1
dtype: int64

Having inspected the data, some cleaning is needed: drop the unneeded `Point` column, remove rows with missing coordinates, and convert `Date` to a monthly period.

In [6]:
# Drop unneeded columns
# Pass the label list itself to drop(): indexing df with it first only built a
# throwaway sub-frame whose column names were then iterated.
column_to_drop = ['Point']
df = df.drop(columns=column_to_drop)

In [7]:
# Preview the first five rows to confirm the 'Point' column is gone
df.head(n=5)

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,Latitude,Longitude
0,International Falls,Minnesota,3604,US-Canada Border,Oct 2023,Trucks,1372,48.608,-93.401
1,Sumas,Washington,3009,US-Canada Border,Oct 2023,Train Passengers,57,49.002,-122.265
2,Naco,Arizona,2603,US-Mexico Border,Sep 2023,Trucks,270,31.334,-109.948
3,Wildhorse,Montana,3323,US-Canada Border,Sep 2023,Trucks,42,48.999,-110.215
4,Calais,Maine,115,US-Canada Border,Sep 2023,Trains,15,45.189,-67.275


In [8]:
# Remove every row that lacks a latitude or a longitude
df = df.dropna(
    axis=0,
    how='any',
    subset=['Latitude', 'Longitude'],
)

In [9]:
# Check datatypes
# Lists the dtype pandas assigned to each column; 'Date' is still stored as
# strings (dtype `object`) at this point.
df.dtypes

Port Name     object
State         object
Port Code      int64
Border        object
Date          object
Measure       object
Value          int64
Latitude     float64
Longitude    float64
dtype: object

In [10]:
# Parse the 'Date' strings (e.g. 'Oct 2023') into timestamps
# The layout is spelled out (abbreviated month name + 4-digit year) so pandas
# does not have to guess it from the data; a value in any other layout raises
# a ValueError instead of being parsed by inference.
df['Datetime'] = pd.to_datetime(df['Date'], format='%b %Y')

In [11]:
# Keep only year and month, since these are monthly measures
# Built from the raw 'Date' strings rather than from 'Datetime' itself, so this
# cell can be re-run safely: once 'Datetime' holds periods it no longer offers
# .dt.to_period, and a second run of a self-referencing assignment would fail.
df['Datetime'] = pd.to_datetime(df['Date'], format='%b %Y').dt.to_period('M')

In [12]:
# Preview the first five rows to check the new 'Datetime' column
df.head(n=5)

Unnamed: 0,Port Name,State,Port Code,Border,Date,Measure,Value,Latitude,Longitude,Datetime
0,International Falls,Minnesota,3604,US-Canada Border,Oct 2023,Trucks,1372,48.608,-93.401,2023-10
1,Sumas,Washington,3009,US-Canada Border,Oct 2023,Train Passengers,57,49.002,-122.265,2023-10
2,Naco,Arizona,2603,US-Mexico Border,Sep 2023,Trucks,270,31.334,-109.948,2023-09
3,Wildhorse,Montana,3323,US-Canada Border,Sep 2023,Trucks,42,48.999,-110.215,2023-09
4,Calais,Maine,115,US-Canada Border,Sep 2023,Trains,15,45.189,-67.275,2023-09


Now that the dataframe is cleaned, it is ready to be saved and used for Exploratory Data Analysis. 

In [13]:
# Write the cleaned frame to a CSV in the working directory, leaving out the row index
df.to_csv(path_or_buf='cleaned_border_data.csv', index=False)