In [None]:
"""
This file is to clean and conduct exploratory data analysis on 
 a US Customs Imports Containers from the year 2018 data set called "container"

Dataset includes information on US Customs Imports Containers
such as weight, size, container number etc. 
"""

In [1]:
# importing modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# importing the data's filepath
file_path_containers_0 = 'C:/Users/Public/project_1/bronze layer/containers/ams__container_2018__202001290000_part_0.csv'
file_path_containers_1 = 'C:/Users/Public/project_1/bronze layer/containers/ams__container_2018__202001290000_part_1.csv'
file_path_containers_2 = 'C:/Users/Public/project_1/bronze layer/containers/ams__container_2018__202001290000_part_2.csv'
file_path_containers_3 = 'C:/Users/Public/project_1/bronze layer/containers/ams__container_2018__202001290000_part_3.csv'

# creating the dataframes
df_containers_0 = pd.read_csv(file_path_containers_0)
df_containers_1 = pd.read_csv(file_path_containers_1)
df_containers_2 = pd.read_csv(file_path_containers_2)
df_containers_3 = pd.read_csv(file_path_containers_3)

# concatenating the dataframes into one dataframe
containers_2018 = pd.concat([df_containers_0, df_containers_1, 
                            df_containers_2, df_containers_3], axis=0)


In [3]:
# creating a log changes file 
log_file_path = 'C:/Users/Public/project_1/log_file.txt'
now = pd.Timestamp.now()

# creating a log changes function
def log_changes(message):
    """ 
    Updates a log file with the "message" that corresponds to the changes made
    with the time stamp that it was ran 
    """

    with open(log_file_path, 'a') as f:
        message = now.strftime('%Y-%m-%d %H:%M:%S') +': ' + message + ' on the containers table \n'
        f.write(message)

In [4]:
# looking at the data types
containers_2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31512386 entries, 0 to 1512385
Data columns (total 11 columns):
 #   Column                      Dtype 
---  ------                      ----- 
 0   identifier                  int64 
 1   container_number            object
 2   seal_number_1               object
 3   seal_number_2               object
 4   equipment_description_code  object
 5   container_length            int64 
 6   container_height            int64 
 7   container_width             int64 
 8   container_type              object
 9   load_status                 object
 10  type_of_service             object
dtypes: int64(4), object(7)
memory usage: 2.8+ GB


In [5]:
# counting the null values
containers_2018.isnull().sum()

identifier                           0
container_number                   859
seal_number_1                  1101818
seal_number_2                 28905377
equipment_description_code     2351773
container_length                     0
container_height                     0
container_width                      0
container_type                 3741397
load_status                      55837
type_of_service                3501030
dtype: int64

In [6]:
# getting a closer look at the fields
containers_2018.head()

Unnamed: 0,identifier,container_number,seal_number_1,seal_number_2,equipment_description_code,container_length,container_height,container_width,container_type,load_status,type_of_service
0,201801010,FCIU9250931,EMCCES9186,,Container,4000,906,800,,Loaded,Container Station
1,201801011,EITU1595313,EMCCES9076,,Container,4000,906,802,4EB0,Loaded,Container Yard
2,201801012,FCIU9250931,EMCCES9186,,Container,4000,906,800,,Loaded,Container Station
3,201801013,BMOU5389685,EMCCES8776,,Container,0,0,0,,Loaded,
4,201801014,EMCU5289450,EMCCES8446,,Container,4000,900,800,45R1,Loaded,Container Yard


In [7]:
# evaluating fields with the most nullls 
print(containers_2018['seal_number_1'].unique())
print(containers_2018['seal_number_2'].unique())
print(containers_2018['type_of_service'].unique())

['EMCCES9186' 'EMCCES9076' 'EMCCES8776' ... 'AS60179845' '2282279'
 'G5176944']
[nan '989593' 'F8865679' ... 'MX0545113' 'MX0545118' 'MX0533852']
['Container Station' 'Container Yard' nan 'Pier to Pier' 'House to House'
 'Pier to House' 'Break Bulk' 'House to Pier' 'Non Containerized'
 'Headload or Devanning' 'Roll on Roll Off' 'Mixed Delivery']


In [None]:
# checking if description code is usable 
containers_2018['equipment_description_code'].unique()

In [4]:
# dropping the columns that would not be useful for my use case 
# and logging the change
containers_2018.drop(['equipment_description_code','seal_number_1', 'seal_number_2', 'load_status', 
                        'type_of_service'], axis = 1 , inplace = True)
log_changes('dropping columns that are not useful')

In [10]:
# check if the dropped columns worked
containers_2018.head()

Unnamed: 0,identifier,container_number,container_length,container_height,container_width,container_type,load_status
0,201801010,FCIU9250931,4000,906,800,,Loaded
1,201801011,EITU1595313,4000,906,802,4EB0,Loaded
2,201801012,FCIU9250931,4000,906,800,,Loaded
3,201801013,BMOU5389685,0,0,0,,Loaded
4,201801014,EMCU5289450,4000,900,800,45R1,Loaded


In [11]:
# getting the a statistical summary
containers_2018.describe()

Unnamed: 0,identifier,container_length,container_height,container_width
count,31512390.0,31512390.0,31512390.0,31512390.0
mean,1758320000000.0,3164.939,766.2627,709.3328
std,841757400000.0,1378.522,278.3135,254.2615
min,201801000.0,0.0,0.0,0.0
25%,2018022000000.0,2000.0,806.0,800.0
50%,2018061000000.0,4000.0,900.0,800.0
75%,2018092000000.0,4000.0,900.0,800.0
max,20180100000000.0,8600.0,2400.0,1306.0


In [12]:
# looking how many of them are zero since the minimum displays zero
print(len(containers_2018[containers_2018['container_width'] == 0 ]))
print(len(containers_2018[containers_2018['container_length'] == 0 ]))
print(len(containers_2018[containers_2018['container_height'] == 0 ]))

3587725
3497132
3587725


In [13]:
# comparing to how many null values after previous changes
containers_2018.isnull().sum()

identifier                0
container_number        859
container_length          0
container_height          0
container_width           0
container_type      3741397
load_status           55837
dtype: int64

In [None]:
# noticing that a lot of the null values include zero values for container dimensions
containers_2018[containers_2018['container_type'].isnull()]

In [5]:
# dropping the null values from container types since they are not useful
container_type_null_index = containers_2018[containers_2018['container_type'].isnull()].index
containers_2018.drop(container_type_null_index, inplace = True)
log_changes('Dropping empty container type values')

In [6]:
# noticing there are duplicates pertaining to the NC container 
containers_2018[containers_2018['container_number']== 'NC']

Unnamed: 0,identifier,container_number,container_length,container_height,container_width,container_type
139138,2018010325912,NC,4000,900,800,45R1
214743,2018010376850,NC,2000,806,800,22P1
225403,2018010384030,NC,2000,806,800,22P1
264009,20180103109140,NC,4000,900,800,4500
264192,20180103109178,NC,4000,900,800,4500
...,...,...,...,...,...,...
1365261,2018122993799,NC,500,900,800,5599
1365262,2018122993800,NC,500,900,800,5599
1365263,2018122993801,NC,500,900,800,5599
1365264,2018122993802,NC,500,900,800,5599


In [13]:
# dropping duplicates that share NC container and logging this change
NC_index = containers_2018[containers_2018['container_number']== 'NC'].index
containers_2018.drop(NC_index[1:], inplace= True)
log_changes('Dropping duplicates from the container number column')

In [15]:
# dropping duplicates with NC NC1 containers
NC1_index = containers_2018[containers_2018['container_number']== 'NC NC1'].index
containers_2018.drop(NC1_index[1:], inplace= True)
log_changes('Dropping duplicates from the container number column')

In [16]:
# dropping duplicates with loose cargo since this refers to cargo that didn't hold any goods
loose_index = containers_2018[containers_2018['container_number']== 'LOOSE CARGO'].index
containers_2018.drop(loose_index[1:], inplace= True)
log_changes('Dropping duplicates from the container number column')

In [7]:
# again checking for nulls now that i dropped duplicates
containers_2018.isnull().sum()

identifier          0
container_number    1
container_length    0
container_height    0
container_width     0
container_type      0
dtype: int64

In [17]:
# dropping the one null value thats left 
container_number_null_index = containers_2018[containers_2018['container_number'].isnull()].index
containers_2018.drop(container_number_null_index, inplace= True)
log_changes('Dropping empty container number values')

In [18]:
# since the identifier is a primary index i am dropping duplicates on that
containers_2018.drop_duplicates(subset = ['identifier'], inplace = True)
log_changes('Dropping duplicates')

In [10]:
# verifying there are no more null values
containers_2018.isnull().sum()

identifier          0
container_number    0
container_length    0
container_height    0
container_width     0
container_type      0
dtype: int64

In [19]:
# exporting the cleaned dataframe
containers_2018.to_csv('containers_cleaned.csv', index= False, sep= '|')

# saving the changes into my log file 
log_changes('exporting cleaned container data to a csv file')