# Data Cleaning

## load libraries

In [1]:
import pandas as pd
import numpy as np
from janitor import clean_names

## load raw data

In [12]:
raw_regional_tourism = pd.read_csv('raw_data/regional_domestic_tourism.csv').clean_names()
raw_accomodation_occupancy = pd.read_csv('raw_data/scottish_accomodation_occupancy.csv').clean_names()
raw_activities = pd.read_csv('raw_data/tourism_day_visits_activities.csv').clean_names()
raw_demographics = pd.read_csv('raw_data/tourism_day_visits_demographics.csv').clean_names()
raw_location = pd.read_csv('raw_data/tourism_day_visits_location.csv').clean_names()
raw_transport = pd.read_csv('raw_data/tourism_day_visits_transport.csv').clean_names()
raw_international = pd.read_csv('raw_data/international-passenger-survey-scotland-2019.csv', encoding = 'unicode_escape').clean_names()

raw_council_data = pd.read_csv('raw_data/council_data.csv').clean_names()

## Cleaning data

### Council data (used to get region names)

In [13]:
raw_council_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ca                44 non-null     object 
 1   caname            44 non-null     object 
 2   cadateenacted     44 non-null     int64  
 3   cadatearchived    4 non-null      float64
 4   hscp              44 non-null     object 
 5   hscpname          44 non-null     object 
 6   hscpdateenacted   44 non-null     int64  
 7   hscpdatearchived  4 non-null      float64
 8   hb                44 non-null     object 
 9   hbname            44 non-null     object 
 10  hbdateenacted     44 non-null     int64  
 11  hbdatearchived    12 non-null     float64
 12  country           44 non-null     object 
dtypes: float64(3), int64(3), object(7)
memory usage: 4.6+ KB


In [21]:
# select relevent columns
council_data = raw_council_data[['ca','caname']].copy()

# remove duplicates
council_data = council_data.drop_duplicates(keep=False)
council_data

Unnamed: 0,ca,caname
0,S12000005,Clackmannanshire
1,S12000006,Dumfries and Galloway
2,S12000008,East Ayrshire
3,S12000010,East Lothian
6,S12000013,Na h-Eileanan Siar
7,S12000014,Falkirk
8,S12000015,Fife
9,S12000017,Highland
12,S12000019,Midlothian
13,S12000020,Moray


In [22]:
# rename columns
council_data.rename(columns={'ca':'featurecode', 'caname':'region_name'}, inplace=True)
council_data

Unnamed: 0,featurecode,region_name
0,S12000005,Clackmannanshire
1,S12000006,Dumfries and Galloway
2,S12000008,East Ayrshire
3,S12000010,East Lothian
6,S12000013,Na h-Eileanan Siar
7,S12000014,Falkirk
8,S12000015,Fife
9,S12000017,Highland
12,S12000019,Midlothian
13,S12000020,Moray


### Regional data

In [6]:
raw_regional_tourism.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2673 entries, 0 to 2672
Data columns (total 7 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   featurecode                    2673 non-null   object
 1   datecode                       2673 non-null   object
 2   measurement                    2673 non-null   object
 3   units                          2673 non-null   object
 4   value                          2673 non-null   int64 
 5   region_of_residence            2673 non-null   object
 6   breakdown_of_domestic_tourism  2673 non-null   object
dtypes: int64(1), object(6)
memory usage: 146.3+ KB


In [7]:
raw_regional_tourism.isnull().any()

featurecode                      False
datecode                         False
measurement                      False
units                            False
value                            False
region_of_residence              False
breakdown_of_domestic_tourism    False
dtype: bool

In [9]:
raw_regional_tourism.describe(include='all')

Unnamed: 0,featurecode,datecode,measurement,units,value,region_of_residence,breakdown_of_domestic_tourism
count,2673,2673,2673,2673,2673.0,2673,2673
unique,33,9,1,3,,3,3
top,S12000019,2011-2013,Count,Thousand Visits,,Scotland,Nights
freq,81,297,2673,891,,891,891
mean,,,,,761.897493,,
std,,,,,3147.268188,,
min,,,,,0.0,,
25%,,,,,28.0,,
50%,,,,,106.0,,
75%,,,,,433.0,,


In [10]:
raw_regional_tourism.head(10)

Unnamed: 0,featurecode,datecode,measurement,units,value,region_of_residence,breakdown_of_domestic_tourism
0,S12000039,2016-2018,Count,million pounds (GBP),8,England,Expenditure
1,S12000039,2015-2017,Count,Thousand Nights,140,All of GB,Nights
2,S12000039,2015-2017,Count,million pounds (GBP),8,England,Expenditure
3,S12000039,2017-2019,Count,Thousand Nights,76,England,Nights
4,S12000039,2009-2011,Count,Thousand Visits,68,Scotland,Visits
5,S12000039,2016-2018,Count,Thousand Nights,59,Scotland,Nights
6,S12000039,2016-2018,Count,million pounds (GBP),4,Scotland,Expenditure
7,S12000039,2013-2015,Count,Thousand Visits,32,Scotland,Visits
8,S12000039,2012-2014,Count,million pounds (GBP),6,England,Expenditure
9,S12000039,2015-2017,Count,Thousand Visits,28,England,Visits


In [11]:
## featurecodes ~ region/region_name?
raw_regional_tourism.featurecode.unique()

array(['S12000039', 'S12000040', 'S92000003', 'S12000005', 'S12000006',
       'S12000008', 'S12000010', 'S12000011', 'S12000013', 'S12000014',
       'S12000017', 'S12000018', 'S12000019', 'S12000020', 'S12000021',
       'S12000023', 'S12000026', 'S12000027', 'S12000028', 'S12000029',
       'S12000030', 'S12000033', 'S12000034', 'S12000035', 'S12000036',
       'S12000038', 'S12000041', 'S12000042', 'S12000045', 'S12000047',
       'S12000048', 'S12000049', 'S12000050'], dtype=object)

In [26]:
# merge council data to get featurecode name
raw_regional_tourism_updated = pd.merge(raw_regional_tourism, council_data, how='inner')
raw_regional_tourism_updated

Unnamed: 0,featurecode,datecode,measurement,units,value,region_of_residence,breakdown_of_domestic_tourism,region_name
0,S12000040,2016-2018,Count,Thousand Nights,197,All of GB,Nights,West Lothian
1,S12000040,2015-2017,Count,Thousand Nights,115,England,Nights,West Lothian
2,S12000040,2015-2017,Count,Thousand Visits,48,England,Visits,West Lothian
3,S12000040,2015-2017,Count,million pounds (GBP),2,Scotland,Expenditure,West Lothian
4,S12000040,2013-2015,Count,Thousand Visits,135,All of GB,Visits,West Lothian
...,...,...,...,...,...,...,...,...
1939,S12000050,2012-2014,Count,million pounds (GBP),18,England,Expenditure,North Lanarkshire
1940,S12000050,2009-2011,Count,Thousand Visits,113,England,Visits,North Lanarkshire
1941,S12000050,2009-2011,Count,Thousand Nights,640,All of GB,Nights,North Lanarkshire
1942,S12000050,2015-2017,Count,million pounds (GBP),12,England,Expenditure,North Lanarkshire


In [27]:
## check columns
raw_regional_tourism_updated.units.unique()

array(['Thousand Nights', 'Thousand Visits', 'million pounds (GBP)'],
      dtype=object)

In [28]:
raw_regional_tourism_updated.measurement.unique()

array(['Count'], dtype=object)

- Drop unwanted column `measurement`
- Rename `datecode` to `years`

In [32]:
# drop unwated columns / values
clean_regional_tourism = raw_regional_tourism_updated.copy()

clean_regional_tourism.drop(columns='measurement', inplace=True)
clean_regional_tourism.rename(columns={'datecode':'years'}, inplace=True)

clean_regional_tourism

Unnamed: 0,featurecode,years,units,value,region_of_residence,breakdown_of_domestic_tourism,region_name
0,S12000040,2016-2018,Thousand Nights,197,All of GB,Nights,West Lothian
1,S12000040,2015-2017,Thousand Nights,115,England,Nights,West Lothian
2,S12000040,2015-2017,Thousand Visits,48,England,Visits,West Lothian
3,S12000040,2015-2017,million pounds (GBP),2,Scotland,Expenditure,West Lothian
4,S12000040,2013-2015,Thousand Visits,135,All of GB,Visits,West Lothian
...,...,...,...,...,...,...,...
1939,S12000050,2012-2014,million pounds (GBP),18,England,Expenditure,North Lanarkshire
1940,S12000050,2009-2011,Thousand Visits,113,England,Visits,North Lanarkshire
1941,S12000050,2009-2011,Thousand Nights,640,All of GB,Nights,North Lanarkshire
1942,S12000050,2015-2017,million pounds (GBP),12,England,Expenditure,North Lanarkshire


### Occupancy data

In [33]:
raw_accomodation_occupancy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 711 entries, 0 to 710
Data columns (total 9 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   featurecode                       711 non-null    object 
 1   datecode                          711 non-null    int64  
 2   measurement                       711 non-null    object 
 3   units                             711 non-null    object 
 4   value                             711 non-null    float64
 5   accommodation_type_and_occupancy  711 non-null    object 
 6   weekday_weekend                   711 non-null    object 
 7   size_of_accommodation             711 non-null    object 
 8   location                          711 non-null    object 
dtypes: float64(1), int64(1), object(7)
memory usage: 50.1+ KB


In [34]:
raw_accomodation_occupancy.isnull().any()

featurecode                         False
datecode                            False
measurement                         False
units                               False
value                               False
accommodation_type_and_occupancy    False
weekday_weekend                     False
size_of_accommodation               False
location                            False
dtype: bool

In [35]:
raw_accomodation_occupancy.head(10)

Unnamed: 0,featurecode,datecode,measurement,units,value,accommodation_type_and_occupancy,weekday_weekend,size_of_accommodation,location
0,S92000003,2012,Percent,Percentage,52.32,Guest House/B&B - Room Occupancy,All,All,Accessible Small Towns
1,S92000003,2015,Percent,Percentage,49.18,Guest House/B&B - Room Occupancy,All,All,Large Urban Areas
2,S92000003,2018,Percent,Percentage,63.57,Guest House/B&B - Room Occupancy,All,All,Large Urban Areas
3,S92000003,2013,Percent,Percentage,53.35,Guest House/B&B - Room Occupancy,All,All,Accessible Rural
4,S92000003,2018,Percent,Percentage,40.45,Guest House/B&B - Room Occupancy,All,All,Accessible Rural
5,S92000003,2019,Percent,Percentage,43.93,Guest House/B&B - Room Occupancy,All,All,Accessible Small Towns
6,S92000003,2016,Percent,Percentage,38.76,Guest House/B&B - Room Occupancy,All,Rooms: 01-03,All
7,S92000003,2017,Percent,Percentage,60.24,Guest House/B&B - Room Occupancy,All,All,Other Urban Areas
8,S92000003,2016,Percent,Percentage,45.44,Guest House/B&B - Room Occupancy,All,All,Other Urban Areas
9,S92000003,2015,Percent,Percentage,46.54,Guest House/B&B - Room Occupancy,All,All,Remote Rural
