In [1]:
import pandas as pd
import datetime as dt
from matplotlib import pyplot as plt

# Explain the meaning of each FEATURE

- dt : Time of data calculation (Unix timestamp, UTC)
- dt_iso : Date and time of the observation as text (UTC)
- timezone : Shift in seconds from UTC
- city_name : city name
- lat :latitude
- lon :Longitude
- temp :temperature
- visibility :visibility
- dew_point :dew point 
- feels_like :Temperature as perceived by humans. Units: Kelvin (default), Celsius (metric), Fahrenheit (imperial)
- temp_min :minimum_temperature
- temp_max :maximum temperature
- pressure :pressure
- sea_level :Atmospheric pressure at sea level, hPa
- grnd_level :Atmospheric pressure at ground level, hPa
- humidity :humidity 
- wind_speed :wind speed
- wind_deg :wind direction in degrees (meteorology)
- wind_gust :wind gust speed
- rain_1h :Rainfall for the last 1 hour
- rain_3h :amount of rain in the last 3 hours
- snow_1h :amount of snow in the last 1 hour
- snow_3h :amount of snow in the last 3 hours
- clouds_all :cloudiness, %
- weather_id :Weather condition identifier
- weather_main :Main weather conditions
- weather_description :Weather description
- weather_icon :Weather icon ID

### Summary of data quality plan:
| Variable Names                     | Data Quality Issue            | Handling Strategy              |
|------------------------------------|-------------------------------|--------------------------------|
| dt_iso  | Presence of data other than 2018 | Delete data other than 2018 |
| dt_iso  | Duplicate time | Delete the repeated rows, keeping the first row for each timestamp |
| dt_iso  | Name not clear enough | Change name to day_hour |
| snow_1h         | high %missing value | keep the column (key feature); fill missing values with 0 |
| city_name     | constant column | delete | 
| lat              | constant column | delete |  
| lon   | constant column | delete | 
| sea_level | constant column | delete | 
| grnd_level                                            | constant column | delete | 
| rain_3h                                            | constant column | delete | 
| snow_3h                                            | constant column | delete |  
| rain_1h                                            | high %missing | fill missing values with 0 | 
| snow_1h                                           | high %missing | fill missing values with 0 | 

In [2]:
# Load the hourly weather export into a DataFrame.
df = pd.read_csv("historyWeatherDublin2018.csv")

In [3]:
# Preview the first five rows of the freshly loaded data.
df.head(5)

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1483228800,2017-01-01 00:00:00 +0000 UTC,0,Custom location,53.345035,-6.267261,5.39,9999.0,4.35,1.78,...,,2.3,,,,75,501,Rain,moderate rain,10n
1,1483232400,2017-01-01 01:00:00 +0000 UTC,0,Custom location,53.345035,-6.267261,5.39,9999.0,4.35,2.28,...,,1.51,,,,75,501,Rain,moderate rain,10n
2,1483236000,2017-01-01 02:00:00 +0000 UTC,0,Custom location,53.345035,-6.267261,5.39,9999.0,4.35,2.28,...,,0.64,,,,75,500,Rain,light rain,10n
3,1483239600,2017-01-01 03:00:00 +0000 UTC,0,Custom location,53.345035,-6.267261,4.39,9999.0,3.36,0.04,...,,0.17,,,,75,500,Rain,light rain,10n
4,1483243200,2017-01-01 04:00:00 +0000 UTC,0,Custom location,53.345035,-6.267261,4.39,9999.0,2.42,0.04,...,,,,,,75,803,Clouds,broken clouds,04n


In [4]:
# List the column labels.
df.columns

Index(['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'temp',
       'visibility', 'dew_point', 'feels_like', 'temp_min', 'temp_max',
       'pressure', 'sea_level', 'grnd_level', 'humidity', 'wind_speed',
       'wind_deg', 'wind_gust', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h',
       'clouds_all', 'weather_id', 'weather_main', 'weather_description',
       'weather_icon'],
      dtype='object')

In [5]:
# (number of rows, number of columns) before any cleaning.
df.shape

(27186, 28)

In [6]:
# Show the dtype of every column.
df.dtypes

dt                       int64
dt_iso                  object
timezone                 int64
city_name               object
lat                    float64
lon                    float64
temp                   float64
visibility             float64
dew_point              float64
feels_like             float64
temp_min               float64
temp_max               float64
pressure                 int64
sea_level              float64
grnd_level             float64
humidity                 int64
wind_speed             float64
wind_deg                 int64
wind_gust              float64
rain_1h                float64
rain_3h                float64
snow_1h                float64
snow_3h                float64
clouds_all               int64
weather_id               int64
weather_main            object
weather_description     object
weather_icon            object
dtype: object

## Clean the column names (duplicate rows and columns are checked further below)

In [7]:
# Strip every space character out of the column labels, whether it is
# leading, trailing, or embedded in the name.
df.columns = df.columns.map(lambda label: label.replace(' ', ''))

# Display the labels to confirm that no spaces remain.
df.columns

Index(['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'temp',
       'visibility', 'dew_point', 'feels_like', 'temp_min', 'temp_max',
       'pressure', 'sea_level', 'grnd_level', 'humidity', 'wind_speed',
       'wind_deg', 'wind_gust', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h',
       'clouds_all', 'weather_id', 'weather_main', 'weather_description',
       'weather_icon'],
      dtype='object')

Only data for 2018 is required, so rows from other years are deleted.

### Convert time format

In [8]:
# Parse strings such as "2017-01-01 00:00:00 +0000 UTC" into timestamps.
# %z consumes the numeric offset and the trailing literal "UTC" is matched
# by the format string itself.
# The parsed values are timezone-aware; tz_localize(None) drops the timezone
# while keeping the wall-clock time, which yields the same timezone-naive
# column as formatting to text and parsing again, without the string round trip.
df['dt_iso'] = pd.to_datetime(
    df['dt_iso'], format='%Y-%m-%d %H:%M:%S %z UTC'
).dt.tz_localize(None)

In [9]:
# Re-inspect the first rows to see dt_iso after the conversion.
df.head(5)

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1483228800,2017-01-01 00:00:00,0,Custom location,53.345035,-6.267261,5.39,9999.0,4.35,1.78,...,,2.3,,,,75,501,Rain,moderate rain,10n
1,1483232400,2017-01-01 01:00:00,0,Custom location,53.345035,-6.267261,5.39,9999.0,4.35,2.28,...,,1.51,,,,75,501,Rain,moderate rain,10n
2,1483236000,2017-01-01 02:00:00,0,Custom location,53.345035,-6.267261,5.39,9999.0,4.35,2.28,...,,0.64,,,,75,500,Rain,light rain,10n
3,1483239600,2017-01-01 03:00:00,0,Custom location,53.345035,-6.267261,4.39,9999.0,3.36,0.04,...,,0.17,,,,75,500,Rain,light rain,10n
4,1483243200,2017-01-01 04:00:00,0,Custom location,53.345035,-6.267261,4.39,9999.0,2.42,0.04,...,,,,,,75,803,Clouds,broken clouds,04n


## Delete data except for 2018

In [10]:
# Keep only the rows whose timestamp falls in 2018.
# .copy() makes the result an independent frame, so the column assignments
# performed in later cells act on their own data rather than on a selection
# of the original frame.
df = df.loc[df['dt_iso'].dt.year == 2018].copy()

In [11]:
# Row/column count after restricting the data to 2018.
df.shape

(9060, 28)

In [12]:
# The first rows should now start in 2018.
df.head(5)

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
9014,1514764800,2018-01-01 00:00:00,0,Custom location,53.345035,-6.267261,4.39,9999.0,2.26,-1.93,...,18.0,,,,,40,520,Rain,light intensity shower rain,09n
9015,1514768400,2018-01-01 01:00:00,0,Custom location,53.345035,-6.267261,4.39,9999.0,2.26,-1.68,...,,,,,,75,520,Rain,light intensity shower rain,09n
9016,1514772000,2018-01-01 02:00:00,0,Custom location,53.345035,-6.267261,5.39,9999.0,2.4,-0.47,...,18.5,,,,,40,802,Clouds,scattered clouds,03n
9017,1514775600,2018-01-01 03:00:00,0,Custom location,53.345035,-6.267261,5.39,9999.0,2.4,-0.47,...,,,,,,40,802,Clouds,scattered clouds,03n
9018,1514779200,2018-01-01 04:00:00,0,Custom location,53.345035,-6.267261,5.39,9999.0,2.4,-0.33,...,,,,,,40,802,Clouds,scattered clouds,03n


## Rename

In [13]:
# Give the timestamp column a more descriptive label.
df = df.rename({'dt_iso': 'day_hour'}, axis='columns')

## Check for constant columns

In [14]:
# Count the distinct values in every column (not only categorical ones);
# a count of 1 marks a constant column. NaN is counted as a value
# (dropna=False), so a column that is entirely NaN also reports 1.
df_columns = df.columns
# Plain list of the column names.
# NOTE(review): not referenced again in the cells visible here.
features_card = df_columns.tolist()

row_template = '{0:35}  {1}'
print(row_template.format("Feature", "Unique Values"))
print(row_template.format("-------", "--------------- \n"))

for column in df_columns:
    distinct_count = df[column].nunique(dropna=False)
    print(row_template.format(column, str(distinct_count)))

Feature                              Unique Values
-------                              --------------- 

dt                                   8760
day_hour                             8760
timezone                             2
city_name                            1
lat                                  1
lon                                  1
temp                                 639
visibility                           36
dew_point                            1109
feels_like                           1507
temp_min                             389
temp_max                             529
pressure                             63
sea_level                            1
grnd_level                           1
humidity                             73
wind_speed                           89
wind_deg                             165
wind_gust                            69
rain_1h                              196
rain_3h                              1
snow_1h                              49
snow_3h 

From the above results we can see that city_name, lat, lon, sea_level, grnd_level, rain_3h and snow_3h each contain a single value, i.e. they are constant columns, so they are deleted.

In [15]:
# Remove the columns that hold a single value and therefore carry no information.
constant_columns = [
    'city_name',
    'lat',
    'lon',
    'sea_level',
    'grnd_level',
    'rain_3h',
    'snow_3h',
]
df = df.drop(columns=constant_columns)

In [16]:
# Confirm the constant columns no longer appear.
df.head(5)

Unnamed: 0,dt,day_hour,timezone,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,...,wind_speed,wind_deg,wind_gust,rain_1h,snow_1h,clouds_all,weather_id,weather_main,weather_description,weather_icon
9014,1514764800,2018-01-01 00:00:00,0,4.39,9999.0,2.26,-1.93,4.36,4.56,990,...,12.9,240,18.0,,,40,520,Rain,light intensity shower rain,09n
9015,1514768400,2018-01-01 01:00:00,0,4.39,9999.0,2.26,-1.68,4.36,4.56,990,...,11.8,240,,,,75,520,Rain,light intensity shower rain,09n
9016,1514772000,2018-01-01 02:00:00,0,5.39,9999.0,2.4,-0.47,4.53,5.41,990,...,12.4,240,18.5,,,40,802,Clouds,scattered clouds,03n
9017,1514775600,2018-01-01 03:00:00,0,5.39,9999.0,2.4,-0.47,4.53,5.41,990,...,12.4,240,,,,40,802,Clouds,scattered clouds,03n
9018,1514779200,2018-01-01 04:00:00,0,5.39,9999.0,2.4,-0.33,5.36,5.56,989,...,11.8,240,,,,40,802,Clouds,scattered clouds,03n


In [17]:
# Check for fully duplicated rows.

# Rows that repeat an earlier row; the first occurrence is not counted.
repeat_count = df.duplicated(keep='first').sum()
print('Number of duplicate (excluding first) rows in the table is: ', repeat_count)

# keep=False flags every member of a duplicate group, the first occurrence included.
in_duplicate_group = df.duplicated(keep=False)
print('Number of duplicate rows (including first) in the table is:', df.loc[in_duplicate_group].shape[0])

Number of duplicate (excluding first) rows in the table is:  0
Number of duplicate rows (including first) in the table is: 0


In [18]:
# Check for duplicated columns: transpose so that each column becomes a row,
# then reuse the row-wise duplicated() check.
dfT = df.transpose()
column_is_repeat = dfT.duplicated()
column_in_duplicate_group = dfT.duplicated(keep=False)
print("Number of duplicate (excluding first) columns in the table is: ", column_is_repeat.sum())
print("Number of duplicate (including first) columns in the table is: ", dfT.loc[column_in_duplicate_group].shape[0])

# duplicated() yields one boolean per original column; True marks a column
# that repeats an earlier one.
print("Duplpicated columns will be True: ", column_is_repeat)

Number of duplicate (excluding first) columns in the table is:  0
Number of duplicate (including first) columns in the table is:  0
Duplpicated columns will be True:  dt                     False
day_hour               False
timezone               False
temp                   False
visibility             False
dew_point              False
feels_like             False
temp_min               False
temp_max               False
pressure               False
humidity               False
wind_speed             False
wind_deg               False
wind_gust              False
rain_1h                False
snow_1h                False
clouds_all             False
weather_id             False
weather_main           False
weather_description    False
weather_icon           False
dtype: bool


In [19]:
# Preview the first rows again before checking for missing values.
df.head(5)

Unnamed: 0,dt,day_hour,timezone,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,...,wind_speed,wind_deg,wind_gust,rain_1h,snow_1h,clouds_all,weather_id,weather_main,weather_description,weather_icon
9014,1514764800,2018-01-01 00:00:00,0,4.39,9999.0,2.26,-1.93,4.36,4.56,990,...,12.9,240,18.0,,,40,520,Rain,light intensity shower rain,09n
9015,1514768400,2018-01-01 01:00:00,0,4.39,9999.0,2.26,-1.68,4.36,4.56,990,...,11.8,240,,,,75,520,Rain,light intensity shower rain,09n
9016,1514772000,2018-01-01 02:00:00,0,5.39,9999.0,2.4,-0.47,4.53,5.41,990,...,12.4,240,18.5,,,40,802,Clouds,scattered clouds,03n
9017,1514775600,2018-01-01 03:00:00,0,5.39,9999.0,2.4,-0.47,4.53,5.41,990,...,12.4,240,,,,40,802,Clouds,scattered clouds,03n
9018,1514779200,2018-01-01 04:00:00,0,5.39,9999.0,2.4,-0.33,5.36,5.56,989,...,11.8,240,,,,40,802,Clouds,scattered clouds,03n


## Check for missing values

In [20]:
# Percentage of missing values per column: null count divided by the number
# of rows, times 100.
cols = df.columns
columns_perc_missing = df[cols].isna().sum().div(df.shape[0]).mul(100)

# Present the percentages as a one-column DataFrame and display it.
df_perc_missing = columns_perc_missing.to_frame(name='%missing')
df_perc_missing

Unnamed: 0,%missing
dt,0.0
day_hour,0.0
timezone,0.0
temp,0.0
visibility,0.441501
dew_point,0.0
feels_like,0.0
temp_min,0.0
temp_max,0.0
pressure,0.0


From the above results it can be observed that the %missing value of snow_1h is relatively high, but this is a key feature of the analysis results and is therefore not removed.

In [21]:
# Rows whose day_hour repeats the timestamp of an earlier row
# (the first row for each timestamp is not included).
dupldate = df.loc[df.duplicated(subset='day_hour')]

In [22]:
# Display the rows that share a timestamp with an earlier row.
dupldate

Unnamed: 0,dt,day_hour,timezone,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,...,wind_speed,wind_deg,wind_gust,rain_1h,snow_1h,clouds_all,weather_id,weather_main,weather_description,weather_icon
9092,1515042000,2018-01-04 05:00:00,0,4.39,5000.0,3.36,0.04,4.36,4.56,988,...,6.2,120,,1.03,,75,501,Rain,moderate rain,10n
9094,1515045600,2018-01-04 06:00:00,0,5.39,6000.0,4.35,0.16,5.36,5.56,985,...,9.8,140,,0.59,,75,311,Drizzle,rain and drizzle,09n
9216,1515481200,2018-01-09 07:00:00,0,5.39,6000.0,4.35,0.62,5.36,5.56,1003,...,8.2,160,13.4,0.54,,75,300,Drizzle,light intensity drizzle,09n
9218,1515484800,2018-01-09 08:00:00,0,5.39,8000.0,4.35,-0.33,5.36,5.56,1002,...,11.8,140,,0.26,,75,500,Rain,light rain,10n
9220,1515488400,2018-01-09 09:00:00,0,5.39,5000.0,4.35,0.29,5.36,5.56,1002,...,9.3,140,,0.35,,75,500,Rain,light rain,10d
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17865,1545562800,2018-12-23 11:00:00,0,7.44,8000.0,7.29,5.72,7.05,7.56,1016,...,2.6,60,,0.76,,75,500,Rain,light rain,10d
17867,1545566400,2018-12-23 12:00:00,0,7.58,9999.0,7.43,6.28,7.36,7.96,1017,...,2.1,30,,1.02,,75,501,Rain,moderate rain,10d
17869,1545570000,2018-12-23 13:00:00,0,7.58,7000.0,7.43,5.88,7.36,7.96,1017,...,2.6,350,,1.02,,75,501,Rain,moderate rain,10d
17871,1545573600,2018-12-23 14:00:00,0,7.48,9999.0,7.33,5.42,6.99,7.96,1017,...,3.1,350,,0.25,,75,300,Drizzle,light intensity drizzle,09d


As the results above show, 300 rows repeat the timestamp of an earlier row, so these duplicates need to be removed.

In [23]:
# For every repeated timestamp keep the first row and discard the later ones.
# .copy() makes the result an independent frame for the column assignments
# that follow.
df = df.drop_duplicates(subset='day_hour', keep='first').copy()

In [24]:
# Recompute the repeated-timestamp selection; it should be empty once the
# duplicates have been removed.
dupldate = df.loc[df.duplicated(subset='day_hour')]

In [25]:
# Display the re-check result; no rows are expected.
dupldate

Unnamed: 0,dt,day_hour,timezone,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,...,wind_speed,wind_deg,wind_gust,rain_1h,snow_1h,clouds_all,weather_id,weather_main,weather_description,weather_icon


In [26]:
# Treat a missing hourly rainfall reading as 0; other columns are left untouched.
df = df.fillna({'rain_1h': 0})

In [27]:
# Treat a missing hourly snowfall reading as 0; other columns are left untouched.
df = df.fillna({'snow_1h': 0})

In [28]:
# Final (rows, columns) of the cleaned frame.
df.shape

(8760, 21)

In [29]:
# Write the cleaned frame to disk. The row index is still written as the
# first column; index_label=False only omits its field from the header row.
# NOTE(review): if the intent was to leave the index out of the file
# entirely, that is index=False. Confirm before changing, since readers of
# this file may rely on the current layout.
df.to_csv(path_or_buf='weatherData_cleaned.csv', index_label=False)