In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_parquet('../data/raw/weather_data/weather_raw.parquet')

In [3]:
print(f"Number of instances: {len(df)}")
print(f"Number of features : {len(df.columns)}")

Number of instances: 365
Number of features : 28


In [4]:
column_names_list = df.columns.tolist()
print(column_names_list)

['STATION', 'DATE', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'NAME', 'TEMP', 'TEMP_ATTRIBUTES', 'DEWP', 'DEWP_ATTRIBUTES', 'SLP', 'SLP_ATTRIBUTES', 'STP', 'STP_ATTRIBUTES', 'VISIB', 'VISIB_ATTRIBUTES', 'WDSP', 'WDSP_ATTRIBUTES', 'MXSPD', 'GUST', 'MAX', 'MAX_ATTRIBUTES', 'MIN', 'MIN_ATTRIBUTES', 'PRCP', 'PRCP_ATTRIBUTES', 'SNDP', 'FRSHTT']


In [5]:
df.tail(5)

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,TEMP_ATTRIBUTES,DEWP,DEWP_ATTRIBUTES,...,MXSPD,GUST,MAX,MAX_ATTRIBUTES,MIN,MIN_ATTRIBUTES,PRCP,PRCP_ATTRIBUTES,SNDP,FRSHTT
360,72364099999,2023-12-27,31.880444,-106.70325,1253.58,DONA ANA CO INTERNATIONAL JETPORT ARPT SANTA T...,39.6,24,10.4,24,...,8.9,999.9,53.6,*,28.4,*,0.0,I,999.9,0
361,72364099999,2023-12-28,31.880444,-106.70325,1253.58,DONA ANA CO INTERNATIONAL JETPORT ARPT SANTA T...,41.5,24,11.1,24,...,7.0,999.9,55.4,*,26.6,*,0.0,I,999.9,0
362,72364099999,2023-12-29,31.880444,-106.70325,1253.58,DONA ANA CO INTERNATIONAL JETPORT ARPT SANTA T...,40.7,24,14.5,24,...,7.0,999.9,57.2,*,26.6,*,0.0,I,999.9,0
363,72364099999,2023-12-30,31.880444,-106.70325,1253.58,DONA ANA CO INTERNATIONAL JETPORT ARPT SANTA T...,39.7,24,14.6,24,...,5.1,999.9,55.4,*,26.6,*,0.0,I,999.9,0
364,72364099999,2023-12-31,31.880444,-106.70325,1253.58,DONA ANA CO INTERNATIONAL JETPORT ARPT SANTA T...,43.0,24,12.4,24,...,15.0,20.0,60.8,*,24.8,*,0.0,I,999.9,0


STATION: The unique identifier for the weather station where the data was collected.
DATE: The date on which the observations were recorded (usually in the format YYYYMMDD).
LATITUDE: The latitude coordinate of the weather station.
LONGITUDE: The longitude coordinate of the weather station.
ELEVATION: The elevation of the weather station above sea level (measured in meters).
NAME: The name of the weather station.
TEMP: The temperature recorded at the station (measured in degrees Celsius or Fahrenheit).
TEMP_ATTRIBUTES: Attributes or qualifiers related to the temperature data, indicating any special conditions or adjustments.
DEWP: Dew point temperature (measured in degrees Celsius or Fahrenheit).
DEWP_ATTRIBUTES: Attributes or qualifiers related to the dew point temperature data.
SLP: Sea-level pressure (measured in millibars or inches of mercury).
SLP_ATTRIBUTES: Attributes or qualifiers related to the sea-level pressure data.
STP: Station pressure (measured in millibars or inches of mercury).
STP_ATTRIBUTES: Attributes or qualifiers related to the station pressure data.
VISIB: Visibility distance (measured in meters or miles).
VISIB_ATTRIBUTES: Attributes or qualifiers related to the visibility data.
WDSP: Wind speed (measured in meters per second or miles per hour).
WDSP_ATTRIBUTES: Attributes or qualifiers related to the wind speed data.
MXSPD: Maximum wind speed recorded during the observation period (measured in meters per second or miles per hour).
GUST: Wind gust speed (measured in meters per second or miles per hour).
MAX: Maximum temperature recorded during the observation period (measured in degrees Celsius or Fahrenheit).
MAX_ATTRIBUTES: Attributes or qualifiers related to the maximum temperature data.
MIN: Minimum temperature recorded during the observation period (measured in degrees Celsius or Fahrenheit).
MIN_ATTRIBUTES: Attributes or qualifiers related to the minimum temperature data.
PRCP: Precipitation amount (measured in millimeters or inches).
PRCP_ATTRIBUTES: Attributes or qualifiers related to the precipitation data.
SNDP: Snow depth (measured in centimeters or inches).
FRSHTT: Weather conditions (e.g., fog, rain, snow) represented by a coded value indicating various weather phenomena observed.


In [6]:
columns_to_remove = ['TEMP_ATTRIBUTES', 'DEWP_ATTRIBUTES', 'SLP_ATTRIBUTES', 
                      'STP_ATTRIBUTES', 'VISIB_ATTRIBUTES', 'WDSP_ATTRIBUTES', 
                      'MAX_ATTRIBUTES', 'MIN_ATTRIBUTES', 'PRCP_ATTRIBUTES']

df_cleaned = df.drop(columns=columns_to_remove, errors='ignore')
df_cleaned.head()


Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,STP,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,72364099999,2023-01-01,31.880444,-106.70325,1253.58,DONA ANA CO INTERNATIONAL JETPORT ARPT SANTA T...,51.7,38.6,9999.9,999.9,10.0,3.0,12.0,19.0,66.2,41.0,0.0,999.9,0
1,72364099999,2023-01-02,31.880444,-106.70325,1253.58,DONA ANA CO INTERNATIONAL JETPORT ARPT SANTA T...,48.8,34.7,9999.9,999.9,9.2,5.7,20.0,29.9,62.6,33.8,99.99,999.9,111000
2,72364099999,2023-01-03,31.880444,-106.70325,1253.58,DONA ANA CO INTERNATIONAL JETPORT ARPT SANTA T...,42.1,30.1,9999.9,999.9,10.0,6.8,22.9,31.1,53.6,33.8,0.0,999.9,0
3,72364099999,2023-01-04,31.880444,-106.70325,1253.58,DONA ANA CO INTERNATIONAL JETPORT ARPT SANTA T...,43.0,29.6,9999.9,999.9,10.0,3.6,13.0,19.0,57.2,32.0,0.0,999.9,0
4,72364099999,2023-01-05,31.880444,-106.70325,1253.58,DONA ANA CO INTERNATIONAL JETPORT ARPT SANTA T...,45.0,29.3,9999.9,999.9,10.0,0.0,999.9,999.9,60.8,30.2,0.0,999.9,0


In [7]:
df_cleaned.iloc[300]

STATION                                            72364099999
DATE                                                2023-10-28
LATITUDE                                             31.880444
LONGITUDE                                           -106.70325
ELEVATION                                              1253.58
NAME         DONA ANA CO INTERNATIONAL JETPORT ARPT SANTA T...
TEMP                                                      66.4
DEWP                                                      37.0
SLP                                                     9999.9
STP                                                      999.9
VISIB                                                     10.0
WDSP                                                       5.2
MXSPD                                                     14.0
GUST                                                      15.0
MAX                                                       80.6
MIN                                                    

In [8]:
columns_to_remove = ['STATION', 'LATITUDE', 'LONGITUDE', 
                      'ELEVATION', 'NAME', 'SLP', 
                      'STP', 'VISIB', 'SNDP', 'FRSHTT']

df_cleaned1 = df_cleaned.drop(columns=columns_to_remove, errors='ignore')


In [9]:
df_cleaned1

Unnamed: 0,DATE,TEMP,DEWP,WDSP,MXSPD,GUST,MAX,MIN,PRCP
0,2023-01-01,51.7,38.6,3.0,12.0,19.0,66.2,41.0,0.00
1,2023-01-02,48.8,34.7,5.7,20.0,29.9,62.6,33.8,99.99
2,2023-01-03,42.1,30.1,6.8,22.9,31.1,53.6,33.8,0.00
3,2023-01-04,43.0,29.6,3.6,13.0,19.0,57.2,32.0,0.00
4,2023-01-05,45.0,29.3,0.0,999.9,999.9,60.8,30.2,0.00
...,...,...,...,...,...,...,...,...,...
360,2023-12-27,39.6,10.4,3.5,8.9,999.9,53.6,28.4,0.00
361,2023-12-28,41.5,11.1,2.1,7.0,999.9,55.4,26.6,0.00
362,2023-12-29,40.7,14.5,2.3,7.0,999.9,57.2,26.6,0.00
363,2023-12-30,39.7,14.6,0.8,5.1,999.9,55.4,26.6,0.00


In [10]:
null_counts = df_cleaned1.isnull().sum()
print("Number of null values in each column:\n", null_counts)


Number of null values in each column:
 DATE     0
TEMP     0
DEWP     0
WDSP     0
MXSPD    0
GUST     0
MAX      0
MIN      0
PRCP     0
dtype: int64


In [11]:
import os

processed_dir = "../data/curated"
os.makedirs(processed_dir, exist_ok=True)
df_cleaned1.to_csv(os.path.join(processed_dir, "weather_daily.csv"))
