In [61]:
import pandas as pd
import datetime

In [60]:
# Relative path of the Strathspey weather workbook.
STRATHSPEY_PATH = "data/Strathspey-weather.xlsx"

# Column names for a monthly sheet, in left-to-right order, each paired with
# a Python/pandas type for that column.
STRATHSPEY_COLUMNS = dict(
    date=pd.Timestamp,
    temp_mean=str,
    temp_min=str,
    temp_max=str,
    rain_mm=str,
    pressure_early=int,
    pressure_late=int,
    wind_mean=float,
    wind_max=float,
    wind_dir=str,
    sun_hours=float,
)


In [63]:
# Read the whole workbook in a single call:
#   sheet_name=None -> every sheet is loaded, giving a dict of DataFrames
#   header=None     -> no spreadsheet row is treated as a header row
strathspey_excel = pd.read_excel(STRATHSPEY_PATH, sheet_name=None, header=None)

In [62]:
# NOTE(review): in the visible notebook `df` is first assigned in a later cell
# (In [71]), so this cell raises NameError under Restart Kernel -> Run All.
# The table shown below presumably comes from an earlier, out-of-order
# execution — confirm, then move or remove this cell.
df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,,UTC,0900 UTC,0900 UTC,,UTC,UTC,UTC,UTC,UTC,UTC
1,1.0,2.6,0.2,5,8.8,994,1011,5.7,23,SSW,0.31
2,2.0,6.1,1.5,10.9,0.1,1010,1012,6.8,29,SW,2.94
3,3.0,8.9,3.9,10.4,0.0,1021,1024,10.6,32,SSW,1.98
4,4.0,8.5,7.2,10.4,0.0,1028,1029,8.5,32,SSW,0


In [None]:
import pandas as pd
import datetime


In [64]:
def drop_empty_rows(df):
    """Return a new DataFrame without the rows in which every cell is missing.

    A row that holds at least one non-missing value is kept. The input frame
    is not modified, and surviving rows keep their original index labels.
    """
    non_empty = df.dropna(axis="index", how="all")
    return non_empty

def clean_str_columns(value):
    """Strip leading/trailing whitespace from ``value`` when it is a string.

    Anything that is not a ``str`` (numbers, ``None``, NaN, ...) is returned
    unchanged, so the function can be mapped over mixed-type columns.
    """
    if not isinstance(value, str):
        return value
    return value.strip()


In [65]:
# strathspey_excel is a dict of DataFrames, one per worksheet, keyed by the
# sheet name in 'YYMM' form (two-digit year followed by two-digit month).
# Print the keys to see which months the workbook contains.
print(strathspey_excel.keys())


dict_keys(['1112', '1201', '1202', '1203', '1204', '1205', '1206', '1207', '1208', '1209', '1210', '1211', '1212', '1301'])


In [66]:
# Decode every sheet name ('YYMM') into a calendar year and month.
# NOTE: `key`, `year` and `month` stay bound to the LAST sheet once the loop
# finishes; the cells further down in the visible notebook read those names,
# so they operate on that last sheet only.
for key in strathspey_excel:
    year = int(key[:2]) + 2000   # leading 'YY' -> four-digit year
    month = int(key[2:])         # trailing 'MM' -> month number
    print(f"Processing data for {year}-{month:02d}")


Processing data for 2011-12
Processing data for 2012-01
Processing data for 2012-02
Processing data for 2012-03
Processing data for 2012-04
Processing data for 2012-05
Processing data for 2012-06
Processing data for 2012-07
Processing data for 2012-08
Processing data for 2012-09
Processing data for 2012-10
Processing data for 2012-11
Processing data for 2012-12
Processing data for 2013-01


In [71]:
# Work on the sheet named by `key` (bound by the key-parsing loop; after that
# loop it refers to the last sheet in the workbook).
df = strathspey_excel[key]

# Skip the 6 leading metadata/header rows. Positional slicing is equivalent to
# dropping labels 0-5 on the default 0..n-1 index produced by `header=None`,
# and it does not raise when a sheet has fewer than 6 rows.
df = df.iloc[6:]

# Remove fully empty rows, then renumber so the index is a gap-free 0..n-1.
df = drop_empty_rows(df).reset_index(drop=True)


In [72]:
# Name the positional columns after the keys of STRATHSPEY_COLUMNS, in their
# declared order (only the keys are used here; the type values are not applied).
df.columns = list(STRATHSPEY_COLUMNS)
df.head()


Unnamed: 0,date,temp_mean,temp_min,temp_max,rain_mm,pressure_early,pressure_late,wind_mean,wind_max,wind_dir,sun_hours
0,1,2.6,0.2,5.0,8.8,994,1011,5.7,23,SSW,0.31
1,2,6.1,1.5,10.9,0.1,1010,1012,6.8,29,SW,2.94
2,3,8.9,3.9,10.4,0.0,1021,1024,10.6,32,SSW,1.98
3,4,8.5,7.2,10.4,0.0,1028,1029,8.5,32,SSW,0.0
4,5,5.8,1.4,10.2,0.1,1026,1024,3.8,23,SSW,0.0


In [73]:
# Turn the day-of-month numbers in 'date' into full timestamps using the
# `year` and `month` parsed from the sheet name.
# - The guard makes the cell safe to re-run: once the column already holds
#   datetimes there is nothing left to convert.
# - Days are coerced to integers first, so float-typed day numbers such as 1.0
#   are accepted; pd.to_datetime then assembles the dates column-wise from the
#   'year' / 'month' / 'day' frame instead of calling datetime.date per row.
if not pd.api.types.is_datetime64_any_dtype(df['date']):
    day_of_month = pd.to_numeric(df['date']).astype('int64')
    date_parts = pd.DataFrame({'year': year, 'month': month, 'day': day_of_month})
    df['date'] = pd.to_datetime(date_parts)

In [74]:
# Strip surrounding whitespace from every string cell. A column is cleaned when
# it holds at least one string anywhere, so the outcome does not depend on the
# type of the first row or on a row labelled 0 existing (an empty frame is
# simply left untouched). Non-string cells pass through clean_str_columns
# unchanged.
for col in df.columns:
    column = df[col]
    if column.map(lambda cell: isinstance(cell, str)).any():
        df[col] = column.map(clean_str_columns)


In [75]:
# Preview the first rows of `df` (head() shows five rows by default).
df.head()

Unnamed: 0,date,temp_mean,temp_min,temp_max,rain_mm,pressure_early,pressure_late,wind_mean,wind_max,wind_dir,sun_hours
0,2013-01-01,2.6,0.2,5.0,8.8,994,1011,5.7,23,SSW,0.31
1,2013-01-02,6.1,1.5,10.9,0.1,1010,1012,6.8,29,SW,2.94
2,2013-01-03,8.9,3.9,10.4,0.0,1021,1024,10.6,32,SSW,1.98
3,2013-01-04,8.5,7.2,10.4,0.0,1028,1029,8.5,32,SSW,0.0
4,2013-01-05,5.8,1.4,10.2,0.1,1026,1024,3.8,23,SSW,0.0
