In [84]:
import pandas as pd
import datetime

In [85]:
# Location of the raw Strathspey weather workbook.
STRATHSPEY_PATH = "data/Strathspey-weather.xlsx"

# Column labels for a raw sheet, left to right, each mapped to a nominal
# Python type. Insertion order matters: the keys are used positionally as
# column names.
STRATHSPEY_COLUMNS = dict(
    date=pd.Timestamp,
    temp_mean=str,
    temp_min=str,
    temp_max=str,
    rain_mm=str,
    pressure_early=int,
    pressure_late=int,
    wind_mean=float,
    wind_max=float,
    wind_dir=str,
    sun_hours=float,
)


In [86]:
# Read every sheet of the workbook in one call. sheet_name=None makes
# read_excel return a dict of DataFrames keyed by sheet name; header=None
# keeps the first row as data instead of promoting it to column labels.
strathspey_excel = pd.read_excel(STRATHSPEY_PATH, sheet_name=None, header=None)

In [87]:
# NOTE(review): no visible cell above this one assigns `df`; the only visible
# assignment is inside the per-sheet processing loop further down. On a fresh
# kernel (Restart & Run All) this cell would raise NameError unless `df` is
# defined in a part of the notebook not shown here — TODO confirm, then move
# this preview below the loop or remove it.
df.head()


Unnamed: 0,date,temp_mean,temp_min,temp_max,rain_mm,pressure_early,pressure_late,wind_mean,wind_max,wind_dir,sun_hours
0,2013-01-01,2.6,0.2,5.0,8.8,994,1011,5.7,23,SSW,0.31
1,2013-01-02,6.1,1.5,10.9,0.1,1010,1012,6.8,29,SW,2.94
2,2013-01-03,8.9,3.9,10.4,0.0,1021,1024,10.6,32,SSW,1.98
3,2013-01-04,8.5,7.2,10.4,0.0,1028,1029,8.5,32,SSW,0.0
4,2013-01-05,5.8,1.4,10.2,0.1,1026,1024,3.8,23,SSW,0.0


In [88]:
import pandas as pd
import datetime


In [104]:
def drop_empty_rows(df):
    """Return ``df`` without the rows in which every cell is missing.

    Rows holding at least one non-missing value are kept. The input frame is
    left untouched; a new frame is returned.
    """
    without_blank_rows = df.dropna(axis=0, how='all')
    return without_blank_rows

def clean_str_columns(value):
    """Strip surrounding whitespace from a single cell value.

    Only ``str`` values are changed; anything else (numbers, None, NaN, ...)
    is handed back exactly as received.
    """
    if not isinstance(value, str):
        return value
    return value.strip()

def clean_rain_mm(value):
    """Convert one raw ``rain_mm`` cell to a float.

    Special markers handled:
      * ``TR`` (any letter case) is treated as trace rainfall and mapped to 0.0.
      * a trailing ``s`` or ``S`` (snowfall indicator) is removed before the
        number is parsed.

    Args:
        value: Raw cell content (number, string, or a missing value).

    Returns:
        The parsed value as a float, or None when the cell is missing or
        cannot be parsed as a number.
    """
    if pd.isna(value):
        return None
    text = str(value).strip()
    if text.upper() == 'TR':  # TR means trace rainfall
        return 0.0
    # Match the snowfall suffix case-insensitively, consistent with the 'TR'
    # check above; otherwise a reading such as '1.5S' would be discarded.
    text = text.rstrip('sS').strip()
    try:
        return float(text)
    except ValueError:
        return None

In [115]:
# strathspey_excel is a dict of raw DataFrames keyed by sheet name 'YYMM'
print(strathspey_excel.keys())

# Leading rows of every sheet (metadata, headers, etc.) that are skipped
# before the daily records start.
N_HEADER_ROWS = 6
# A row is discarded when more than this many of its cells are missing.
MAX_MISSING_PER_ROW = 3

monthly_frames = []  # one cleaned DataFrame per sheet

for key in strathspey_excel.keys():
    year = 2000 + int(key[:2])   # first two chars → year
    month = int(key[2:])         # remaining chars → month
    print(f"Processing data for {year}-{month:02d}")

    # Skip the leading rows by position, so a sheet shorter than
    # N_HEADER_ROWS yields an empty frame instead of a KeyError.
    df = strathspey_excel[key].iloc[N_HEADER_ROWS:].reset_index(drop=True)

    # Remove rows that are completely empty
    df = drop_empty_rows(df)

    # Label the columns; list() makes explicit that only the dict keys
    # (in insertion order) are used as names.
    df.columns = list(STRATHSPEY_COLUMNS)

    # Keep only rows whose date cell is numeric (the day of the month).
    # assign() builds a new frame rather than writing into a filtered one.
    df = (
        df.assign(date=pd.to_numeric(df['date'], errors='coerce'))
          .dropna(subset=['date'])
    )

    # Combine the sheet's year/month with the day number into a timestamp
    df = df.assign(
        date=df['date'].apply(
            lambda day: pd.Timestamp(year=year, month=month, day=int(day))
        )
    )

    # Remove rows with more than MAX_MISSING_PER_ROW missing values.
    # copy() gives an independent frame for the column assignments below.
    df = df.dropna(thresh=len(df.columns) - MAX_MISSING_PER_ROW).copy()

    # Strip whitespace in every column that holds at least one string.
    # The whole column is scanned instead of probing label 0: that label may
    # have been dropped above, and a missing/numeric first cell must not
    # cause the rest of the column to be skipped.
    for col in df.columns:
        if df[col].map(lambda cell: isinstance(cell, str)).any():
            df[col] = df[col].apply(clean_str_columns)

    # Clean rain_mm column specifically
    df['rain_mm'] = df['rain_mm'].apply(clean_rain_mm)

    monthly_frames.append(df)

# Concatenate all monthly frames into one
strathspey_df = pd.concat(monthly_frames, ignore_index=True)

dict_keys(['1112', '1201', '1202', '1203', '1204', '1205', '1206', '1207', '1208', '1209', '1210', '1211', '1212', '1301'])
Processing data for 2011-12
Processing data for 2012-01
Processing data for 2012-02
Processing data for 2012-03
Processing data for 2012-04
Processing data for 2012-05
Processing data for 2012-06
Processing data for 2012-07
Processing data for 2012-08
Processing data for 2012-09
Processing data for 2012-10
Processing data for 2012-11
Processing data for 2012-12
Processing data for 2013-01


In [116]:
# Drop any rows of the combined frame in which every column is missing.
strathspey_df = drop_empty_rows(strathspey_df)

In [106]:
# Preview the first rows of the combined frame as a sanity check.
strathspey_df.head()

Unnamed: 0,date,temp_mean,temp_min,temp_max,rain_mm,pressure_early,pressure_late,wind_mean,wind_max,wind_dir,sun_hours
0,2011-12-01,3.6,-0.4,4.6,0.0,995,1004,4.1,36,SSW,2.7
1,2011-12-02,2.6,-1.5,7.5,5.5,1003,983,6.5,36,SSW,1.8
2,2011-12-03,2.4,0.4,4.4,6.0,986,988,9.5,46,SSW,0.75
3,2011-12-04,-1.6,-0.9,0.4,3.7,988,992,4.5,22,SSW,1.66
4,2011-12-05,-1.6,-4.2,0.2,2.9,994,998,2.8,17,SSW,1.08


In [117]:
# Save the consolidated DataFrame to an Excel file.
# index=False keeps the row index out of the written sheet; the print
# reports how many rows were written and where.
output_path = "data/strathspey_combined.xlsx"
strathspey_df.to_excel(output_path, index=False)
print(f"Saved {len(strathspey_df)} rows to {output_path}")

Saved 426 rows to data/strathspey_combined.xlsx
