In [26]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import geopandas as gpd    
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization
sns.reset_defaults()
import geoplot as gplt
from geopy.geocoders import Nominatim
from shapely.geometry import Point
import matplotlib.colors as mcolors
from scipy.interpolate import griddata
#read data

df = pd.read_csv("weatherAUS.csv")
print(df.head())
#df.info()


## Find categorical variables

categorical = [var for var in df.columns if df[var].dtype=='O']

print('There are {} categorical variables\n'.format(len(categorical)))

print('Categorical variables are :', categorical)

## find missing values in categorical variables

#print(df[categorical].isnull().sum())

##frequency of categorical variables

#for var in categorical: 
        
#print(df[var].value_counts())

##check for cardinality in categorical variables

#for var in categorical:
    
#    print(var, ' contains ', len(df[var].unique()), ' labels')

         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  Humidity3pm  \
0           W           44.0          W  ...        71.0         22.0   
1         WNW           44.0        NNW  ...        44.0         25.0   
2         WSW           46.0          W  ...        38.0         30.0   
3          NE           24.0         SE  ...        45.0         16.0   
4           W           41.0        ENE  ...        82.0         33.0   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  \
0       1007.7    

In [28]:
## date variable contains 3436 labels so needs to be split into year/month/day

#print(df["Date"].dtypes)

df['Date']= pd.to_datetime(df['Date'])

df['Year'] = df['Date'].dt.year

df['Month'] = df['Date'].dt.month

df['Day'] = df['Date'].dt.day

df.drop('Date', axis=1, inplace = True)

#start looking into other categorical variables

#print('Location contains', len(df.Location.unique()), 'labels')

#print(df.Location.unique())

#one-hot encoding for categorical variables

# add most popular values for missing categorical values

for df2 in [df]:
    df2['WindGustDir'] = df2['WindGustDir'].fillna(df2['WindGustDir'].mode()[0])
    df2['WindDir9am'] = df2['WindDir9am'].fillna(df2['WindDir9am'].mode()[0])
    df2['WindDir3pm'] = df2['WindDir3pm'].fillna(df2['WindDir3pm'].mode()[0])
    df2['RainToday'] = df2['RainToday'].fillna(df2['RainToday'].mode()[0])



In [30]:
## kaartide jaoks location vajalik
df_map = pd.get_dummies(df, columns=[ 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday'], drop_first=True, dummy_na=True)

## muude toimetuste jaoks
df = pd.get_dummies(df, columns=['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday'], drop_first=True, dummy_na=True)

In [33]:
## Converting column 'RainTomorrow' to numeric variable

df['RainTomorrow'] = df['RainTomorrow'].map({'Yes': 1, 'No': 0})

## Handling missing values in column 'RainTomorrow'
df = df.dropna(subset=['RainTomorrow'])  # Drop rows with NaN in target

In [35]:
#explore numerical variables

numerical = [var for var in df.columns if df[var].dtype!='O']

#print('There are {} numerical variables\n'.format(len(numerical)))

#print('The numerical variables are :', numerical)

#19 numerical variables, all continuous type
#check for missing values

print(df[numerical].isnull().sum())

print(round(df[numerical].describe()),2)

MinTemp             637
MaxTemp             322
Rainfall           1406
Evaporation       60843
Sunshine          67816
                  ...  
WindDir3pm_WNW        0
WindDir3pm_WSW        0
WindDir3pm_nan        0
RainToday_Yes         0
RainToday_nan         0
Length: 119, dtype: int64
        MinTemp   MaxTemp  Rainfall  Evaporation  Sunshine  WindGustSpeed  \
count  141556.0  141871.0  140787.0      81350.0   74377.0       132923.0   
mean       12.0      23.0       2.0          5.0       8.0           40.0   
std         6.0       7.0       8.0          4.0       4.0           14.0   
min        -8.0      -5.0       0.0          0.0       0.0            6.0   
25%         8.0      18.0       0.0          3.0       5.0           31.0   
50%        12.0      23.0       0.0          5.0       8.0           39.0   
75%        17.0      28.0       1.0          7.0      11.0           48.0   
max        34.0      48.0     371.0        145.0      14.0          135.0   

       WindSpeed

In [139]:
##plots

plt.figure(figsize=(15,10))


plt.subplot(2, 2, 1)
fig = df.boxplot(column='Rainfall')
fig.set_title('')
fig.set_ylabel('Rainfall')


plt.subplot(2, 2, 2)
fig = df.boxplot(column='Evaporation')
fig.set_title('')
fig.set_ylabel('Evaporation')


plt.subplot(2, 2, 3)
fig = df.boxplot(column='WindSpeed9am')
fig.set_title('')
fig.set_ylabel('WindSpeed9am')


plt.subplot(2, 2, 4)
fig = df.boxplot(column='WindSpeed3pm')
fig.set_title('')
fig.set_ylabel('WindSpeed3pm')


plt.show()



In [14]:
## getting into prediction stuff

X = df.drop(['RainTomorrow'], axis=1)

y = df['RainTomorrow']


from sklearn.impute import SimpleImputer

## Replacing missing values?

numerical_cols = X.select_dtypes(include=['float', 'int']).columns
num_imputer = SimpleImputer(strategy='mean')  # Replace NaNs with mean for numerical features
X[numerical_cols] = num_imputer.fit_transform(X[numerical_cols])

In [37]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 15)

X_train.shape, X_test.shape

((113754, 118), (28439, 118))

In [39]:
## Scaling data 
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Initializing logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

model_logreg = LogisticRegression()
model_logreg.fit(X_train, y_train)

## Predictions and evaluation
y_pred = model_logreg.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8473575020218713
[[20827  1185]
 [ 3156  3271]]
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     22012
         1.0       0.73      0.51      0.60      6427

    accuracy                           0.85     28439
   macro avg       0.80      0.73      0.75     28439
weighted avg       0.84      0.85      0.84     28439



In [40]:
#kaardi genemine (see on lic temperatuuride kaart kindlal ajahetkel)
## peab kasutama df_map!!!

import geopandas as gpd
import geoplot as gplt
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from shapely.geometry import Point
import cartopy.crs as ccrs
import cartopy.feature as cfeature


df_filtered = df_map[(df['Year'] == 2012) & (df['Month'] == 4) & (df['Day'] == 30)].copy()

# 2. Geocode the locations to get latitude and longitude
geolocator = Nominatim(user_agent="geo_plotting")

# Create lists to store latitudes and longitudes
lons = []
lats = []

for location in df_filtered['Location']:
    location_info = geolocator.geocode(location + ", Australia")
    if location_info:
        lons.append(location_info.longitude)
        lats.append(location_info.latitude)
    else:
        lons.append(np.nan)
        lats.append(np.nan)

# 3. Add latitude and longitude to the filtered dataframe using .loc
df_filtered.loc[:, 'Longitude'] = lons
df_filtered.loc[:, 'Latitude'] = lats

# 4. Drop rows with NaN coordinates (e.g., missing longitude/latitude) or missing MaxTemp values
df_filtered = df_filtered.dropna(subset=['Longitude', 'Latitude', 'MaxTemp'])

# 5. Ensure lengths of coordinates and MaxTemp are the same
lons = df_filtered['Longitude']
lats = df_filtered['Latitude']
temps = df_filtered['MaxTemp']

# 6. Plot the locations with temperature labels
plt.figure(figsize=(10, 8))

# Create a Cartopy map with PlateCarree projection (for global lat-lon coordinates)
ax = plt.axes(projection=ccrs.PlateCarree())

# Add a natural coastline feature (cartopy feature)
ax.add_feature(cfeature.COASTLINE, edgecolor='black')
ax.add_feature(cfeature.BORDERS, linestyle=':')

# Add state boundaries for Australia
ax.add_feature(cfeature.STATES, linestyle=':', edgecolor='gray')

# Plot the location points on the map
sc = ax.scatter(lons, lats, c=temps, cmap='coolwarm', edgecolors='k', s=100)

# Add temperature labels at each location
for idx, row in df_filtered.iterrows():
    ax.text(row['Longitude'], row['Latitude'], f"{row['MaxTemp']}°C", fontsize=12, ha='center', color='black')

# Add a colorbar for the temperature scale
cbar = plt.colorbar(sc, ax=ax, label='Max Temperature (°C)')

# Add labels and title
plt.title("Max Temperature across Australia on 30 April 2012")
plt.xlabel("Longitude")
plt.ylabel("Latitude")

# Show the plot
plt.show()


  df_filtered = df_map[(df['Year'] == 2012) & (df['Month'] == 4) & (df['Day'] == 30)].copy()


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [62]:
## average max temp for all weather stations for january 

import geopandas as gpd
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from shapely.geometry import Point
import numpy as np
from scipy.interpolate import griddata
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from pykrige.ok import OrdinaryKriging

krige = OrdinaryKriging(lons, lats, temps, variogram_model='linear')

# 1. Filter the dataset for January
df_january = df_map[df_map['Month'] == 1].copy()

# 2. Drop rows with missing temperature data
df_january = df_january.dropna(subset=['MaxTemp'])

# 3. Group by weather station and calculate the average temperature
# Assuming 'Location' represents the weather station
avg_temps_january = df_january.groupby('Location')['MaxTemp'].mean().reset_index()

# 4. Rename columns for clarity
avg_temps_january.columns = ['Location', 'MaxTemp']

# 5. Display the result
##print(avg_temps_january)


# 2. Geocode the locations to get latitude and longitude
geolocator = Nominatim(user_agent="geo_plotting")

# Create lists to store latitudes and longitudes
lons = []
lats = []

for location in avg_temps_january['Location']:
    location_info = geolocator.geocode(location + ", Australia")
    if location_info:
        lons.append(location_info.longitude)
        lats.append(location_info.latitude)
    else:
        lons.append(np.nan)
        lats.append(np.nan)

# Add latitude and longitude to the filtered dataframe using .loc
avg_temps_january.loc[:, 'Longitude'] = lons
avg_temps_january.loc[:, 'Latitude'] = lats

# Drop rows with NaN coordinates or missing MaxTemp values
avg_temps_january = avg_temps_january.dropna(subset=['Longitude', 'Latitude', 'MaxTemp'])

# Extract data for interpolation
lons = avg_temps_january['Longitude'].values
lats = avg_temps_january['Latitude'].values
temps = avg_temps_january['MaxTemp'].values

# 3. Define a grid for interpolation
lon_min, lon_max = lons.min() - 1, lons.max() + 1
lat_min, lat_max = lats.min() - 1, lats.max() + 1
lon_grid, lat_grid = np.meshgrid(
    np.linspace(lon_min, lon_max, 200), 
    np.linspace(lat_min, lat_max, 200)
)

# 4. Interpolate temperature values onto the grid
temp_grid, _ = krige.execute(
    'grid', 
    np.linspace(lon_min, lon_max, 200), 
    np.linspace(lat_min, lat_max, 200)
)



# 5. Plot the data
plt.figure(figsize=(12, 10))

# Create a Cartopy map with PlateCarree projection
ax = plt.axes(projection=ccrs.PlateCarree())

# Add map features
ax.add_feature(cfeature.COASTLINE, edgecolor='black')
ax.add_feature(cfeature.BORDERS, linestyle=':')
ax.add_feature(cfeature.STATES, linestyle=':', edgecolor='gray')

# Plot interpolated temperature as a contour map
contour = ax.contourf(
    lon_grid, lat_grid, temp_grid, 
    levels=20, cmap='coolwarm', transform=ccrs.PlateCarree()
)

# Add the original data points
sc = ax.scatter(lons, lats, c=temps, cmap='coolwarm', edgecolors='k', s=100, label='Data Points')

# Add a colorbar for the temperature scale
cbar = plt.colorbar(contour, ax=ax, orientation='vertical', label='average max Temperature (°C)')

# Add labels and title
plt.title("Interpolated average Max Temperature across Australia in january")
plt.xlabel("Longitude")
plt.ylabel("Latitude")

# Show the plot
plt.show()

In [49]:
#interpoleeriv kaart, pead täpsustama kuupäva df filtered all, kriging näeb hea välja
# kasuta df_map
#pip install pykrige
import geopandas as gpd
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from shapely.geometry import Point
import numpy as np
from scipy.interpolate import griddata
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from pykrige.ok import OrdinaryKriging

krige = OrdinaryKriging(lons, lats, temps, variogram_model='linear')


# 1. Filter data for a specific date
df_filtered = df_map[(df['Year'] == 2015) & (df['Month'] == 1) & (df['Day'] == 1)].copy()

# 2. Geocode the locations to get latitude and longitude
geolocator = Nominatim(user_agent="geo_plotting")

# Create lists to store latitudes and longitudes
lons = []
lats = []

for location in df_filtered['Location']:
    location_info = geolocator.geocode(location + ", Australia")
    if location_info:
        lons.append(location_info.longitude)
        lats.append(location_info.latitude)
    else:
        lons.append(np.nan)
        lats.append(np.nan)

# Add latitude and longitude to the filtered dataframe using .loc
df_filtered.loc[:, 'Longitude'] = lons
df_filtered.loc[:, 'Latitude'] = lats

# Drop rows with NaN coordinates or missing MaxTemp values
df_filtered = df_filtered.dropna(subset=['Longitude', 'Latitude', 'MaxTemp'])

# Extract data for interpolation
lons = df_filtered['Longitude'].values
lats = df_filtered['Latitude'].values
temps = df_filtered['MaxTemp'].values

# 3. Define a grid for interpolation
lon_min, lon_max = lons.min() - 1, lons.max() + 1
lat_min, lat_max = lats.min() - 1, lats.max() + 1
lon_grid, lat_grid = np.meshgrid(
    np.linspace(lon_min, lon_max, 200), 
    np.linspace(lat_min, lat_max, 200)
)

# 4. Interpolate temperature values onto the grid
temp_grid, _ = krige.execute(
    'grid', 
    np.linspace(lon_min, lon_max, 200), 
    np.linspace(lat_min, lat_max, 200)
)



# 5. Plot the data
plt.figure(figsize=(12, 10))

# Create a Cartopy map with PlateCarree projection
ax = plt.axes(projection=ccrs.PlateCarree())

# Add map features
ax.add_feature(cfeature.COASTLINE, edgecolor='black')
ax.add_feature(cfeature.BORDERS, linestyle=':')
ax.add_feature(cfeature.STATES, linestyle=':', edgecolor='gray')

# Plot interpolated temperature as a contour map
contour = ax.contourf(
    lon_grid, lat_grid, temp_grid, 
    levels=20, cmap='coolwarm', transform=ccrs.PlateCarree()
)

# Add the original data points
sc = ax.scatter(lons, lats, c=temps, cmap='coolwarm', edgecolors='k', s=100, label='Data Points')

# Add a colorbar for the temperature scale
cbar = plt.colorbar(contour, ax=ax, orientation='vertical', label='Max Temperature (°C)')

# Add labels and title
plt.title("Interpolated Max Temperature across Australia on 30 April 2012")
plt.xlabel("Longitude")
plt.ylabel("Latitude")

# Show the plot
plt.show()



In [42]:
# merge year month day back together
df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])
df = df.set_index('Date')

# Drop Year, Month, Day columns if not needed
df = df.drop(['Year', 'Month', 'Day'], axis=1)

In [46]:
df

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,WindDir3pm_nan,RainToday_Yes,RainToday_nan
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-12-01,13.4,22.9,0.6,,,44.0,20.0,24.0,71.0,22.0,...,False,False,False,False,False,True,False,False,False,False
2008-12-02,7.4,25.1,0.0,,,44.0,4.0,22.0,44.0,25.0,...,False,False,False,False,False,False,True,False,False,False
2008-12-03,12.9,25.7,0.0,,,46.0,19.0,26.0,38.0,30.0,...,False,False,False,False,False,False,True,False,False,False
2008-12-04,9.2,28.0,0.0,,,24.0,11.0,9.0,45.0,16.0,...,False,False,False,False,False,False,False,False,False,False
2008-12-05,17.5,32.3,1.0,,,41.0,7.0,20.0,82.0,33.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-06-20,3.5,21.8,0.0,,,31.0,15.0,13.0,59.0,27.0,...,False,False,False,False,False,False,False,False,False,False
2017-06-21,2.8,23.4,0.0,,,31.0,13.0,11.0,51.0,24.0,...,False,False,False,False,False,False,False,False,False,False
2017-06-22,3.6,25.3,0.0,,,22.0,13.0,9.0,56.0,21.0,...,False,False,False,False,False,False,False,False,False,False
2017-06-23,5.4,26.9,0.0,,,37.0,9.0,9.0,53.0,24.0,...,False,False,False,False,False,True,False,False,False,False


In [52]:
#sliding time window



def create_sliding_window(df, features, window_size, forecast_horizon=1, target='MaxTemp'):
    """
    Create a sliding window dataset for time series forecasting.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        features (list): List of feature column names.
        window_size (int): Number of past time steps to include.
        forecast_horizon (int): Number of steps ahead to predict.
        target (str): The target column name.

    Returns:
        pd.DataFrame: Feature matrix (X).
        pd.Series: Target variable (y).
    """
    X = []
    y = []
    
    for i in range(len(df) - window_size - forecast_horizon + 1):
        # Create sliding window features for the given columns
        feature_values = df[features].iloc[i:i + window_size].values.flatten()
        X.append(feature_values)
        # Target is the value at the forecast horizon
        y.append(df[target].iloc[i + window_size + forecast_horizon - 1])
    
    # Generate column names for features
    feature_columns = [
        f"{col}_Lag_{j+1}" for j in range(window_size) for col in features
    ]
    
    return pd.DataFrame(X, columns=feature_columns), pd.Series(y, name='Target')

# Parameters
features = ['MinTemp', 'MaxTemp', 'Rainfall']
window_size = 7  # Use past 3 days as features
forecast_horizon = 1  # Predict 1 day ahead

# Create sliding window dataset
X, y = create_sliding_window(df, features, window_size, forecast_horizon)

# Combine features and target for clarity
sliding_window_df = pd.concat([X, y], axis=1)

print(sliding_window_df)


        MinTemp_Lag_1  MaxTemp_Lag_1  Rainfall_Lag_1  MinTemp_Lag_2  \
0                13.4           22.9             0.6            7.4   
1                 7.4           25.1             0.0           12.9   
2                12.9           25.7             0.0            9.2   
3                 9.2           28.0             0.0           17.5   
4                17.5           32.3             1.0           14.6   
...               ...            ...             ...            ...   
142185            6.4           23.4             0.0            8.0   
142186            8.0           20.7             0.0            7.4   
142187            7.4           20.6             0.0            3.5   
142188            3.5           21.8             0.0            2.8   
142189            2.8           23.4             0.0            3.6   

        MaxTemp_Lag_2  Rainfall_Lag_2  MinTemp_Lag_3  MaxTemp_Lag_3  \
0                25.1             0.0           12.9           25.7   
1    

In [54]:
def create_sliding_window(df, features, window_size, forecast_horizon=1, target='MaxTemp'):
    X = []
    y = []
    for i in range(len(df) - window_size - forecast_horizon + 1):
        feature_values = df[features].iloc[i:i + window_size].values.flatten()
        X.append(feature_values)
        y.append(df[target].iloc[i + window_size + forecast_horizon - 1])
    feature_columns = [
        f"{col}_Lag_{j+1}" for j in range(window_size) for col in features
    ]
    return pd.DataFrame(X, columns=feature_columns), pd.Series(y, name='Target')

X, y = create_sliding_window(df, features, window_size)
model.fit(X, y)

# Predict for 10th February 2015
target_date = pd.Timestamp('2015-02-10')

# Ensure we have enough data before the target date
if target_date not in df.index:
    raise ValueError(f"Target date {target_date} is not in the dataset.")
start_date = target_date - pd.Timedelta(days=window_size)
if start_date not in df.index:
    raise ValueError(f"Insufficient data to create sliding window for {target_date}.")

# Extract past window_size days of data
input_features = df.loc[start_date:target_date - pd.Timedelta(days=1), features].values.flatten()

# Ensure input length matches the model's expected input shape
if len(input_features) != window_size * len(features):
    raise ValueError(f"Insufficient data to create sliding window for {target_date}.")

# Make the prediction
predicted_value = model.predict([input_features])[0]
print(f"Predicted MaxTemp for {target_date.date()}: {predicted_value:.2f}°C")

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values