In [20]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import geopandas as gpd    
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization
sns.reset_defaults()
import geoplot as gplt
from geopy.geocoders import Nominatim
from shapely.geometry import Point
import matplotlib.colors as mcolors
from scipy.interpolate import griddata
#read data

#data = r"C:\Users\mihke\OneDrive\Documents\projekt\Aus_weather_forecast\Andmed\weatherAUS.csv"
data = r"C:\Users\60104167012\OneDrive - Keskkonnaministeerium\projekt_aus_weather\projekt\Andmed\weatherAUS.csv"
df = pd.read_csv(data)
print(df.head())
#df.info()


## Find categorical variables

categorical = [var for var in df.columns if df[var].dtype=='O']

print('There are {} categorical variables\n'.format(len(categorical)))

print('Categorical variables are :', categorical)

## find missing values in categorical variables

#print(df[categorical].isnull().sum())

##frequency of categorical variables

#for var in categorical: 
        
#print(df[var].value_counts())

##check for cardinality in categorical variables

#for var in categorical:
    
#    print(var, ' contains ', len(df[var].unique()), ' labels')

         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  Humidity3pm  \
0           W           44.0          W  ...        71.0         22.0   
1         WNW           44.0        NNW  ...        44.0         25.0   
2         WSW           46.0          W  ...        38.0         30.0   
3          NE           24.0         SE  ...        45.0         16.0   
4           W           41.0        ENE  ...        82.0         33.0   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  \
0       1007.7    

In [9]:
## date variable contains 3436 labels so needs to be split into year/month/day

#print(df["Date"].dtypes)

df['Date']= pd.to_datetime(df['Date'])

df['Year'] = df['Date'].dt.year

df['Month'] = df['Date'].dt.month

df['Day'] = df['Date'].dt.day

df.drop('Date', axis=1, inplace = True)

#start looking into other categorical variables

#print('Location contains', len(df.Location.unique()), 'labels')

#print(df.Location.unique())

#one-hot encoding for location variables

pd.get_dummies(df.Location, drop_first=True).astype(int).head()

#one-hot encoding for wind gust direction variables, also add dummy for nan values

pd.get_dummies(df.WindGustDir, drop_first=True, dummy_na=True).astype(int).head()

#one-hot encoding for wind dir 9am variables, also add dummy for nan values

pd.get_dummies(df.WindDir9am, drop_first=True, dummy_na=True).astype(int).head()

#one-hot encoding for wind dir 9am variables, also add dummy for nan values

pd.get_dummies(df.WindDir3pm, drop_first=True, dummy_na=True).astype(int).head()

#one-hot encoding for raintoday variable, add dummy for nan values

pd.get_dummies(df.RainToday, drop_first=True, dummy_na=True).astype(int).head()


Unnamed: 0,Yes,NaN
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [11]:
#explore numerical variables

numerical = [var for var in df.columns if df[var].dtype!='O']

#print('There are {} numerical variables\n'.format(len(numerical)))

#print('The numerical variables are :', numerical)

#19 numerical variables, all continuous type
#check for missing values

print(df[numerical].isnull().sum())

print(round(df[numerical].describe()),2)

MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustSpeed    10263
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
Year                 0
Month                0
Day                  0
dtype: int64
        MinTemp   MaxTemp  Rainfall  Evaporation  Sunshine  WindGustSpeed  \
count  143975.0  144199.0  142199.0      82670.0   75625.0       135197.0   
mean       12.0      23.0       2.0          5.0       8.0           40.0   
std         6.0       7.0       8.0          4.0       4.0           14.0   
min        -8.0      -5.0       0.0          0.0       0.0            6.0   
25%         8.0      18.0       0.0          3.0       5.0           31.0   
50%        12.0      23.0       0.0          5.0       8.0           39.0   
75%        

In [10]:
# rainfall, evaporation, windspeed9am and windspeed 3pm might contain extreme outliers

# using interquantile ranges to define outliers

# find outliers for Rainfall variable

IQR = df.Rainfall.quantile(0.75) - df.Rainfall.quantile(0.25)
Lower_fence = df.Rainfall.quantile(0.25) - (IQR * 3)
Upper_fence = df.Rainfall.quantile(0.75) + (IQR * 3)
print('Rainfall outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

#find outliers for other numerical variables

IQR = df.Evaporation.quantile(0.75) - df.Evaporation.quantile(0.25)
Lower_fence = df.Evaporation.quantile(0.25) - (IQR * 3)
Upper_fence = df.Evaporation.quantile(0.75) + (IQR * 3)
print('Evaporation outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

IQR = df.WindSpeed9am.quantile(0.75) - df.WindSpeed9am.quantile(0.25)
Lower_fence = df.WindSpeed9am.quantile(0.25) - (IQR * 3)
Upper_fence = df.WindSpeed9am.quantile(0.75) + (IQR * 3)
print('WindSpeed9am outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

IQR = df.WindSpeed3pm.quantile(0.75) - df.WindSpeed3pm.quantile(0.25)
Lower_fence = df.WindSpeed3pm.quantile(0.25) - (IQR * 3)
Upper_fence = df.WindSpeed3pm.quantile(0.75) + (IQR * 3)
print('WindSpeed3pm outliers are values < {lowerboundary} or > {upperboundary}'.format(lowerboundary=Lower_fence, upperboundary=Upper_fence))

Rainfall outliers are values < -2.4000000000000004 or > 3.2
Evaporation outliers are values < -11.800000000000002 or > 21.800000000000004
WindSpeed9am outliers are values < -29.0 or > 55.0
WindSpeed3pm outliers are values < -20.0 or > 57.0


In [12]:
##plots

plt.figure(figsize=(15,10))


plt.subplot(2, 2, 1)
fig = df.boxplot(column='Rainfall')
fig.set_title('')
fig.set_ylabel('Rainfall')


plt.subplot(2, 2, 2)
fig = df.boxplot(column='Evaporation')
fig.set_title('')
fig.set_ylabel('Evaporation')


plt.subplot(2, 2, 3)
fig = df.boxplot(column='WindSpeed9am')
fig.set_title('')
fig.set_ylabel('WindSpeed9am')


plt.subplot(2, 2, 4)
fig = df.boxplot(column='WindSpeed3pm')
fig.set_title('')
fig.set_ylabel('WindSpeed3pm')


plt.show()



In [13]:
## getting into prediction stuff

X = df.drop(['RainTomorrow'], axis=1)

y = df['RainTomorrow']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 15)

X_train.shape, X_test.shape

((116368, 24), (29092, 24))

In [14]:

#kaardi genemine (see on lic temperatuuride kaart kindlal ajahetkel)

import geopandas as gpd
import geoplot as gplt
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from shapely.geometry import Point
import cartopy.crs as ccrs
import cartopy.feature as cfeature


df_filtered = df[(df['Year'] == 2012) & (df['Month'] == 4) & (df['Day'] == 30)].copy()

# 2. Geocode the locations to get latitude and longitude
geolocator = Nominatim(user_agent="geo_plotting")

# Create lists to store latitudes and longitudes
lons = []
lats = []

for location in df_filtered['Location']:
    location_info = geolocator.geocode(location + ", Australia")
    if location_info:
        lons.append(location_info.longitude)
        lats.append(location_info.latitude)
    else:
        lons.append(np.nan)
        lats.append(np.nan)

# 3. Add latitude and longitude to the filtered dataframe using .loc
df_filtered.loc[:, 'Longitude'] = lons
df_filtered.loc[:, 'Latitude'] = lats

# 4. Drop rows with NaN coordinates (e.g., missing longitude/latitude) or missing MaxTemp values
df_filtered = df_filtered.dropna(subset=['Longitude', 'Latitude', 'MaxTemp'])

# 5. Ensure lengths of coordinates and MaxTemp are the same
lons = df_filtered['Longitude']
lats = df_filtered['Latitude']
temps = df_filtered['MaxTemp']

# 6. Plot the locations with temperature labels
plt.figure(figsize=(10, 8))

# Create a Cartopy map with PlateCarree projection (for global lat-lon coordinates)
ax = plt.axes(projection=ccrs.PlateCarree())

# Add a natural coastline feature (cartopy feature)
ax.add_feature(cfeature.COASTLINE, edgecolor='black')
ax.add_feature(cfeature.BORDERS, linestyle=':')

# Add state boundaries for Australia
ax.add_feature(cfeature.STATES, linestyle=':', edgecolor='gray')

# Plot the location points on the map
sc = ax.scatter(lons, lats, c=temps, cmap='coolwarm', edgecolors='k', s=100)

# Add temperature labels at each location
for idx, row in df_filtered.iterrows():
    ax.text(row['Longitude'], row['Latitude'], f"{row['MaxTemp']}°C", fontsize=12, ha='center', color='black')

# Add a colorbar for the temperature scale
cbar = plt.colorbar(sc, ax=ax, label='Max Temperature (°C)')

# Add labels and title
plt.title("Max Temperature across Australia on 30 April 2012")
plt.xlabel("Longitude")
plt.ylabel("Latitude")

# Show the plot
plt.show()



In [15]:
#interpoleeriv kaart

import geopandas as gpd
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from shapely.geometry import Point
import numpy as np
from scipy.interpolate import griddata
import cartopy.crs as ccrs
import cartopy.feature as cfeature

# 1. Filter data for a specific date
df_filtered = df[(df['Year'] == 2012) & (df['Month'] == 4) & (df['Day'] == 30)].copy()

# 2. Geocode the locations to get latitude and longitude
geolocator = Nominatim(user_agent="geo_plotting")

# Create lists to store latitudes and longitudes
lons = []
lats = []

for location in df_filtered['Location']:
    location_info = geolocator.geocode(location + ", Australia")
    if location_info:
        lons.append(location_info.longitude)
        lats.append(location_info.latitude)
    else:
        lons.append(np.nan)
        lats.append(np.nan)

# Add latitude and longitude to the filtered dataframe using .loc
df_filtered.loc[:, 'Longitude'] = lons
df_filtered.loc[:, 'Latitude'] = lats

# Drop rows with NaN coordinates or missing MaxTemp values
df_filtered = df_filtered.dropna(subset=['Longitude', 'Latitude', 'MaxTemp'])

# Extract data for interpolation
lons = df_filtered['Longitude'].values
lats = df_filtered['Latitude'].values
temps = df_filtered['MaxTemp'].values

# 3. Define a grid for interpolation
lon_min, lon_max = lons.min() - 1, lons.max() + 1
lat_min, lat_max = lats.min() - 1, lats.max() + 1
lon_grid, lat_grid = np.meshgrid(
    np.linspace(lon_min, lon_max, 200), 
    np.linspace(lat_min, lat_max, 200)
)

# 4. Interpolate temperature values onto the grid
temp_grid = griddata(
    points=(lons, lats), 
    values=temps, 
    xi=(lon_grid, lat_grid), 
    method='linear'
)

# 5. Plot the data
plt.figure(figsize=(12, 10))

# Create a Cartopy map with PlateCarree projection
ax = plt.axes(projection=ccrs.PlateCarree())

# Add map features
ax.add_feature(cfeature.COASTLINE, edgecolor='black')
ax.add_feature(cfeature.BORDERS, linestyle=':')
ax.add_feature(cfeature.STATES, linestyle=':', edgecolor='gray')

# Plot interpolated temperature as a contour map
contour = ax.contourf(
    lon_grid, lat_grid, temp_grid, 
    levels=20, cmap='coolwarm', transform=ccrs.PlateCarree()
)

# Add the original data points
sc = ax.scatter(lons, lats, c=temps, cmap='coolwarm', edgecolors='k', s=100, label='Data Points')

# Add a colorbar for the temperature scale
cbar = plt.colorbar(contour, ax=ax, orientation='vertical', label='Max Temperature (°C)')

# Add labels and title
plt.title("Interpolated Max Temperature across Australia on 30 April 2012")
plt.xlabel("Longitude")
plt.ylabel("Latitude")

# Show the plot
plt.show()
