In [1]:
import pandas as pd
import numpy as np
import cloupy as cl

# Data download & cleaning

In [None]:
df = cl.d_imgw_data('prompt', 'synop', years_range=range(2007, 2023))

In [11]:
df = df.iloc[:, [1, 3, 5, 29, 25, 41]]
df.columns = ['station', 'month', 'hour', 'temp', 'ws', 'slp']

In [13]:
df.to_csv('meteo_data.csv')

# Data preparation

In [2]:
df = pd.read_csv('meteo_data.csv', index_col=0)

In [559]:
stations = ['KASPROWY WIERCH', 'WROCŁAW', 'SUWAŁKI', 'POZNAŃ', 'WARSZAWA', 'SZCZECIN']
df = df[df['station'].isin(stations)]

In [560]:
df

Unnamed: 0,station,month,hour,temp,ws,slp
0,SUWAŁKI,1,0,5.3,8,990.0
1,SUWAŁKI,1,1,5.3,6,990.2
2,SUWAŁKI,1,2,5.1,5,989.9
3,SUWAŁKI,1,3,4.8,6,988.9
4,SUWAŁKI,1,4,4.4,7,988.3
...,...,...,...,...,...,...
8755,KASPROWY WIERCH,12,19,5.3,15,804.9
8756,KASPROWY WIERCH,12,20,5.2,12,805.6
8757,KASPROWY WIERCH,12,21,5.7,11,806.4
8758,KASPROWY WIERCH,12,22,5.0,10,806.5


# Class

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

class LRModel:
    def __init__(self, df):
        self.df = df
        self.filtered_df = None
        self.stations = None
        
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        
        self.found_C = None
        self.found_solver = None
        self.found_max_iter = None
        
    def select_stations(self, stations):
        filtered_df = df[df['station'].isin(stations)]
        
        self.filtered_df = filtered_df
        self.stations = stations
        print(f'Unique stations in the filtered dataframe: {self.filtered_df.station.unique()}\n')
        
    def create_train_and_test_set(self, test_size=0.2):  
        target = self.filtered_df.station.to_numpy()
        variables = self.filtered_df.drop('station', axis=1).to_numpy()
        
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(variables, target, test_size=test_size)
        
    def standardize_data(self):
        scaler = StandardScaler()
        self.x_train = scaler.fit_transform(self.x_train)
        self.x_test = scaler.transform(self.x_test)
    
    def find_hyperparameters(self, C=[0.0, 0.275, 0.55, 0.825, 1.1], solver=['saga', 'lbfgs'], max_iter=[100, 1000, 2500, 5000, 10000]):
        param_grid = [
            {'C': C,
             'solver': solver,
             'max_iter': max_iter
            }
        ]
        
        clf = GridSearchCV(LogisticRegression(), param_grid=param_grid, cv=3, verbose=3, n_jobs=-1)
        best_clf = clf.fit(self.x_train, self.y_train)

        self.found_C = best_clf.best_params_['C']
        self.found_solver = best_clf.best_params_['solver']
        self.found_max_iter = best_clf.best_params_['max_iter']
        
        print(f'Found hyperparameters: {best_clf.best_params_}')
        
    def fit(self, solver=None, max_iter=None, C=None):
        if solver is None:
            solver = self.found_solver
        if max_iter is None:
            max_iter = self.found_max_iter
        if C is None:
            C = self.found_C
        
        log_reg = LogisticRegression(solver=solver, max_iter=max_iter, C=C, n_jobs=-1)
        log_reg.fit(self.x_train, self.y_train)
        
        self.model = log_reg
        
    def evaluate(self):
        scor = self.model.score(self.x_test, self.y_test)
        f1_scor = f1_score(self.y_test, self.model.predict(self.x_test), labels=self.stations, average=None)
        print( 
            f'Score [%]: {round(scor, 2)}\n'
            f'F1 Score [%]: {[(self.stations[i], round(f1_scor[i], 2)) for i in range(len(self.stations))]}'
             )
    

# A. 3 stations

### A.1. KASPROWY WIERCH, SUWAŁKI, SZCZECIN: these are 3 stations with a completely different climate (alpine, more continental, more oceanic)

In [711]:
from sklearn.model_selection import GridSearchCV

LR = LRModel(df)

In [712]:
selected_stations = ['KASPROWY WIERCH', 'SUWAŁKI', 'SZCZECIN']

LR.select_stations(selected_stations)
LR.create_train_and_test_set()
LR.standardize_data()

LR.find_hyperparameters()
LR.fit()
LR.evaluate()

Unique stations in the filtered dataframe: ['SUWAŁKI' 'SZCZECIN' 'KASPROWY WIERCH']

Fitting 3 folds for each of 50 candidates, totalling 150 fits


30 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Python\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Python\lib\site-packages\sklearn\linear_model\_logistic.py", line 1589, in fit
    fold_coefs_ = Parallel(
  File "D:\Python\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "D:\Python\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "D:\Python\lib\site-packages\joblib\parallel.py", line 779, in _disp

Found hyperparameters: {'C': 0.55, 'max_iter': 10000, 'solver': 'saga'}
Score [%]: 0.94
F1 Score [%]: [('KASPROWY WIERCH', 1.0), ('SUWAŁKI', 0.9), ('SZCZECIN', 0.9)]


#### A.1. Quick summary
- In case of completely different climates, logistic regression works really well - **mean accuracy: 94%; F1 Score: above 90% for every station**
- I will use above hyperparameters for other sets of stations

### A.2. KASPROWY WIERCH, SUWAŁKI, POZNAŃ: these are also 3 stations with a different climates, but the climate of POZNAŃ is not as oceanic as the climate of SZCZECIN

In [713]:
selected_stations = ['KASPROWY WIERCH', 'SUWAŁKI', 'POZNAŃ']

LR.select_stations(selected_stations)
LR.create_train_and_test_set()
LR.standardize_data()

LR.fit()
LR.evaluate()

Unique stations in the filtered dataframe: ['SUWAŁKI' 'POZNAŃ' 'KASPROWY WIERCH']

Score [%]: 0.86
F1 Score [%]: [('KASPROWY WIERCH', 1.0), ('SUWAŁKI', 0.79), ('POZNAŃ', 0.79)]


#### A.2. Quick summary

- In this set of stations, the results are still good - not as good as in the A.1., but still
- The result is worse a little, because the station in POZNAŃ and the station in SUWAŁKI are not as different as it was in the previous example

### A.3. KASPROWY WIERCH, WARSZAWA, POZNAŃ: the stations in WARSZAWA and POZNAŃ have more similar climates, which should be noticable in the model performance

In [717]:
selected_stations = ['KASPROWY WIERCH', 'WARSZAWA', 'POZNAŃ']

LR.select_stations(selected_stations)
LR.create_train_and_test_set()
LR.standardize_data()

LR.fit()
LR.evaluate()

Unique stations in the filtered dataframe: ['POZNAŃ' 'WARSZAWA' 'KASPROWY WIERCH']

Score [%]: 0.71
F1 Score [%]: [('KASPROWY WIERCH', 1.0), ('WARSZAWA', 0.56), ('POZNAŃ', 0.57)]


#### A.3. Quick summary

- As expected, the results are much worse due to high similarity of climates in POZNAŃ and WARSZAWA

### A.4. WROCŁAW, WARSZAWA, POZNAŃ: climate differences in the given 3 stations are not as clear as in the previous cases - the model performance should be poor

In [718]:
selected_stations = ['WROCŁAW', 'WARSZAWA', 'POZNAŃ']

LR.select_stations(selected_stations)
LR.create_train_and_test_set()
LR.standardize_data()

LR.fit()
LR.evaluate()

Unique stations in the filtered dataframe: ['POZNAŃ' 'WARSZAWA' 'WROCŁAW']

Score [%]: 0.41
F1 Score [%]: [('WROCŁAW', 0.47), ('WARSZAWA', 0.18), ('POZNAŃ', 0.48)]


#### A.4. Quick summary

- As expected, the model performance is really poor due to high similarity of climates in the given stations
- The model performance is not much better than random picking from the set of 3 stations

# B. 6 stations

In [721]:
selected_stations = ['KASPROWY WIERCH', 'WROCŁAW', 'SUWAŁKI', 'POZNAŃ', 'WARSZAWA', 'SZCZECIN']

LR.select_stations(selected_stations)
LR.create_train_and_test_set()
LR.standardize_data()

LR.fit()
LR.evaluate()

Unique stations in the filtered dataframe: ['SUWAŁKI' 'SZCZECIN' 'POZNAŃ' 'WARSZAWA' 'WROCŁAW' 'KASPROWY WIERCH']

Score [%]: 0.51
F1 Score [%]: [('KASPROWY WIERCH', 1.0), ('WROCŁAW', 0.33), ('SUWAŁKI', 0.58), ('POZNAŃ', 0.27), ('WARSZAWA', 0.14), ('SZCZECIN', 0.61)]


In [728]:
LR.df.station.value_counts()

POZNAŃ             140256
WARSZAWA           140256
WROCŁAW            140256
KASPROWY WIERCH    140256
SZCZECIN           140255
SUWAŁKI            140252
Name: station, dtype: int64

In [725]:
LR.df.isnull().any()

station    False
month      False
hour       False
temp       False
ws         False
slp        False
dtype: bool

#### B. Quick summary

- The more different climate is, the highest F1 Score it gets - i.e. KASPROWY WIERCH's climate is unique (it's an alpine station), so the model does not have problems with recognizing this station
- The less different climate is, the lowest F1 Score it gets - i.e. WARSZAWA, POZNAŃ and WROCŁAW have relatively similar climates, so the model does have problems with differentiate these stations
- SZCZECIN and SUWAŁKI climates are marked by relatively different climates comparing to the other stations, so the model has much less problems with recognition

# C. All stations from the dataset

As it can be seen in the above sections - the more unique climate of the station is, the better performance linear regression model gets. KASPROWY WIERCH is an alpine station, so there is no doubt about it's uniqueness.

It is worth to note that the more stations in the training dataset are, then a single station has less probability to be unique. **What will happen if we extend the dataset? What will happen if we add another alpine-like stations? The model performance should be much weaker and it should be more problematic to recognize KASPROWY WIERCH.**

### Data preparation

In [5]:
df = pd.read_csv('meteo_data.csv', index_col=0)

In [6]:
stations = ['KASPROWY WIERCH', 'WROCŁAW', 'SUWAŁKI', 'POZNAŃ', 'WARSZAWA', 'SZCZECIN', 'ŚNIEŻKA', 'HEL', 'ZAKOPANE', 'CHOJNICE']
df = df[df['station'].isin(stations)]

In [7]:
df.isna().any()

station    False
month      False
hour       False
temp       False
ws         False
slp        False
dtype: bool

In [8]:
df.describe().round(2)

Unnamed: 0,month,hour,temp,ws,slp
count,1402554.0,1402554.0,1402554.0,1402554.0,1402554.0
mean,6.52,11.5,7.41,4.41,957.47
std,3.45,6.92,8.97,3.94,76.89
min,1.0,0.0,-28.5,0.0,0.0
25%,4.0,6.0,1.1,2.0,918.3
50%,7.0,12.0,7.1,3.0,996.8
75%,10.0,17.75,14.0,5.0,1007.9
max,12.0,23.0,37.7,60.0,1048.0


The station air pressure (slp) can not be equal to 0 - if it is, it means that there is a missing value

In [10]:
(df.slp == 0).sum()

133

In [11]:
df.shape

(1402554, 6)

In [12]:
df = df[df.slp > 0.1]

In [13]:
df.describe().round(2)

Unnamed: 0,month,hour,temp,ws,slp
count,1402421.0,1402421.0,1402421.0,1402421.0,1402421.0
mean,6.52,11.5,7.41,4.41,957.56
std,3.45,6.92,8.97,3.94,76.33
min,1.0,0.0,-28.5,0.0,759.3
25%,4.0,6.0,1.1,2.0,918.3
50%,7.0,12.0,7.1,3.0,996.8
75%,10.0,17.0,14.0,5.0,1007.9
max,12.0,23.0,37.7,60.0,1048.0


### Fitting

In [18]:
LR = LRModel(df)
selected_stations = list(df.station.unique())

LR.select_stations(selected_stations)
LR.create_train_and_test_set()
LR.standardize_data()

LR.fit(solver='saga', max_iter=10000, C=0.55)
LR.evaluate()

Unique stations in the filtered dataframe: ['HEL' 'SUWAŁKI' 'SZCZECIN' 'CHOJNICE' 'POZNAŃ' 'WARSZAWA' 'WROCŁAW'
 'ŚNIEŻKA' 'ZAKOPANE' 'KASPROWY WIERCH']

Score [%]: 0.52
F1 Score [%]: [('HEL', 0.36), ('SUWAŁKI', 0.43), ('SZCZECIN', 0.41), ('CHOJNICE', 0.26), ('POZNAŃ', 0.22), ('WARSZAWA', 0.12), ('WROCŁAW', 0.3), ('ŚNIEŻKA', 1.0), ('ZAKOPANE', 1.0), ('KASPROWY WIERCH', 1.0)]


#### C. Quick Summary

- The interesting thing is that adding more alpine-like stations (ŚNIEŻKA, ZAKOPANE) did not effect the model performance for KASPROWY WIERCH
- What is more, the new alpine-like stations also have high F1 Score - all of them scored 100%
- The new alpine-like stations seems to be again highly unique in comparision to the other stations. **One of the model variable is a station air pressure (not standarized to the sea level) - this may make it easy to guess if the given data comes from one of the alpine-like stations.** On the other hand, the air pressure variable is not that helpful in case of non-alpine stations.
- The lowest F1 Score got POZNAŃ, CHOJNICE, WARSZAWA, WROCŁAW (below 30%). These stations are the least unique and it may be impossible to guess from which of the given stations the data comes from - for a human-being, it would be also challenging
- Relatively low F1 Score got also HEL, SZCZECIN and SUWAŁKI; however these stations have some unique characteristics which make them easier to guess - HEL is a marine station, SZCZECIN's climate is more oceanic and SUWAŁKI's climate is more continental 

# Summary

- Logistic regression works well for specific stations, especially when pretty unique stations are chosen
- Logistic regression have problems with recognition when there are stations with relatively similar climates - it is not surprising, a climatologist would also have problems with recognizing the stations properly on the basis of chosen variables
- To improve the model performance, we could use more data for each station and use more variables (i.e. precipitation, cloud cover, wind direction etc.). Changing the model hyperparameters should not improve the model performance by much - the problem here is the amount of provided data and the data's nature