In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


# Function to create sliding windows
def create_sliding_windows(data, window_size=10, horizon=3, target_column='AMBROSIA',
                           drop_incomplete=False):
    """Cut a row-ordered DataFrame into overlapping input windows and forecast labels.

    A window starts at every row offset ``0 .. len(data) - window_size - 1``
    (stride 1). Its label is the ``horizon`` values of ``target_column`` that
    immediately follow the window.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in chronological order; must contain ``target_column``.
    window_size : int, default 10
        Number of consecutive rows in each input window.
    horizon : int, default 3
        Number of future target values in each label.
    target_column : str, default 'AMBROSIA'
        Column whose future values form the label.
    drop_incomplete : bool, default False
        Windows near the end of ``data`` have fewer than ``horizon`` future
        rows. With ``False`` (the historical behaviour) their labels are
        right-padded with zeros, i.e. the missing targets are *made up* as 0.
        With ``True`` those windows are skipped so every label is real data.

    Returns
    -------
    (windows, labels) : tuple of lists
        ``windows`` holds DataFrame slices of ``window_size`` rows;
        ``labels`` holds 1D arrays of length ``horizon``. Both are empty when
        ``data`` has no more than ``window_size`` rows.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``data``.
    """
    windows = []
    labels = []

    # Pull the target out once instead of re-slicing the frame on every iteration
    target = data[target_column].to_numpy()

    for start in range(len(data) - window_size):
        end = start + window_size
        label = target[end:end + horizon]

        if len(label) < horizon:
            if drop_incomplete:
                # Every later window has an even shorter future, so stop here
                break
            # Zero-fill the missing future values to keep a fixed label length
            label = np.pad(label, (0, horizon - len(label)))

        windows.append(data.iloc[start:end])
        labels.append(label)

    return windows, labels

# Training CSVs are read from data/1.csv through data/18.csv
folder_path = 'data/'

# Columns kept from every file; 'AMBROSIA' is also the forecast target.
# Defined once because the list is the same for every file.
relevant_columns = ['AMBROSIA', 'tavg', 'wspd', 'pres','BG','VS','KS','KG','NIS','PO','SU','year','month','day']

# Accumulators for the windows and labels of all files
all_windows = []
all_labels = []

# range(1, 19) covers the 18 files 1.csv ... 18.csv
for i in range(1, 19):
    file_path = f'{folder_path}{i}.csv'
    df = pd.read_csv(file_path)

    # Keep only the relevant columns, in a fixed order
    df = df[relevant_columns]

    # Windows are built per file, so no window spans two different files
    windows, labels = create_sliding_windows(df)

    all_windows.extend(windows)
    all_labels.extend(labels)

# Stack into arrays: X is (samples, window rows, columns), y is (samples, label length)
X = np.array(all_windows)
y = np.array(all_labels)



In [2]:
# Show the input and label array shapes, one per line
print(X.shape, y.shape, sep='\n')


(4596, 10, 14)
(4596, 3)


In [3]:
import pandas as pd

# Load the test CSV file
file_path = 'data/folder/pollen_alltest.csv'  # Replace with the actual path
df_test = pd.read_csv(file_path)

# Keep the same columns, in the same order, as the training windows
relevant_columns = ['AMBROSIA', 'tavg', 'wspd', 'pres','BG','VS','KS','KG','NIS','PO','SU','year','month','day']
df_test = df_test[relevant_columns]

# Rows per test sequence; must equal the window_size used for the training windows
test_window_size = 10

# A short trailing chunk would make np.array() build a ragged array and would
# break the later reshape, so fail early with an explicit message instead.
if len(df_test) % test_window_size != 0:
    raise ValueError(
        f'{file_path} has {len(df_test)} rows, which is not a multiple of '
        f'{test_window_size}; cannot split it into equal-length sequences'
    )

# Split the rows into consecutive, non-overlapping sequences (positional slicing)
test_batches = [
    df_test.iloc[i:i + test_window_size]
    for i in range(0, len(df_test), test_window_size)
]

# Stack into a 3D array: (sequences, rows per sequence, columns)
X_test_batches = np.array(test_batches)
print(X_test_batches.shape)

# X_test_batches is now ready to be fed to the model for testing






(112, 10, 14)


In [4]:
from sklearn.preprocessing import MinMaxScaler

# The scaler expects 2D input, so collapse every window into a single row
X_train_flat = X.reshape(len(X), -1)
X_test_flat = X_test_batches.reshape(len(X_test_batches), -1)

# Learn the per-column min/max from the training rows, scale them,
# and restore the original 3D layout in one step
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_flat).reshape(X.shape)

# Apply the already-fitted scaler to the test rows (no refitting), then restore 3D
X_test_scaled = scaler.transform(X_test_flat).reshape(X_test_batches.shape)

print(X_train_scaled.shape, X_test_scaled.shape, sep='\n')


(4596, 10, 14)
(112, 10, 14)


In [5]:
# Sanity check: shape of the label array (rows = training windows, columns = forecast steps)
print(y.shape)

(4596, 3)


In [6]:
# Is there any NaN in the scaled training inputs (first line) or in the labels y (second line)?
print(np.isnan(X_train_scaled).any(), np.isnan(y).any(), sep='\n')

False
False


In [7]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# X_train_scaled is 3D (samples, window rows, columns); XGBoost needs one flat row per sample
X_train_flat = X_train_scaled.reshape(len(X_train_scaled), -1)

# Hold out 20% of the windows for validation; the fixed seed keeps the split repeatable.
# NOTE(review): the windows overlap (stride 1), so a random split puts near-duplicate
# rows on both sides - a chronological or per-file split may give a more honest estimate.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_flat, y, test_size=0.2, random_state=42
)

# Hyperparameters of the regressor, gathered in one place.
# NOTE(review): the recorded run shows train MAE ~0.03 vs validation MAE ~9.0, which
# points to heavy overfitting; max_depth=50 is a likely cause - consider lowering it.
xgb_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.5,
    'learning_rate': 0.05,
    'max_depth': 50,
    'alpha': 3,
    'n_estimators': 800,
}
model_xgb = xgb.XGBRegressor(**xgb_params)

# Fit on the training portion only
model_xgb.fit(X_train, y_train)

# Error on the data the model was fitted on
y_train_pred = model_xgb.predict(X_train)
mae_train = mean_absolute_error(y_train, y_train_pred)
print(f'Mean Absolute Error on Training Data: {mae_train}')

# Error on the held-out validation portion
y_val_pred = model_xgb.predict(X_val)
mae_val = mean_absolute_error(y_val, y_val_pred)
print(f'Mean Absolute Error on Validation Data: {mae_val}')


Mean Absolute Error on Training Data: 0.03473951479661653
Mean Absolute Error on Validation Data: 9.008944948487269


In [8]:
# Flatten each test sequence into one row, the same layout the model was trained on
predictions = model_xgb.predict(X_test_scaled.reshape((X_test_scaled.shape[0], -1)))
print(predictions)

# Convert to whole numbers. A plain astype(int) truncates toward zero (0.96 -> 0),
# which biases every forecast downward, so round to the nearest integer instead.
# Negative outputs are clamped to 0 first - presumably the target is a non-negative
# pollen count, so a negative forecast is not meaningful (confirm with the data owner).
predictions = np.rint(np.clip(predictions, 0, None)).astype(int)
print(predictions)

[[-8.61565489e-03 -7.71644199e-03 -3.67451971e-03]
 [-1.40096166e-03 -2.42642686e-03 -3.87306930e-03]
 [ 1.38437643e-03  4.62788111e-03  6.78497273e-03]
 [ 4.86460514e-02  4.38734144e-03  2.42167432e-03]
 [-9.01357830e-03 -8.13454855e-03 -4.74233460e-03]
 [ 3.96933526e-01  2.98931539e-01  4.85181957e-01]
 [ 2.92558342e-01  5.94412744e-01  9.56171095e-01]
 [ 4.32116091e-01  1.28534436e+00  1.96622670e-01]
 [ 7.16532469e-01  7.98790991e-01  8.08554351e-01]
 [ 1.62251842e+00  1.39751482e+00  5.38352394e+00]
 [ 5.22284355e+01  3.92543983e+01  6.54410172e+01]
 [ 7.39061050e+01  1.65700165e+02  1.18122566e+02]
 [ 1.94387726e+02  1.61696899e+02  1.51848770e+02]
 [ 5.12457561e+00  3.73092413e+00  8.65286541e+00]
 [ 2.42821193e+00  3.91876030e+00  3.09155226e+00]
 [ 2.52779984e+00  1.15300524e+00  1.14753819e+00]
 [ 5.71001461e-03  1.56972595e-02  1.89518575e-02]
 [-2.07950058e-03 -5.26531972e-03 -2.89033097e-03]
 [ 3.91605636e-03  1.21627627e-02  9.57107078e-03]
 [ 5.53755425e-02  1.29578561e-

In [9]:
# Read the test CSV again and keep its distinct batch ids, in order of first appearance
ts = pd.read_csv('data/folder/pollen_alltest.csv')['batch_id'].unique()

In [10]:
# Shapes first, then types, of the predictions and the batch ids - one value per line
print(predictions.shape, ts.shape, type(predictions), type(ts), sep='\n')


(112, 3)
(112,)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [11]:
# Turn the 1D batch ids into a single column so they can sit next to the predictions
arr1_reshaped = ts.reshape(-1, 1)

# Place the id column and the prediction columns side by side (column-wise join)
result = np.hstack((arr1_reshaped, predictions))

In [12]:
# Write the header, then one comma-separated line per row of `result`
with open("nesto29.csv", 'w') as f:
    f.write('batch_id,1 day prediction,2 days prediction,3 days prediction\n')
    for row in result:
        f.write(','.join(str(value) for value in row))
        f.write('\n')