In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# for work with data
import pandas as pd
import numpy as np
# graphs
import matplotlib.pyplot as plt
# for training and training's preparing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
# metrics
from sklearn.metrics import mean_squared_error
# other
import warnings
warnings.filterwarnings('ignore')

# 1. Import Data

In [None]:
# Load the competition files: training data, test data and the sample
# submission (whose 'emission' column is overwritten with predictions later).
train, test, samsub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e20/train.csv',
        '/kaggle/input/playground-series-s3e20/test.csv',
        '/kaggle/input/playground-series-s3e20/sample_submission.csv',
    )
)

I don't see much point in including the other columns, because some of them have missing values in as many as 20% of the rows, so I'll keep only the "clean" columns.

In [None]:
# Columns used as model inputs; the training frame additionally keeps the
# 'emission' target.
feature_cols = ['latitude', 'longitude', 'year', 'week_no']
train_df = train[feature_cols + ['emission']]
test_df = test[feature_cols]

# Preview the first three rows of each reduced dataset.
display(train_df.head(3), test_df.head(3))

# 2. Data Preprocessing

The graphs for the three years look very similar. To understand how much the data actually differ from year to year, we need to check the trend of this time series; this in turn will help us choose the optimal model for training.

In [None]:
# Draw one stacked panel per training year so the emission series of the
# different years can be compared by eye.
years = (2019, 2020, 2021)
fig, ax = plt.subplots(len(years))
for panel, year in zip(ax, years):
    # The Series is plotted against its own index, i.e. the row labels of train_df.
    panel.plot(train_df[train_df.year == year]['emission'])
    # Label every panel so the figure can be read without looking at the code.
    panel.set_title(f'Emission in {year}')
    panel.set_xlabel('Row index')
    panel.set_ylabel('Emission')
# Keep titles and tick labels of neighbouring panels from overlapping.
plt.tight_layout()

The MSE between any two years is no more than 400, which means that the series are similar not only visually but also numerically. Note that 2020 was a crisis year because of the coronavirus, which explains why the MSE values involving 2020 are comparatively large; apart from that, there are no notable fluctuations from year to year. Therefore, for training I will use either both 2019 and 2021 or only 2021, depending on the results of experiments.

In [None]:
# Extract the emission series of each year once, then compare them pairwise.
# NOTE(review): the comparison is positional, so this presumably relies on
# every year having the same number of rows in the same order — confirm.
emission_2019 = train_df.loc[train_df.year == 2019, 'emission']
emission_2020 = train_df.loc[train_df.year == 2020, 'emission']
emission_2021 = train_df.loc[train_df.year == 2021, 'emission']
print('MSE(2019/2020): ', mean_squared_error(emission_2019, emission_2020))
print('MSE(2019/2021): ', mean_squared_error(emission_2019, emission_2021))
print('MSE(2020/2021): ', mean_squared_error(emission_2020, emission_2021))

The source data also contain week numbers, but the test sample covers fewer distinct weeks than the training sample, as can be seen below.

In [None]:
# Count the distinct week numbers present in each dataset.
weeks_per_dataset = {
    'Train': len(train_df['week_no'].unique()),
    'Test': len(test_df['week_no'].unique()),
}
# Show the two counts side by side as a bar chart.
fig, ax = plt.subplots()
ax.bar(list(weeks_per_dataset.keys()), height=list(weeks_per_dataset.values()))
ax.set_title('How many weeks in dataset')
plt.show()

So we will drop weeks 49, 50, 51 and 52 from the training data and keep all the others (the cell below also restricts the training data to 2021). This should improve the predictions for the remaining weeks.

In [None]:
# Keep only the rows for the year 2021.
train_df = train_df[train_df.year == 2021]
# Drop the rows that are useless for training (weeks 49-52). A single mask
# built from the same frame it indexes keeps the mask and the frame aligned,
# so pandas never has to reindex a boolean key against a shorter frame.
dropped_weeks = [49, 50, 51, 52]
train_df = train_df[~train_df['week_no'].isin(dropped_weeks)]

Now we are preparing the data for training.

In [None]:
# Separate the model inputs from the 'emission' target, then hold out 10% of
# the rows for evaluation. The fixed seed makes the split repeatable.
features = train_df.drop(columns=['emission'])
target = train_df['emission']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.1, random_state=42
)

# 3. Model Training

For training, I will choose RandomForestRegressor because, firstly, half of the columns in the data are categorical and, secondly, RandomForestRegressor is great for values with trends.

In [None]:
# Create the regression model. The seed is fixed (same value as the
# train/validation split) so that the forest's randomness is repeatable and
# the notebook gives the same scores and submission on every run.
reg = RandomForestRegressor(random_state=42)
# Train the regression model on the training part of the split.
reg.fit(X_train, y_train)

# 4. Prediction

To evaluate the model before predicting on the real test sample, we use X_test, the 10% of the training data that was held out by the split above.

In [None]:
# Predict emissions for the held-out rows.
preds = reg.predict(X_test)
# Print the MSE on the held-out rows. Arguments follow sklearn's documented
# order (y_true, y_pred); the value is the same either way because the
# metric is symmetric.
print(mean_squared_error(y_test, preds))
# Plot the predictions in the order of the rows in X_test, with a title and
# axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
pd.Series(preds).plot(ax=ax)
ax.set_title('Predicted emission on the held-out split')
ax.set_xlabel('Position in X_test')
ax.set_ylabel('Predicted emission');

As we can see from the chart: the short-term trend and periodicity are preserved. Now let's prepare a table with predictions and save them to the file 'submission.csv'.

In [None]:
# NOTE(review): a constant offset of 1 is added to every prediction before
# rounding; nothing in this notebook explains why — confirm it is intentional.
prediction_offset = 1
# Predict emissions for the competition test set.
sub = reg.predict(test_df)
# Overwrite the 'emission' column of the sample submission with the shifted
# predictions, rounded to 6 decimal places.
# Assumes the rows of test.csv are in the same order as sample_submission.csv — TODO confirm.
samsub['emission'] = (sub + prediction_offset).round(6)
# Save the submission file without the index column.
samsub.to_csv('submission.csv', index=False)

### Thanks for your attention!