In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'temperature-forecasting-for-localized-weather-stat:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F84975%2F9562896%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240923%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240923T160417Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D320183542c8be6c6c08b1da5fa29de2f0d9d2572b2b5b27f689c1536decf24b9e462568825f8b0e9db9733c5f34fda84c747bd1ffd30d94b6852a0560514895c3c6b687af0afafa5bc3c8a73aba809f15b6e8671d8aa0743cf2888a5287520aae87241eb6316842fa8515b8d3c9cc8aaf2c77b8364abccd922c378c236254b986f33c961d1e0e53d3e70fed8f00e9e74e6d803da85db151311e1274180aa91d0a02368c3590d16dc206328928dda29ae05d2be8453ba99e63c0d906945e124302729bf80a265549cac93edcf3e20eec09b53fe0a8643e23d7dc47f3c57f847104a894a9c635d837c8041acfdfa92ac70858730374e461fd600d31d898e9c7c24'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the Kaggle input/working directories as ../input and ../working.
# A link left over from an earlier run is fine, so FileExistsError is ignored.
for link_target, link_name in ((KAGGLE_INPUT_PATH, 'input'), (KAGGLE_WORKING_PATH, 'working')):
  try:
    os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
  except FileExistsError:
    pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
# Each comma-separated entry has the form "<directory>:<percent-encoded URL>".
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so the pair still unpacks if the URL part
    # ever contains a raw colon.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be absent; in that case the byte
            # counter is still shown but the progress bar stays empty.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # replace the imported module and break the next tar source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must be handled before OSError, of which HTTPError is a subclass.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below /kaggle/input, then print one per line.
input_file_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/temperature-forecasting-for-localized-weather-stat/sample_submission.csv
/kaggle/input/temperature-forecasting-for-localized-weather-stat/train.csv
/kaggle/input/temperature-forecasting-for-localized-weather-stat/test.csv


In [None]:
from warnings import filterwarnings
from sklearn.pipeline import Pipeline

filterwarnings(action='ignore', category=FutureWarning)

In [None]:
train_df = pd.read_csv('/kaggle/input/temperature-forecasting-for-localized-weather-stat/train.csv')

In [None]:
train_df.head()

Unnamed: 0,id,mac,station_name,tambon_code,tambon_namt,amphur_code,amphur_namt,province_code,province_namt,latitude,longitude,time,humid,light,pm10,pm2.5,rainfall,wind_direct,wind_speed,temp
0,0,3C71BF164F90,โรงเรียนท่าข้ามวิทยา,860206,ท่าข้าม,8602,ท่าแซะ,86,ชุมพร,10.579849,99.113146,2022-07-08 19:00:00+07:00,89.9,5.0,6.0,5.0,0.0,180.0,6.1,26.925
1,1,3C71BF1B1E28,โรงเรียนสรรพวิทยาคม,630601,แม่สอด,6306,แม่สอด,63,ตาก,16.71299,98.573417,2022-07-18 00:00:00+07:00,80.8,0.0,8.0,7.0,0.0,135.0,1.4,27.25
2,2,30AEA4F7AE44,โรงเรียนสามเสนนอก(ประชาราษฎร์อนุกูล),102601,ดินแดง,1026,ดินแดง,10,กรุงเทพมหานคร,13.777972,100.569662,2022-06-19 11:00:00+07:00,60.8,51.0,,,0.0,270.0,3.9,34.075
3,3,3C71BF1B1E28,โรงเรียนสรรพวิทยาคม,630601,แม่สอด,6306,แม่สอด,63,ตาก,16.71299,98.573417,2022-06-18 03:00:00+07:00,87.1,0.0,6.0,5.0,0.0,135.0,0.0,26.5
4,4,807D3AF57920,บ้านนาสะแบง_2,380704,นาสะแบง,3807,ศรีวิไล,38,บึงกาฬ,18.142499,103.806521,2022-07-28 04:00:00+07:00,86.4,3.0,3.0,1.0,0.0,22.5,0.0,28.6


In [None]:
train_df.columns

Index(['id', 'mac', 'station_name', 'tambon_code', 'tambon_namt',
       'amphur_code', 'amphur_namt', 'province_code', 'province_namt',
       'latitude', 'longitude', 'time', 'humid', 'light', 'pm10', 'pm2.5',
       'rainfall', 'wind_direct', 'wind_speed', 'temp'],
      dtype='object')

In [None]:
train_df[['time', 'humid', 'light', 'pm10', 'pm2.5',
       'rainfall', 'wind_direct', 'wind_speed', 'temp']].describe()

Unnamed: 0,humid,light,pm10,pm2.5,rainfall,wind_direct,wind_speed,temp
count,9543.0,9543.0,4791.0,4791.0,9543.0,9543.0,9543.0,9543.0
mean,80.468417,33.329771,7.640367,6.435191,0.128555,160.494341,5.468364,29.628104
std,12.444779,34.501063,9.043012,9.017724,1.052061,100.883976,217.140148,4.59171
min,41.1,0.0,1.0,0.0,0.0,0.0,0.0,17.775
25%,71.1,0.0,3.0,2.0,0.0,45.0,0.0,26.1
50%,82.6,17.0,6.0,4.0,0.0,180.0,1.4,28.575
75%,90.5,68.0,10.0,9.0,0.0,225.0,3.3,32.7125
max,100.0,96.0,368.0,368.0,28.2,337.5,15094.3,51.1


Outliers: Light, PM10, PM2.5

In [None]:
tmp_train = train_df.copy()

In [None]:
def remove_outliers(df, columns=None, factor=1.5):
    """Drop rows that fall outside the IQR fence of any checked column.

    For every checked column the fence is
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` with the quartiles computed on
    the unfiltered input. A row is kept only if it lies inside the fence of
    all checked columns.

    Things to be aware of:
      * A row whose value is NaN in a checked column is dropped, because NaN
        never compares as inside the fence.
      * A column whose IQR is 0 keeps only rows equal to its quartile value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str, optional
        Columns to check. Defaults to every numeric column of ``df``.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fence.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass every fence, with their original index.
    """
    if columns is None:
        columns = df.select_dtypes(include=['number']).columns

    # Positional boolean mask, so a non-unique index cannot cause misalignment.
    keep = np.ones(len(df), dtype=bool)
    for column in columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        # between() is inclusive on both ends and evaluates to False for NaN.
        keep &= df[column].between(lower_bound, upper_bound).to_numpy()
    return df[keep]

tmp_train = remove_outliers(tmp_train)
tmp_train

Unnamed: 0,id,mac,station_name,tambon_code,tambon_namt,amphur_code,amphur_namt,province_code,province_namt,latitude,longitude,time,humid,light,pm10,pm2.5,rainfall,wind_direct,wind_speed,temp
0,0,3C71BF164F90,โรงเรียนท่าข้ามวิทยา,860206,ท่าข้าม,8602,ท่าแซะ,86,ชุมพร,10.579849,99.113146,2022-07-08 19:00:00+07:00,89.9,5.0,6.0,5.0,0.0,180.0,6.1,26.925
1,1,3C71BF1B1E28,โรงเรียนสรรพวิทยาคม,630601,แม่สอด,6306,แม่สอด,63,ตาก,16.712990,98.573417,2022-07-18 00:00:00+07:00,80.8,0.0,8.0,7.0,0.0,135.0,1.4,27.250
3,3,3C71BF1B1E28,โรงเรียนสรรพวิทยาคม,630601,แม่สอด,6306,แม่สอด,63,ตาก,16.712990,98.573417,2022-06-18 03:00:00+07:00,87.1,0.0,6.0,5.0,0.0,135.0,0.0,26.500
4,4,807D3AF57920,บ้านนาสะแบง_2,380704,นาสะแบง,3807,ศรีวิไล,38,บึงกาฬ,18.142499,103.806521,2022-07-28 04:00:00+07:00,86.4,3.0,3.0,1.0,0.0,22.5,0.0,28.600
5,5,807D3AF57920,บ้านนาสะแบง_2,380704,นาสะแบง,3807,ศรีวิไล,38,บึงกาฬ,18.142499,103.806521,2022-05-03 20:00:00+07:00,59.4,2.0,4.0,2.0,0.0,45.0,1.6,27.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9521,9521,807D3AF57920,บ้านนาสะแบง_2,380704,นาสะแบง,3807,ศรีวิไล,38,บึงกาฬ,18.142499,103.806521,2022-07-10 22:00:00+07:00,88.0,0.0,2.0,1.0,0.0,45.0,3.6,26.700
9522,9522,3C71BF17CDBC,โรงเรียนบ้านนา,550404,สถาน,5504,นาน้อย,55,น่าน,18.241106,100.690577,2022-07-11 19:00:00+07:00,87.4,0.0,7.0,5.0,0.0,225.0,0.0,27.700
9530,9530,3C71BF164F90,โรงเรียนท่าข้ามวิทยา,860206,ท่าข้าม,8602,ท่าแซะ,86,ชุมพร,10.579849,99.113146,2022-05-12 18:00:00+07:00,87.0,24.0,15.0,14.0,0.0,180.0,2.7,28.575
9531,9531,3C71BF15DB04,บ้านสำโรงเกียรติ_2,330802,บักดอง,3308,ขุนหาญ,33,ศรีสะเกษ,14.533026,104.495172,2022-05-14 05:00:00+07:00,91.2,0.0,10.0,9.0,0.0,315.0,0.7,26.975


In [None]:
tmp_train.head()

Unnamed: 0,id,mac,station_name,tambon_code,tambon_namt,amphur_code,amphur_namt,province_code,province_namt,latitude,longitude,time,humid,light,pm10,pm2.5,rainfall,wind_direct,wind_speed,temp
0,0,3C71BF164F90,โรงเรียนท่าข้ามวิทยา,860206,ท่าข้าม,8602,ท่าแซะ,86,ชุมพร,10.579849,99.113146,2022-07-08 19:00:00+07:00,89.9,5.0,6.0,5.0,0.0,180.0,6.1,26.925
1,1,3C71BF1B1E28,โรงเรียนสรรพวิทยาคม,630601,แม่สอด,6306,แม่สอด,63,ตาก,16.71299,98.573417,2022-07-18 00:00:00+07:00,80.8,0.0,8.0,7.0,0.0,135.0,1.4,27.25
3,3,3C71BF1B1E28,โรงเรียนสรรพวิทยาคม,630601,แม่สอด,6306,แม่สอด,63,ตาก,16.71299,98.573417,2022-06-18 03:00:00+07:00,87.1,0.0,6.0,5.0,0.0,135.0,0.0,26.5
4,4,807D3AF57920,บ้านนาสะแบง_2,380704,นาสะแบง,3807,ศรีวิไล,38,บึงกาฬ,18.142499,103.806521,2022-07-28 04:00:00+07:00,86.4,3.0,3.0,1.0,0.0,22.5,0.0,28.6
5,5,807D3AF57920,บ้านนาสะแบง_2,380704,นาสะแบง,3807,ศรีวิไล,38,บึงกาฬ,18.142499,103.806521,2022-05-03 20:00:00+07:00,59.4,2.0,4.0,2.0,0.0,45.0,1.6,27.0


In [None]:
tmp_train.index = tmp_train['id']

In [None]:
tmp_train = tmp_train.drop(['id'], axis=1)

In [None]:
tmp_train.columns

Index(['mac', 'station_name', 'tambon_code', 'tambon_namt', 'amphur_code',
       'amphur_namt', 'province_code', 'province_namt', 'latitude',
       'longitude', 'time', 'humid', 'light', 'pm10', 'pm2.5', 'rainfall',
       'wind_direct', 'wind_speed', 'temp'],
      dtype='object')

In [None]:
cols = ['mac', 'time', 'humid', 'light', 'pm10', 'pm2.5',
       'wind_direct', 'wind_speed', 'temp']

tmp_train = tmp_train[cols]

In [None]:
# Derive calendar features from the timestamp and drop the raw column.
# A new frame is built with assign() instead of writing columns into
# tmp_train, which at this point is a column selection of another frame and
# would otherwise trigger SettingWithCopyWarning.
train_timestamps = pd.to_datetime(tmp_train['time'])
tmp_train = tmp_train.drop(columns=['time']).assign(
    hour=train_timestamps.dt.hour,
    day_of_week=train_timestamps.dt.dayofweek,
    month=train_timestamps.dt.month,
)

In [None]:
tmp_train.head()

Unnamed: 0_level_0,mac,humid,light,pm10,pm2.5,wind_direct,wind_speed,temp,hour,day_of_week,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,3C71BF164F90,89.9,5.0,6.0,5.0,180.0,6.1,26.925,19,4,7
1,3C71BF1B1E28,80.8,0.0,8.0,7.0,135.0,1.4,27.25,0,0,7
3,3C71BF1B1E28,87.1,0.0,6.0,5.0,135.0,0.0,26.5,3,5,6
4,807D3AF57920,86.4,3.0,3.0,1.0,22.5,0.0,28.6,4,3,7
5,807D3AF57920,59.4,2.0,4.0,2.0,45.0,1.6,27.0,20,1,5


In [None]:
tmp_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4143 entries, 0 to 9542
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mac          4143 non-null   object 
 1   humid        4143 non-null   float64
 2   light        4143 non-null   float64
 3   pm10         4143 non-null   float64
 4   pm2.5        4143 non-null   float64
 5   wind_direct  4143 non-null   float64
 6   wind_speed   4143 non-null   float64
 7   temp         4143 non-null   float64
 8   hour         4143 non-null   int32  
 9   day_of_week  4143 non-null   int32  
 10  month        4143 non-null   int32  
dtypes: float64(7), int32(3), object(1)
memory usage: 339.9+ KB


In [None]:
tmp_train.isna().sum()

mac            0
humid          0
light          0
pm10           0
pm2.5          0
wind_direct    0
wind_speed     0
temp           0
hour           0
day_of_week    0
month          0
dtype: int64

In [None]:
tmp_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4143 entries, 0 to 9542
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mac          4143 non-null   object 
 1   humid        4143 non-null   float64
 2   light        4143 non-null   float64
 3   pm10         4143 non-null   float64
 4   pm2.5        4143 non-null   float64
 5   wind_direct  4143 non-null   float64
 6   wind_speed   4143 non-null   float64
 7   temp         4143 non-null   float64
 8   hour         4143 non-null   int32  
 9   day_of_week  4143 non-null   int32  
 10  month        4143 non-null   int32  
dtypes: float64(7), int32(3), object(1)
memory usage: 339.9+ KB


In [None]:
# One-hot encode the station identifier: the indicator columns are appended
# to the frame in place of the original 'mac' column.
dummies = pd.get_dummies(tmp_train['mac'], prefix='mac')
tmp_train = pd.concat([tmp_train.drop(columns=['mac']), dummies], axis=1)

In [None]:
# Cast every boolean column (the one-hot indicators) to integer 0/1.
bool_cols = tmp_train.select_dtypes(include='bool')
tmp_train = tmp_train.astype({name: int for name in bool_cols.columns})

In [None]:
tmp_train.head()

Unnamed: 0_level_0,humid,light,pm10,pm2.5,wind_direct,wind_speed,temp,hour,day_of_week,month,mac_30AEA4F7AE44,mac_3C71BF15DB04,mac_3C71BF164F90,mac_3C71BF17CDBC,mac_3C71BF18CEA4,mac_3C71BF18EA64,mac_3C71BF1B1E28,mac_807D3AF57920
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,89.9,5.0,6.0,5.0,180.0,6.1,26.925,19,4,7,0,0,1,0,0,0,0,0
1,80.8,0.0,8.0,7.0,135.0,1.4,27.25,0,0,7,0,0,0,0,0,0,1,0
3,87.1,0.0,6.0,5.0,135.0,0.0,26.5,3,5,6,0,0,0,0,0,0,1,0
4,86.4,3.0,3.0,1.0,22.5,0.0,28.6,4,3,7,0,0,0,0,0,0,0,1
5,59.4,2.0,4.0,2.0,45.0,1.6,27.0,20,1,5,0,0,0,0,0,0,0,1


In [None]:
df = tmp_train.copy()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# 'temp' is the label; every other column of df is used as a feature.
target_col = 'temp'

cols = list(df.columns)
cols.remove(target_col)

X, y = df[cols], df[target_col]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np


# Seed the base estimator so that refitting with the same hyper-parameters
# produces the same forest.
rf = RandomForestRegressor(random_state=42)

# Hyper-parameter space sampled by the randomized search.
param_distributions = {
    'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
    'max_depth': [int(x) for x in np.linspace(10, 100, num=10)] + [None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# 100 sampled candidates x 3 folds = 300 fits. verbose=1 prints only the
# summary line instead of one log line per fit.
random_search_v3 = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions,
                                   n_iter=100, cv=3, verbose=1, random_state=42, n_jobs=-1, scoring='neg_root_mean_squared_error')

random_search_v3.fit(X_train, y_train)

print("Best parameters found: ", random_search_v3.best_params_)
# The scorer is the *negated root* mean squared error, so flipping the sign
# yields an RMSE (not an MSE).
print("Best CV RMSE found: ", -random_search_v3.best_score_)

# Score the refitted best model on the hold-out split, which was never seen
# during the search.
holdout_pred = random_search_v3.best_estimator_.predict(X_test)
holdout_rmse = float(np.sqrt(np.mean((np.asarray(y_test) - holdout_pred) ** 2)))
print("Hold-out RMSE: ", holdout_rmse)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=15, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   1.1s
[CV] END bootstrap=True, max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=15, n_estimators=200; total time=   2.1s
[CV] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=6, min_samples_split=5, n_estimators=500; total time=   6.8s
[CV] END bootstrap=True, max_depth=10, max_features=None, min_samples_leaf=6, min_samples_split=10, n_estimators=700; total time=   7.1s
[CV] END bootstrap=False, max_depth=10, max_features=log2, min_samples_leaf=6, min_samples_split=5, n_estimators=900; total time=   4.5s
[CV] END bootstrap=False, max_depth=70, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators

In [None]:
test_df = pd.read_csv('/kaggle/input/temperature-forecasting-for-localized-weather-stat/test.csv')

In [None]:
pm_df = test_df.copy()

In [None]:
# Use the row id as the index; set_index also removes 'id' from the columns.
pm_df = pm_df.set_index('id')

In [None]:
pm_df.head()

Unnamed: 0_level_0,mac,station_name,tambon_code,tambon_namt,amphur_code,amphur_namt,province_code,province_namt,latitude,longitude,time,humid,light,pm10,pm2.5,rainfall,wind_direct,wind_speed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9543,3C71BF1B1E28,โรงเรียนสรรพวิทยาคม,630601,แม่สอด,6306,แม่สอด,63,ตาก,16.71299,98.573417,2022-06-04 09:00:00+07:00,72.2,84.0,3.0,2.0,0.0,90.0,4.6
9544,3C71BF164F90,โรงเรียนท่าข้ามวิทยา,860206,ท่าข้าม,8602,ท่าแซะ,86,ชุมพร,10.579849,99.113146,2022-06-24 15:00:00+07:00,76.8,66.0,,,0.0,225.0,1.5
9545,807D3AF57920,บ้านนาสะแบง_2,380704,นาสะแบง,3807,ศรีวิไล,38,บึงกาฬ,18.142499,103.806521,2022-06-15 07:00:00+07:00,90.4,27.0,4.0,3.0,2.3,0.0,0.0
9546,3C71BF18CEA4,โรงเรียนหนองสูงสามัคคีวิทยา_2,490706,หนองสูงเหนือ,4907,หนองสูง,49,มุกดาหาร,16.494229,104.350891,2022-05-12 04:00:00+07:00,97.9,0.0,,,0.0,45.0,0.0
9547,3C71BF164F90,โรงเรียนท่าข้ามวิทยา,860206,ท่าข้าม,8602,ท่าแซะ,86,ชุมพร,10.579849,99.113146,2022-06-04 12:00:00+07:00,91.3,43.0,,,0.0,45.0,1.2


In [None]:
pm_df['mac'].unique()

array(['3C71BF1B1E28', '3C71BF164F90', '807D3AF57920', '3C71BF18CEA4',
       '3C71BF15DB04', '30AEA4F7AE44', '3C71BF17CDBC', '3C71BF18EA64'],
      dtype=object)

In [None]:
# Impute missing PM readings with the rounded mean of the *same station*.
# Filling the whole column inside a per-station loop would use the first
# station's mean for every gap and leave nothing for later stations to fill.
for pm_col in ['pm10', 'pm2.5']:
    station_mean = pm_df.groupby('mac')[pm_col].transform('mean')
    # A station with no reading at all has a NaN mean; fall back to the mean
    # over all stations. If the whole column is NaN it stays NaN.
    fill_values = station_mean.fillna(pm_df[pm_col].mean()).round()
    # Assign the result back rather than using a chained inplace fillna.
    pm_df[pm_col] = pm_df[pm_col].fillna(fill_values)

In [None]:
pm_df.isna().sum()

mac              0
station_name     0
tambon_code      0
tambon_namt      0
amphur_code      0
amphur_namt      0
province_code    0
province_namt    0
latitude         0
longitude        0
time             0
humid            0
light            0
pm10             0
pm2.5            0
rainfall         0
wind_direct      0
wind_speed       0
dtype: int64

In [None]:
test_data = pm_df

In [None]:
test_data

Unnamed: 0_level_0,mac,station_name,tambon_code,tambon_namt,amphur_code,amphur_namt,province_code,province_namt,latitude,longitude,time,humid,light,pm10,pm2.5,rainfall,wind_direct,wind_speed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
9543,3C71BF1B1E28,โรงเรียนสรรพวิทยาคม,630601,แม่สอด,6306,แม่สอด,63,ตาก,16.712990,98.573417,2022-06-04 09:00:00+07:00,72.2,84.0,3.0,2.0,0.0,90.0,4.6
9544,3C71BF164F90,โรงเรียนท่าข้ามวิทยา,860206,ท่าข้าม,8602,ท่าแซะ,86,ชุมพร,10.579849,99.113146,2022-06-24 15:00:00+07:00,76.8,66.0,10.0,9.0,0.0,225.0,1.5
9545,807D3AF57920,บ้านนาสะแบง_2,380704,นาสะแบง,3807,ศรีวิไล,38,บึงกาฬ,18.142499,103.806521,2022-06-15 07:00:00+07:00,90.4,27.0,4.0,3.0,2.3,0.0,0.0
9546,3C71BF18CEA4,โรงเรียนหนองสูงสามัคคีวิทยา_2,490706,หนองสูงเหนือ,4907,หนองสูง,49,มุกดาหาร,16.494229,104.350891,2022-05-12 04:00:00+07:00,97.9,0.0,10.0,9.0,0.0,45.0,0.0
9547,3C71BF164F90,โรงเรียนท่าข้ามวิทยา,860206,ท่าข้าม,8602,ท่าแซะ,86,ชุมพร,10.579849,99.113146,2022-06-04 12:00:00+07:00,91.3,43.0,10.0,9.0,0.0,45.0,1.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13629,3C71BF18EA64,บ้านนา_2,300903,กำปัง,3009,โนนไทย,30,นครราชสีมา,15.112831,102.052114,2022-05-13 09:00:00+07:00,62.3,78.0,10.0,9.0,0.0,225.0,8.6
13630,807D3AF57920,บ้านนาสะแบง_2,380704,นาสะแบง,3807,ศรีวิไล,38,บึงกาฬ,18.142499,103.806521,2022-06-25 14:00:00+07:00,69.6,88.0,2.0,1.0,0.0,0.0,0.0
13631,3C71BF17CDBC,โรงเรียนบ้านนา,550404,สถาน,5504,นาน้อย,55,น่าน,18.241106,100.690577,2022-05-18 21:00:00+07:00,95.3,0.0,10.0,9.0,0.0,0.0,0.0
13632,3C71BF18EA64,บ้านนา_2,300903,กำปัง,3009,โนนไทย,30,นครราชสีมา,15.112831,102.052114,2022-07-07 02:00:00+07:00,85.8,0.0,10.0,9.0,0.0,157.5,0.0


In [None]:
# One-hot encode the station identifier: the indicator columns are appended
# to the frame in place of the original 'mac' column.
dummies = pd.get_dummies(test_data['mac'], prefix='mac')
test_data = pd.concat([test_data.drop(columns=['mac']), dummies], axis=1)

# Cast every boolean column (the one-hot indicators) to integer 0/1.
bool_cols = test_data.select_dtypes(include='bool')
test_data = test_data.astype({name: int for name in bool_cols.columns})

In [None]:
# Derive hour / day-of-week / month from the timestamp, then drop the raw column.
test_timestamps = pd.to_datetime(test_data['time'])
test_data = test_data.drop(columns=['time']).assign(
    hour=test_timestamps.dt.hour,
    day_of_week=test_timestamps.dt.dayofweek,
    month=test_timestamps.dt.month,
)

In [None]:
# Build the test feature matrix with exactly the training columns, in the
# training order. A one-hot station column can legitimately be absent when
# that station has no test rows, so it is added as all zeros. Any other
# missing feature is an error and must not be zero-filled silently.
missing_features = [name for name in cols if name not in test_data.columns]
unexpected_missing = [name for name in missing_features if not name.startswith('mac_')]
if unexpected_missing:
    raise KeyError(f'Test data is missing feature columns: {unexpected_missing}')
X = test_data.reindex(columns=cols, fill_value=0)

In [None]:
best_rf = random_search_v3.best_estimator_
y_pred_result = best_rf.predict(X)

In [None]:
y_pred_result

array([31.02455556, 31.266     , 26.03411111, ..., 24.34244444,
       26.89122222, 32.76577778])

In [None]:
import math

def round_custom(value, decimals=3):
    """Round ``value`` to ``decimals`` places, with ties going upwards.

    Unlike the built-in ``round`` (which rounds ties to the nearest even
    digit), a fractional remainder of exactly one half is always rounded
    towards positive infinity.

    Parameters
    ----------
    value : float
        Finite number to round; ``math.floor`` raises for NaN or infinity.
    decimals : int, default 3
        Number of decimal places to keep.

    Returns
    -------
    float
        The rounded value.
    """
    factor = 10 ** decimals

    # Shift the digits to keep to the left of the decimal point.
    scaled = value * factor

    # What remains after the last kept digit, always in [0, 1).
    remainder = scaled - math.floor(scaled)

    if remainder >= 0.5:
        # Half or more of a unit in the last kept place: round up.
        return math.ceil(scaled) / factor
    else:
        # Less than half: round down.
        return math.floor(scaled) / factor

In [None]:
s_pred_result = pd.Series(y_pred_result)
rounded_s_pred = s_pred_result.apply(round_custom)

In [None]:
rounded_s_pred

0       31.025
1       31.266
2       26.034
3       24.348
4       27.148
         ...  
4086    35.537
4087    33.241
4088    24.342
4089    26.891
4090    32.766
Length: 4091, dtype: float64

In [None]:
test_df.shape

(4091, 19)

In [None]:
submit = pd.read_csv('/kaggle/input/temperature-forecasting-for-localized-weather-stat/sample_submission.csv')

In [None]:
submit_phone = submit.copy()

In [None]:
# Attach predictions by id instead of by row position, so the result stays
# correct even if sample_submission.csv is ordered differently from test.csv.
# rounded_s_pred is in test.csv row order (assumes test ids are unique).
pred_by_id = pd.Series(rounded_s_pred.to_numpy(), index=test_df['id'].to_numpy())
submit_phone['temp'] = submit_phone['id'].map(pred_by_id)
assert submit_phone['temp'].notna().all(), 'some submission ids have no prediction'

In [None]:
submit_phone.head()

Unnamed: 0,id,temp
0,9543,31.025
1,9544,31.266
2,9545,26.034
3,9546,24.348
4,9547,27.148


In [None]:
submit_phone.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
check_submit = pd.read_csv('/kaggle/working/submission.csv')
check_submit

Unnamed: 0,id,temp
0,9543,31.025
1,9544,31.266
2,9545,26.034
3,9546,24.348
4,9547,27.148
...,...,...
4086,13629,35.537
4087,13630,33.241
4088,13631,24.342
4089,13632,26.891


In [None]:
check_submit['temp'].describe()

count    4091.000000
mean       29.542770
std         4.031607
min        22.936000
25%        26.142000
50%        28.572000
75%        32.492000
max        40.226000
Name: temp, dtype: float64