<a href="https://colab.research.google.com/github/mike3071/Weather_Big_Data_Contest/blob/main/tabnet(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd /content/drive/MyDrive/weather_ML/

/content/drive/MyDrive/weather_ML


In [None]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading https://files.pythonhosted.org/packages/94/e5/2a808d611a5d44e3c997c0d07362c04a56c70002208e00aec9eee3d923b5/pytorch_tabnet-3.1.1-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-3.1.1


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder


import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier

In [None]:
# Load the one-hot-encoded sales table and the weather table (the weather CSV is read as CP949).
sale_df = pd.read_csv('./data/sale_data_ohe.csv')
weather_columns = ['일시','평균기온(°C)','최저기온(°C)','최고기온(°C)','최소 상대습도(%)','평균 상대습도(%)','O3','PM10']
# The two tables name their date column differently, so unify on '날짜'.
# Selecting and renaming in one chain yields a fresh frame, so the column assignment
# below does not write into a slice (avoids SettingWithCopyWarning).
weather_df = (
    pd.read_csv('./data/weather_data.csv', encoding = 'CP949')[weather_columns]
    .rename(columns = {'일시':'날짜'})
)

# merge_asof needs an ordered, non-string key: the date columns are read as str,
# so parse them with to_datetime and sort both frames by date.
sale_df['날짜'] = pd.to_datetime(sale_df['날짜'], format='%Y-%m-%d')
sale_df = sale_df.sort_values('날짜')
weather_df['날짜'] = pd.to_datetime(weather_df['날짜'], format = '%Y-%m-%d')
weather_df = weather_df.sort_values('날짜')

# The argument order must not be swapped: sale_df has to be the left frame.
sale_and_weather_df = pd.merge_asof(sale_df, weather_df, on='날짜', allow_exact_matches=True)

# Move the sales columns (everything in sale_df after its first column) behind the
# date and weather columns by reordering, instead of a drop + concat round-trip.
sale_index = list(sale_df.columns[1:])
only_sale_df = sale_and_weather_df[sale_index]
leading_columns = [col for col in sale_and_weather_df.columns if col not in sale_index]
sale_and_weather_df = sale_and_weather_df[leading_columns + sale_index]
sale_and_weather_df = sale_and_weather_df.drop(columns = ['대분류_냉난방가전', '대분류_뷰티', '대분류_식품'])

In [None]:
# csv 파일로 저장
# Persist the merged table as a CP949-encoded CSV without the index column.
sale_and_weather_df.to_csv(
    path_or_buf='sale_weather_data_simple.csv',
    encoding='CP949',
    index=False,
)

In [None]:
# Read back the merged table from the same working-directory-relative location it was
# written to, rather than a hard-coded absolute Drive path.
train = pd.read_csv('sale_weather_data_simple.csv', encoding='cp949')

In [None]:
# Load the four pre-split CSVs (all CP949-encoded).
# NOTE(review): X_train / X_test / y_train / y_test are reassigned from `train` further
# down before being used, so these loads look redundant — confirm and drop if so.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path, encoding='cp949')
    for csv_path in (
        './data/X_train_weather_columns_6.csv',
        './data/X_test_weather_columns_6.csv',
        './data/y_train_weather_columns_6.csv',
        './data/y_test_weather_columns_6.csv',
    )
)


1. 총판매량

In [None]:
# Drop the date column. Rebinding with errors='ignore' keeps the cell re-runnable:
# an in-place drop raises KeyError the second time it is executed.
train = train.drop(columns='날짜', errors='ignore')

In [None]:
target='일별 판매 합계량(개)'

# Randomly tag each row as train/valid/test (80/10/10) unless a split already exists.
# A fixed seed makes the split, and everything derived from it, reproducible across runs.
SPLIT_SEED = 0
if "Set" not in train.columns:
    split_rng = np.random.default_rng(SPLIT_SEED)
    train["Set"] = split_rng.choice(["train", "valid", "test"], p =[.8, .1, .1], size=(train.shape[0],))

# Row labels belonging to each split.
train_indices = train[train.Set=="train"].index
valid_indices = train[train.Set=="valid"].index
test_indices = train[train.Set=="test"].index

In [None]:
# Display the row labels assigned to the validation split.
valid_indices

Int64Index([     3,      6,     18,     21,     31,     42,     43,     51,
                53,     65,
            ...
            256653, 256665, 256667, 256670, 256671, 256684, 256702, 256708,
            256713, 256723],
           dtype='int64', length=25899)

In [None]:
# Let pandas render up to 500 rows and 500 columns before truncating output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500


In [None]:
# Show the number of distinct values per column. Computed directly from `train`, because
# the `nunique` variable is only assigned in a later cell and is undefined on a fresh kernel.
train.nunique()

In [None]:
# Label-encode categorical-looking columns and mean-impute the remaining ones.
# A column counts as categorical when it has object dtype or fewer than 100 distinct values.
nunique = train.nunique()
types = train.dtypes

categorical_columns = []   # names of the encoded columns, in frame order
categorical_dims =  {}     # column name -> number of distinct encoded classes
for col in train.columns:
    if col == "Set":
        # Split bookkeeping, not a feature: keep its string labels so the
        # train/valid/test masks can still be rebuilt after this cell has run.
        continue
    if types[col] == 'object' or nunique[col] < 100:
        print(col, nunique[col])
        l_enc = LabelEncoder()
        # Missing values become their own "Nan" category.
        train[col] = train[col].fillna("Nan")
        train[col] = l_enc.fit_transform(train[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        # Impute only this column, using the mean of its training rows.
        # (A frame-wide fillna here would push this column's mean into every other column's gaps.)
        train[col] = train[col].fillna(train.loc[train_indices, col].mean())



In [None]:
# categorical_columns.remove('Set')

In [None]:
# Display the columns that the encoding loop treated as categorical.
categorical_columns

In [None]:
# Display the validation row labels again.
valid_indices

Int64Index([     2,      5,     23,     24,     29,     37,     58,     63,
                78,     81,
            ...
            256669, 256671, 256682, 256706, 256709, 256710, 256717, 256725,
            256748, 256756],
           dtype='int64', length=25463)

In [None]:
# Collect the positions (cat_idxs) and cardinalities (cat_dims) of the categorical
# features, which TabNet uses for its categorical embeddings.
unused_feat = ['Set']
features = [ col for col in train.columns if col not in unused_feat+[target]] 
cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [ categorical_dims[f] for f in features if f in categorical_columns]

# Materialize the feature matrix and target vector once instead of once per split.
feature_matrix = train[features].values
target_vector = train[target].values

# NOTE(review): the split indices are row labels used here as positions; this assumes
# `train` still has its default 0..n-1 index — confirm.
X_train = feature_matrix[train_indices]
y_train = target_vector[train_indices]

X_valid = feature_matrix[valid_indices]
y_valid = target_vector[valid_indices]

X_test = feature_matrix[test_indices]
y_test = target_vector[test_indices]

In [None]:
# TabNet configuration: 10-dim embeddings for the categorical features, Adam at lr=1e-2,
# and a StepLR schedule that multiplies the learning rate by 0.9 every 50 steps.
# NOTE(review): `target` looks like a continuous sales count that is scored with RMSE
# later on — confirm whether TabNetRegressor would be the right estimator here.
tabnet_params = dict(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=10,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 1e-2},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=dict(step_size=50, gamma=0.9),
    mask_type='sparsemax',  # the alternative is 'entmax'
)
clf = TabNetClassifier(**tabnet_params)

Device used : cpu


In [None]:
max_epochs = 1000
# Pass the epoch budget explicitly; without it the assignment above is unused and
# fit() falls back to the library default. At roughly 11 minutes per epoch on CPU
# this is a very long run — lower max_epochs for a quick check.
clf.fit(X_train, y_train, max_epochs=max_epochs)

No early stopping will be performed, last training weights will be used.
epoch 0  | loss: 6.26515 |  0:11:30s
epoch 1  | loss: 5.20473 |  0:22:55s
epoch 2  | loss: 4.69148 |  0:34:14s
epoch 3  | loss: 4.15259 |  0:45:53s
epoch 4  | loss: 3.82146 |  0:57:31s
epoch 5  | loss: 3.6524  |  1:09:13s
epoch 6  | loss: 3.48147 |  1:21:01s
epoch 7  | loss: 3.43884 |  1:32:45s
epoch 8  | loss: 3.33605 |  1:44:27s
epoch 9  | loss: 3.23587 |  1:56:10s
epoch 10 | loss: 3.17171 |  2:07:54s


KeyboardInterrupt: ignored

에폭당 약 11분이 걸려 학습 시간이 너무 오래 걸림 (11 에폭 후 수동으로 중단함)

In [None]:
# Short training run: 10 epochs with explicit batch settings.
max_epochs = 10
fit_options = dict(
    max_epochs=max_epochs,
    patience=50,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=1,
)
clf.fit(X_train=X_train, y_train=y_train, **fit_options)

No early stopping will be performed, last training weights will be used.
epoch 0  | loss: 3.12517 |  0:11:20s
epoch 1  | loss: 3.03564 |  0:22:41s
epoch 2  | loss: 3.03716 |  0:33:57s
epoch 3  | loss: 2.97403 |  0:45:12s
epoch 4  | loss: 3.01274 |  0:56:27s
epoch 5  | loss: 2.9675  |  1:07:39s
epoch 6  | loss: 2.93422 |  1:18:51s
epoch 7  | loss: 2.96129 |  1:30:03s
epoch 8  | loss: 2.99091 |  1:41:15s
epoch 9  | loss: 2.94199 |  1:52:23s


In [None]:
# Use predict(), which returns one predicted label per row and so matches y_test's
# 1-D shape. predict_proba() returns a (n_samples, n_classes) probability matrix,
# which cannot be scored against y_test with mean_squared_error.
preds = clf.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
def rmse(y,pred):
    """Return the root-mean-squared error between targets `y` and predictions `pred`."""
    mse = mean_squared_error(y, pred)
    return np.sqrt(mse)

In [None]:
# Reuse the rmse() helper instead of repeating the square-root-of-MSE formula inline.
RMSE = rmse(y_test, preds)

ValueError: ignored

y_test와 preds의 크기(shape)가 서로 다르다는 ValueError가 발생함

In [None]:
# Display the test-set target values.
y_test

array([ 31., 258.,  84., ..., 407.,  21.,  33.])

In [None]:
# Show the full shape: len() reports only the first dimension, so it cannot reveal
# a mismatch in the number of columns.
np.shape(y_test)

25816

In [None]:
# Show the full shape: len() reports only the number of rows, which can match
# y_test even when preds has extra columns.
np.shape(preds)

25816

In [None]:
# Score the test-set predictions with the rmse() helper defined earlier in the notebook.
rmse(y_test,preds)

ValueError: ignored

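결론: 현재 설정으로는 tabnet 사용 못함 — 연속형 타깃(판매량)에 TabNetClassifier를 쓰고 predict_proba의 출력(행 × 클래스 확률 행렬)을 y_test와 비교했기 때문에 ValueError가 난 것으로 보임. TabNet 자체의 한계라기보다 사용 방식의 문제이므로 predict 또는 TabNetRegressor로 재시도 필요.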