In [1]:
import pandas as pd

data_1 = pd.read_csv('../input/payco_23.csv')
data_2 = pd.read_csv('../input/payco_2304.csv')
df = pd.concat([data_1,data_2])
df = data_2[['사원번호','사용처']].rename({'사원번호':'userid', '사용처':'itemid'}, axis=1).reset_index()

In [2]:
# Step 1: Create user-item matrix
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix

# Create a new DataFrame with frequency count for each user-item pair
df_grouped = df.groupby(['userid', 'itemid']).size().reset_index(name='frequency')

user_u = list(sorted(df_grouped.userid.unique()))
item_u = list(sorted(df_grouped.itemid.unique()))

user_c = CategoricalDtype(sorted(df_grouped['userid'].unique()), ordered=True)
item_c = CategoricalDtype(sorted(df_grouped['itemid'].unique()), ordered=True)

row = df_grouped['userid'].astype(user_c).cat.codes
col = df_grouped['itemid'].astype(item_c).cat.codes
data = df_grouped['frequency'].tolist()

sparse_matrix = csr_matrix((data, (row, col)), shape=(len(user_u), len(item_u)))

df_user_item = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=user_u, columns=item_u)
df_user_item.to_pickle('../input/user_item.pkl')

In [3]:
# Step 2: Define AutoRec model
import torch
from torch import nn

class AutoRec(nn.Module):
    def __init__(self, num_inputs, hidden_units):
        super(AutoRec, self).__init__()

        self.encoder = nn.Linear(num_inputs, hidden_units)
        self.decoder = nn.Linear(hidden_units, num_inputs)
        
    def forward(self, x):
        x = torch.sigmoid(self.encoder(x))
        x = self.decoder(x)
        return x

In [4]:
# Step 3: Train and Test AutoRec model
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Hyperparameters
hidden_units = 500
num_epochs = 100
batch_size = 64
learning_rate = 1e-3

model = AutoRec(df_user_item.shape[1], hidden_units).to(device)
criterion = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=learning_rate)


In [6]:
# Create DataLoaders
data = torch.FloatTensor(df_user_item.values).to(device)
dataset = TensorDataset(data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [7]:
import numpy as np 

model.train()
min_loss = np.inf  # 초기의 최소 loss를 무한대로 설정

for epoch in range(num_epochs):
    for i, (inputs,) in enumerate(dataloader):
        inputs = inputs.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, inputs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    # If the current model has the lowest loss, save it
    if loss.item() < min_loss:
        print(f"Minimum Loss Dropped to {loss.item()} at epoch {epoch+1}, saving model...")
        torch.save(model.state_dict(), '../input/autorec_best_model.pt')
        min_loss = loss.item()

Epoch 1/100, Loss: 0.031754862517118454
Minimum Loss Dropped to 0.031754862517118454 at epoch 1, saving model...
Epoch 2/100, Loss: 0.023758795112371445
Minimum Loss Dropped to 0.023758795112371445 at epoch 2, saving model...
Epoch 3/100, Loss: 0.02415415830910206
Epoch 4/100, Loss: 0.016433265060186386
Minimum Loss Dropped to 0.016433265060186386 at epoch 4, saving model...
Epoch 5/100, Loss: 0.012687739916145802
Minimum Loss Dropped to 0.012687739916145802 at epoch 5, saving model...
Epoch 6/100, Loss: 0.008689253591001034
Minimum Loss Dropped to 0.008689253591001034 at epoch 6, saving model...
Epoch 7/100, Loss: 0.010366234928369522
Epoch 8/100, Loss: 0.004158169496804476
Minimum Loss Dropped to 0.004158169496804476 at epoch 8, saving model...
Epoch 9/100, Loss: 0.004018498118966818
Minimum Loss Dropped to 0.004018498118966818 at epoch 9, saving model...
Epoch 10/100, Loss: 0.002779238624498248
Minimum Loss Dropped to 0.002779238624498248 at epoch 10, saving model...
Epoch 11/100, L

In [10]:
# Testing
num_inputs = df_user_item.shape[1]  # 이전에 사용한 입력 차원의 수
hidden_units = 500  # 이전에 사용한 hidden layer의 unit 수

model = AutoRec(num_inputs, hidden_units)

# CPU나 GPU 중에서 사용 가능한 장치를 선택
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 모델을 해당 장치로 이동
model.to(device)

# 저장된 모델의 가중치를 불러오기
model.load_state_dict(torch.load('autorec_best_model.pt'))

# 모델을 평가 모드로 설정 (dropout이나 batch normalization 등의 동작을 평가 모드로 바꿉니다)
model.eval()
with torch.no_grad():
    inputs = data
    outputs = model(inputs)
    print(outputs)


tensor([[-0.0188,  0.0150, -0.0262,  ..., -0.0051, -0.0026, -0.0056],
        [-0.0185,  0.0048, -0.0264,  ..., -0.0027, -0.0037, -0.0038],
        [-0.0177,  0.0093, -0.0258,  ..., -0.0054, -0.0050, -0.0057],
        ...,
        [-0.0210,  0.0475,  0.0100,  ..., -0.0030,  0.0083,  0.0065],
        [-0.0202,  0.0155, -0.0195,  ..., -0.0058, -0.0050, -0.0057],
        [-0.0266,  0.0387,  0.0035,  ...,  0.0198,  0.0086, -0.0039]],
       device='cuda:0')


In [8]:
# Step 4: Generate recommendations
import numpy as np

def user_free_inference(items, df_user_item, model, top_k=10):
    # Create a new user vector
    user_vector = np.zeros(df_user_item.shape[1])
    item_indices = []

    # Set the chosen items to the maximum value
    for item in items:
        if item in df_user_item.columns:
            item_index = df_user_item.columns.get_loc(item)
            user_vector[item_index] = df_user_item.values.max()
            item_indices.append(item_index)
        else:
            raise ValueError(f"Item {item} not found in the data")

    # Convert to tensor and move to the correct device
    user_vector = torch.FloatTensor([user_vector]).to(device)

    # Generate recommendations
    with torch.no_grad():
        outputs = model(user_vector)
        predicted_ratings = outputs.cpu().numpy()[0]

    # Remove the chosen items from the predictions
    predicted_ratings[item_indices] = -np.inf

    top_k_item_indices = np.argsort(-predicted_ratings)[:top_k]
    recommended_items = df_user_item.columns[top_k_item_indices]
    recommended_scores = predicted_ratings[top_k_item_indices]

    # Convert item and score to dictionary
    item_score_dict = dict(zip(recommended_items.tolist(), recommended_scores.tolist()))

    # Print each item and it score
    for item, score in item_score_dict.items():
        print(f"{item}: {score}")

In [9]:
# Get recommendations
item_list = [
    '킨파',
    '서호돈가스',
    '버거킹(판교유스페이스)',
    '일상화식'
]

user_free_inference(item_list, df_user_item, model)


써브웨이(판교브릿지타워점): 0.5737663507461548
오투닭갈비부대찌개(판교점): 0.22353306412696838
닭갈비야(판교점): 0.21548983454704285
연어랑 스테끼: 0.21266816556453705
카츠소당: 0.2088238149881363
광화문국밥(판교점): 0.20701460540294647
(주)엔바이콘 아시안퀴진: 0.20222622156143188
행복한집숯불갈비: 0.19879905879497528
봉피양(판교점): 0.19253014028072357
역전우동0410(판교역점): 0.1855875551700592
