# Grid search using Skorch

In [None]:
!pip install skorch
from collections import defaultdict
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from skorch import NeuralNetRegressor

## Load dataset

In [None]:
# Read the raw data (replace the placeholder path with the real CSV file)
df = pd.read_csv("you'r dataset")
# Number of rows per distinct `column_1` value, i.e. logins per user
user_logins = df['column_1'].value_counts()
print('Number of total users:', user_logins.size)

## Preprocessing

In [None]:
# Keep only the users with strictly more than 50 logins
filtered_user_logins = user_logins.loc[user_logins.gt(50)]
# Select their rows; .copy() makes the result independent of `df`
active_user_mask = df['column_1'].isin(filtered_user_logins.index)
df_user_logins_gt50 = df.loc[active_user_mask].copy()

In [None]:
# Derive time features from the raw `time` column:
#   timestamp - parsed datetime, date - calendar day, hour - hour of day
# The keyword arguments are evaluated in order, so `date` and `hour` can read
# the freshly created `timestamp` column.
df_user_logins_gt50 = df_user_logins_gt50.assign(
    timestamp=lambda frame: pd.to_datetime(frame['time']),
    date=lambda frame: frame['timestamp'].dt.date,
    hour=lambda frame: frame['timestamp'].dt.hour,
)

In [None]:
# Users x hour-of-day count matrix: one row per user, one column per hour that
# occurs in the data, zero where a user never logged in during that hour
user_hour = (
    df_user_logins_gt50
    .groupby(['column_1', 'hour'])
    .size()
    .unstack(level='hour', fill_value=0)
)
# Same counts as a plain 2-D NumPy array (row/column labels dropped)
user_vectors = user_hour.values

In [None]:
# Boundaries of the observation window used for the hourly login vectors
start_time = datetime.strptime("2024-01-17 12:00:31", "%Y-%m-%d %H:%M:%S")
end_time = datetime.strptime("2024-07-05 11:57:40", "%Y-%m-%d %H:%M:%S")

# Number of hourly buckets: whole hours elapsed between the two boundaries,
# plus one so that the bucket containing `end_time` is included
window_seconds = (end_time - start_time).total_seconds()
total_hours = int(window_seconds // 3600) + 1

In [None]:
# Generate login vectors
# Each user gets a zero-initialised vector with one slot per hour of the
# observation window; the defaultdict creates it on first access.
user_login_vectors = defaultdict(lambda: np.zeros(total_hours, dtype=int))

# Walk the two needed columns directly instead of DataFrame.iterrows():
# iterrows() materialises a full Series for every row, which is much slower
# and is not needed to read two fields.
# Adjust "column_1" / "time" based on the actual column names.
for username, device_time in zip(df["column_1"], df["time"]):
    login_time = datetime.strptime(device_time, "%Y-%m-%d %H:%M:%S")
    # Logins outside the observation window are ignored
    if start_time <= login_time <= end_time:
        # Index of the hourly bucket, counted from the start of the window
        hour_index = int((login_time - start_time).total_seconds() // 3600)
        user_login_vectors[username][hour_index] += 1

In [None]:
# Reshape the per-user vectors into a column-oriented dict for pd.DataFrame:
#   "username" - the dictionary keys, in insertion order
#   "logins"   - the matching hourly count vector, converted from ndarray to list
login_data = {
    "username": list(user_login_vectors.keys()),
    "logins": [hourly_counts.tolist() for hourly_counts in user_login_vectors.values()],
}

In [None]:
# One row per user: the username and its hourly login counts (stored as a list)
df_login_vectors = pd.DataFrame(login_data)

# Use the user in row 3 as the sample series
sample = np.array(df_login_vectors["logins"].iloc[3])
print(sample)

# Chronological 80/20 split: no shuffling, the tail of the series is the test set
train_size = int(len(sample) * 0.8)
test_size = len(sample) - train_size
# Reshape to column vectors of shape (n, 1): one value per time step
train = sample[:train_size].reshape((-1, 1))
test = sample[train_size:].reshape((-1, 1))

In [None]:
# Define a function to create datasets for LSTM
def create_dataset(dataset, lookback):
    """Build sliding-window samples for one-step-ahead prediction.

    Parameters
    ----------
    dataset : array-like
        Observations ordered in time along the first axis (the notebook passes
        a NumPy array of shape ``(n_steps, 1)``).
    lookback : int
        Number of consecutive time steps in each input window.

    Returns
    -------
    tuple of torch.Tensor
        ``(X, y)`` as float32 tensors with ``len(dataset) - lookback`` samples:
        ``X[i]`` is ``dataset[i:i + lookback]`` and ``y[i]`` is
        ``dataset[i + lookback]``. When ``dataset`` has no more than
        ``lookback`` entries both tensors are empty.
    """
    n_samples = len(dataset) - lookback
    windows = [dataset[start:start + lookback] for start in range(n_samples)]
    targets = [dataset[start + lookback] for start in range(n_samples)]
    # Collapse each list into ONE ndarray before handing it to torch:
    # torch.tensor() on a list of ndarrays converts element by element, which
    # is very slow and makes PyTorch emit a UserWarning.
    X = np.asarray(windows, dtype=np.float32)
    y = np.asarray(targets, dtype=np.float32)
    return torch.from_numpy(X), torch.from_numpy(y)

# Every sample is a window of `lookback` consecutive hourly counts; the target
# is the count of the hour immediately after the window
lookback = 24
X_train, y_train = create_dataset(train, lookback)
X_test, y_test = create_dataset(test, lookback)

### LSTM model 

In [None]:
# Define the LSTM model
class LoginModel(nn.Module):
    """Single-layer LSTM followed by a linear layer that outputs one value.

    Parameters
    ----------
    hidden_size : int, default 20
        Size of the LSTM hidden state, which is also the input width of the
        linear output layer.
    """

    def __init__(self, hidden_size=20):
        super().__init__()
        # One input feature per time step; batch_first=True means inputs are
        # laid out as (batch, sequence, feature)
        self.lstm = nn.LSTM(
            input_size=1,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one prediction per sequence in the batch, shape (batch, 1)."""
        sequence_output, _state = self.lstm(x)
        # Only the output of the final time step feeds the prediction
        last_step = sequence_output[:, -1, :]
        return self.linear(last_step)

## Grid search

In [None]:
# Wrap the PyTorch module in a scikit-learn compatible regressor.
# skorch instantiates the criterion itself, so the class is passed (not an
# already constructed instance).
model = NeuralNetRegressor(
    module=LoginModel,
    criterion=nn.MSELoss,
    verbose=False,
)

# Define the grid search parameters.
# Note: every combination is trained once per CV fold, so this grid is
# expensive (3 * 2 * 6 * 3 * 3 = 324 candidates).
param_grid = {
    'optimizer': [optim.SGD, optim.Adam, optim.Adagrad],
    'optimizer__lr': [0.001, 0.0001],
    'module__hidden_size': [12, 24, 36, 48, 60, 72],
    'batch_size': [128, 256, 512],
    'max_epochs': [2000, 3000, 4000]
}

# The samples are windows of a time series in chronological order. Plain
# k-fold (cv=3) would validate on windows that precede part of the training
# data; TimeSeriesSplit keeps every validation fold strictly after the data
# the model was trained on.
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=TimeSeriesSplit(n_splits=3),
)
grid_result = grid.fit(X_train, y_train)

# Summarize results: best candidate first, then mean/std validation score of
# every candidate (no `scoring` is given, so the estimator's own score is used)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))