In [None]:
from google.colab import drive

# Mount Google Drive into the Colab VM; the dataset is read from it below.
mount_point = '/content/gdrive/'
drive.mount(mount_point)

In [None]:
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# util
from datetime import datetime

# N
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.linear_model import LinearRegression

In [None]:
# Load the raw DHCP dataset from the mounted Drive and preview the first rows.
dhcp_csv_path = "./gdrive/MyDrive/challenges/DHCP.csv"
DHCP_data = pd.read_csv(dhcp_csv_path)
DHCP_data.head()

In [None]:
# Work on a copy so the raw frame stays available unchanged.
train_df = DHCP_data.copy()

# Raw stamps carry a '-' suffix (e.g. '20210630_2350-0000'); keep only the
# part before the dash and parse it as a datetime.
stamp_prefix = train_df['Timestamp'].str.split('-').str.get(0)
train_df['Timestamp'] = pd.to_datetime(stamp_prefix, format='%Y%m%d_%H%M')

train_df.head()

In [None]:
# Check missing values: number of nulls per column.
train_df.isna().sum()

In [None]:
# Missing-value ratio: nulls per column divided by the total row count.
train_df.isna().sum() / len(train_df)

In [None]:
# Show how often each distinct value occurs in the four feature columns.
for column in ('Svr_detect', 'Svr_connect', 'Ss_request', 'Ss_Established'):
  print(train_df[[column]].value_counts())

In [None]:
# Replace missing entries column by column with a fixed value.
# NOTE(review): these constants are presumably each column's most frequent
# value taken from the value counts printed earlier -- confirm.
fill_values = {
  'Svr_detect': 0.0,
  'Svr_connect': 2.0,
  'Ss_request': 3.0,
  'Ss_Established': 12.0,
}
for column, fill in fill_values.items():
  train_df.loc[train_df[column].isnull(), column] = fill

In [None]:
# Re-count nulls per column to confirm the fills above took effect.
# (A blanket `train_df.fillna(0, inplace=True)` was left disabled.)
train_df.isna().sum()

In [None]:
def normalization(df):
  """Standardize every column of ``df`` to zero mean and unit variance.

  Each column is centred on its mean and divided by its population standard
  deviation (``np.std``, i.e. ``ddof=0``).

  A constant column has zero standard deviation; dividing by it would turn
  the whole column into NaN (0/0), which then propagates into any model
  trained on the result. Such columns are returned as their centred values
  (all zeros) instead.

  Args:
    df: DataFrame whose columns are all numeric.

  Returns:
    A new DataFrame with the same index and columns holding the z-scores.
  """
  def _zscore(value):
    centered = value - np.mean(value)
    spread = np.std(value)
    # Zero spread means every entry equals the mean: keep the zeros rather
    # than producing NaN through 0/0.
    if spread == 0:
      return centered
    return centered / spread

  return df.apply(_zscore)

In [None]:
# Standardize every column except the timestamp, which is not a model feature.
feature_frame = train_df.drop(columns=['Timestamp'])
df_data_cols = feature_frame.columns
TRAIN_DF = normalization(feature_frame)

print(TRAIN_DF)

In [None]:
# Convert the standardized frame into a plain 2-D array (one row per sample).
train = TRAIN_DF.to_numpy(copy=True)
# Independent copy that the model cells use; the rows are fed as-is, without
# the (samples, 1, features) reshape that was tried and left disabled.
x_train = train.copy()

In [None]:
def linear_model(x):
  """Build an uncompiled linear autoencoder sized to the features of ``x``.

  The network is ``n_features -> 3 -> n_features`` with linear activations,
  so it learns to reconstruct its own input through a 3-unit bottleneck.

  Args:
    x: Training data, expected to be 2-D (samples, features). Only its shape
      is read. When the feature count cannot be determined (``x`` is None or
      has fewer than two dimensions) the width falls back to 4, the size
      that was previously hard-coded.

  Returns:
    An uncompiled ``keras.Sequential`` model.
  """
  # Reset Keras' global state so re-running the cell starts from a clean graph.
  keras.backend.clear_session()

  shape = () if x is None else np.shape(x)
  n_features = int(shape[-1]) if len(shape) >= 2 else 4

  model = keras.Sequential(
    [
      # Encoder: compress the features into a 3-unit bottleneck.
      layers.Dense(3, input_dim=n_features, activation='linear'),
      # Decoder: expand back to the input width; its input size is inferred
      # from the previous layer, so no input_dim is needed here.
      layers.Dense(n_features, activation='linear')
    ]
  )
  return model


In [None]:
# Build the autoencoder for the prepared feature matrix, then configure it to
# minimise mean squared reconstruction error with the Adam optimizer.
model = linear_model(x_train)
model.compile(loss='mse', optimizer='adam')
model.summary()

In [None]:
epochs = 50
batch = 16

# Stop once validation loss has not improved for 5 epochs, and keep the best
# validation-loss weights in a separate checkpoint file.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='best_linear_model.h5',
    monitor='val_loss',
    save_best_only=True,
)
callbacks = [early_stop, checkpoint]

# Autoencoder training: the input doubles as the target; 20% of the rows are
# held out for validation.
# NOTE(review): x_train is the full dataset here -- the chronological
# train/test split only happens in a later cell. Confirm this is intended.
fit_result = model.fit(
    x_train, x_train,
    epochs=epochs,
    batch_size=batch,
    validation_split=0.2,
    callbacks=callbacks,
)
history = fit_result.history

# NOTE(review): this stores the weights as they are when fit() returns, which
# may differ from 'best_linear_model.h5' -- confirm which file should be used.
model.save('Linear model.h5')

In [None]:
print(x_train.shape)

# Locate the row stamped '20210630_2350-0000'; everything up to and including
# it forms the first part, the remainder the second.
is_split_row = DHCP_data['Timestamp'] == '20210630_2350-0000'
idx_half = DHCP_data.index[is_split_row].tolist()[0]

# Split the data
split_at = idx_half + 1
train_set = x_train[:split_at]
test_temp_set = x_train[split_at:]

for part in (train_set, test_temp_set):
  print(part.shape)
for part in (train_set, test_temp_set):
  print(part)


# Split off a test set of size 29496
# Then run predict
# Build the answer sheet

In [None]:
# Reconstruct the first (pre-split) part of the data and inspect how well each
# row is reproduced.
X_pred = model.predict(train_set)

VALID_COLUMNS_IN_TRAIN_DATASET = DHCP_data.columns.drop(['Timestamp'])
print(X_pred.shape)
print(VALID_COLUMNS_IN_TRAIN_DATASET)
print(DHCP_data.shape)

# Label the reconstruction with the feature names so it prints readably.
X_pred = pd.DataFrame(X_pred, columns=VALID_COLUMNS_IN_TRAIN_DATASET)

print("="*50)
print(X_pred)
print(train_set)

scored = pd.DataFrame(index=X_pred.index)
Xtrain = train_set.copy()
# Per-row mean absolute reconstruction error across all features.
scored['Loss_mae'] = np.mean(np.abs(X_pred-Xtrain), axis = 1)

plt.figure(figsize=(16,9), dpi=80)
plt.title('Loss Distribution', fontsize=16)
# sns.distplot is deprecated; histplot with a density-scaled histogram and a
# KDE extended by cut=3 is the documented equivalent of its defaults.
sns.histplot(scored['Loss_mae'], bins=20, kde=True, stat='density',
             kde_kws=dict(cut=3), color='blue')
plt.xlim([0.0,.5]);

In [None]:
# Reconstruct the second (post-split) part of the data and inspect how well
# each row is reproduced.
X_pred = model.predict(test_temp_set)

VALID_COLUMNS_IN_TRAIN_DATASET = DHCP_data.columns.drop(['Timestamp'])
print(X_pred.shape)
print(VALID_COLUMNS_IN_TRAIN_DATASET)
print(DHCP_data.shape)

# Label the reconstruction with the feature names so it prints readably.
X_pred = pd.DataFrame(X_pred, columns=VALID_COLUMNS_IN_TRAIN_DATASET)

print("="*50)
print(X_pred)
# Show the inputs that were actually reconstructed in this cell (the test
# rows), not the training rows.
print(test_temp_set)

scored = pd.DataFrame(index=X_pred.index)
Xtest = test_temp_set.copy()
# Per-row mean absolute reconstruction error across all features.
scored['Loss_mae'] = np.mean(np.abs(X_pred-Xtest), axis = 1)

plt.figure(figsize=(16,9), dpi=80)
plt.title('Loss Distribution', fontsize=16)
# sns.distplot is deprecated; histplot with a density-scaled histogram and a
# KDE extended by cut=3 is the documented equivalent of its defaults.
sns.histplot(scored['Loss_mae'], bins=20, kde=True, stat='density',
             kde_kws=dict(cut=3), color='blue')
plt.xlim([0.0,.5]);

In [None]:
# Flag test rows whose reconstruction error exceeds a fixed threshold.
#
# `scored` is rebuilt from scratch on every run. Mutating the existing frame
# and shifting its index in place made this cell non-idempotent: on a second
# run the freshly computed losses (labelled 0..n-1) were aligned against the
# already shifted labels, and the index was shifted once more.
loss_mae = np.mean(np.abs(X_pred-Xtest), axis = 1)
scored = pd.DataFrame({'Loss_mae': loss_mae})

# NOTE(review): 0.032 is a hand-picked cut-off, presumably read off the loss
# distribution plot -- confirm.
scored['Threshold'] = 0.032
prediction_result = scored['Loss_mae'] > scored['Threshold']
scored['Prediction'] = prediction_result

# Offset the row labels by 26064.
# NOTE(review): presumably idx_half + 1, i.e. the position of the first test
# row in the full dataset -- verify against the split cell.
scored.index = scored.index + 26064

print(scored.shape)
print(scored.head())

In [None]:
# Drop the offset labels so the exported predictions are indexed from 0.
temp_scored = scored['Prediction'].reset_index(drop=True)
print(temp_scored.head())

# Wrap the flags in a single-column frame and write them out with the index.
answer = temp_scored.to_frame(name='Prediction')
print(f'예측결과. \n{answer}\n')
answer.to_csv('answer.csv', index=True)

In [None]:
from pandas.core.arrays.numeric import T
# Compare this notebook's predictions with a reference prediction file, then
# rewrite the local file with 0/1 labels.
data_check = pd.read_csv("./gdrive/MyDrive/mae_threshold_0.024.csv")
my_check = pd.read_csv("./my_check.csv")

print(data_check.head())
print(my_check.head())

data_check_pre = data_check['Prediction']
# Take our own predictions from my_check (this was read from data_check by
# mistake before).
my_check_pre = my_check['Prediction']

# Compare row by row over the length of the reference file.
n_reference = data_check_pre.shape[0]
if my_check_pre.shape[0] < n_reference:
  raise ValueError(
      f'my_check.csv has {my_check_pre.shape[0]} rows but the reference has '
      f'{n_reference}; cannot compare row by row')

mismatch = data_check_pre.to_numpy() != my_check_pre.to_numpy()[:n_reference]
count = int(mismatch.sum())
print(f'mismatched predictions: {count} / {n_reference}')

# Encode the flags as integers: 1 where the value equals True, 0 otherwise.
my_check['Prediction'] = my_check['Prediction'].eq(True).astype(int)

my_check.to_csv("my_check.csv", index=False)