In [345]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import matplotlib.pyplot as plt

from statsmodels.tsa.arima.model import ARIMA
from sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings('ignore')

In [292]:
# Load the raw measurements from the CSV (path is relative to the notebook's
# working directory). The displayed output shows the columns DATETIME, SERVER,
# CPU and CPULOAD.
train = pd.read_csv('data/train.csv')
train

Unnamed: 0,DATETIME,SERVER,CPU,CPULOAD
0,2020-03-24 00:00:00+03:00,CIHAZ_301,CPU-0,3.88
1,2020-03-24 00:00:00+03:00,CIHAZ_301,CPU-1,11.33
2,2020-03-24 00:00:00+03:00,CIHAZ_301,CPU-10,5.31
3,2020-03-24 00:00:00+03:00,CIHAZ_301,CPU-11,5.25
4,2020-03-24 00:00:00+03:00,CIHAZ_301,CPU-12,0.28
...,...,...,...,...
312691,2020-05-30 23:45:00+03:00,CIHAZ_305,CPU-5,16.17
312692,2020-05-30 23:45:00+03:00,CIHAZ_305,CPU-6,16.02
312693,2020-05-30 23:45:00+03:00,CIHAZ_305,CPU-7,14.14
312694,2020-05-30 23:45:00+03:00,CIHAZ_305,CPU-8,15.42


In [293]:
# Inspect column dtypes and non-null counts before any parsing is applied.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312696 entries, 0 to 312695
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   DATETIME  312696 non-null  object 
 1   SERVER    312696 non-null  object 
 2   CPU       312696 non-null  object 
 3   CPULOAD   312696 non-null  float64
dtypes: float64(1), object(3)
memory usage: 9.5+ MB


In [294]:
# Forecasts are later made per time-of-day slot, so derive an "HH:MM" label
# (stored in the HOUR column) from every timestamp.
# errors='coerce' turns any value that does not match the format into NaT
# instead of raising; such rows end up with a missing HOUR label.
parsed_timestamps = pd.to_datetime(
    train["DATETIME"],
    format="%Y-%m-%d %H:%M:%S%z",
    errors='coerce',
)
train["DATETIME"] = parsed_timestamps
train['HOUR'] = parsed_timestamps.dt.strftime('%H:%M')

In [331]:
# Order-selection experiment on a single example series:
# server CIHAZ_301, CPU-0, time-of-day slot 00:00.
is_example_series = (
    (train['SERVER'] == 'CIHAZ_301')
    & (train['CPU'] == 'CPU-0')
    & (train['HOUR'] == '00:00')
)
subdata = train[is_example_series]

# NOTE(review): `auto_arima` is not imported anywhere in the visible notebook
# (presumably pmdarima's auto_arima - confirm); on a fresh kernel this line
# raises NameError until the import is added to the imports cell.
model = auto_arima(subdata['CPULOAD'].values, seasonal=False, trace=True)

# Display the summary of the best model found by the search.
print(model.summary())

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=inf, Time=0.13 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=459.600, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=380.822, Time=0.01 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=426.462, Time=0.01 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=361.458, Time=0.02 sec
 ARIMA(3,0,0)(0,0,0)[0]             : AIC=361.100, Time=0.02 sec
 ARIMA(4,0,0)(0,0,0)[0]             : AIC=362.556, Time=0.04 sec
 ARIMA(3,0,1)(0,0,0)[0]             : AIC=355.081, Time=0.08 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=356.169, Time=0.05 sec
 ARIMA(4,0,1)(0,0,0)[0]             : AIC=357.080, Time=0.09 sec
 ARIMA(3,0,2)(0,0,0)[0]             : AIC=inf, Time=0.07 sec
 ARIMA(4,0,2)(0,0,0)[0]             : AIC=354.783, Time=0.12 sec
 ARIMA(5,0,2)(0,0,0)[0]             : AIC=inf, Time=0.22 sec
 ARIMA(4,0,3)(0,0,0)[0]             : AIC=inf, Time=0.17 sec
 ARIMA(3,0,3)(0,0,0)[0]             : AIC=357.909, Time=0.10 se

In [342]:
def data_prediction(data, order=(2, 0, 1), forecast_date="2020-03-31", verbose=True):
    """Forecast the next CPU load for every (HOUR, SERVER, CPU) series.

    For each combination of HOUR label, SERVER and CPU that occurs in ``data``,
    the CPULOAD values of that combination (in row order) are fitted with an
    ARIMA model and a one-step-ahead forecast is produced.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'HOUR', 'SERVER', 'CPU' and 'CPULOAD'.
        'HOUR' is used verbatim inside the output timestamp string.
    order : tuple, default (2, 0, 1)
        ARIMA (p, d, q) order passed to ``ARIMA``.
    forecast_date : str, default "2020-03-31"
        Date part of the 'DATETIME' string written for every forecast.
    verbose : bool, default True
        When true, print one line per fitted series (the original behaviour).

    Returns
    -------
    list of dict
        One dict per series with keys 'DATETIME', 'SERVER', 'CPU', 'CPULOAD',
        in order of first appearance of each series in ``data``.
    """
    new_data = []

    # groupby visits only the combinations that actually occur, so no empty
    # series is handed to ARIMA when a server lacks one of the CPU labels, and
    # the frame is partitioned once instead of being re-masked per iteration.
    # Rows with a missing key (e.g. HOUR from an unparseable timestamp) are
    # dropped by groupby. sort=False keeps first-appearance order.
    series_groups = data.groupby(['HOUR', 'SERVER', 'CPU'], sort=False)

    for (hour, server, cpu), subdata in series_groups:
        X = subdata['CPULOAD'].values

        model = ARIMA(X, order=order)
        model_fit = model.fit()
        next_prediction = model_fit.forecast(steps=1)

        # The +03:00 offset mirrors the offset used by the training timestamps.
        dt = f"{forecast_date} {hour}:00+03:00"
        if verbose:
            print(f"DATETIME: {dt} - Server: {server} - CPU: {cpu} - Prediction: {next_prediction[0]}")

        new_data.append({'DATETIME': dt, 'SERVER': server, 'CPU': cpu, 'CPULOAD': next_prediction[0]})

    return new_data

In [297]:
# Forecast one value per (HOUR, SERVER, CPU) series.
# NOTE(review): this call used to carry the trailing comment "#1,0,0", i.e. the
# saved output was presumably produced with an ARIMA(1, 0, 0) variant. Called
# like this, data_prediction fits order (2, 0, 1), so a fresh run may not
# reproduce the saved output - confirm which order submission.csv should use.
prediction_data = data_prediction(train)

prediction_df = pd.DataFrame(prediction_data)

# Average the per-CPU forecasts into one value per (timestamp, server).
# Timestamps are visited in order of first appearance and servers are sorted
# within each timestamp. The pieces are collected in a list and concatenated
# once, instead of growing a DataFrame with pd.concat on every iteration.
server_means = []
for date, subdata in prediction_df.groupby('DATETIME', sort=False):
    server_means.append(subdata.groupby('SERVER')['CPULOAD'].mean().reset_index())
merged_df = pd.concat(server_means, ignore_index=True)

# Only the CPULOAD column is written; index=True adds the 0..n-1 row position
# as the first column of the file.
merged_df['CPULOAD'].to_csv('submission.csv', index=True)

DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-0 - Prediction: 6.725183885073303
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-1 - Prediction: 13.12822351499192
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-10 - Prediction: 8.402905158835596
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-11 - Prediction: 7.700849947861788
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-12 - Prediction: 0.830500117124819
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-13 - Prediction: 1.5467247280770664
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-14 - Prediction: 0.9915889137467762
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-15 - Prediction: 0.6604310285393535
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-16 - Prediction: 0.6986760711116917
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-17 - Predicti

In [317]:
# Forecast one value per (HOUR, SERVER, CPU) series; the "201" in the names
# refers to the ARIMA order (2, 0, 1) that data_prediction fits when called
# like this.
order_201 = pd.DataFrame(data_prediction(train))

# Average the per-CPU forecasts into one value per (timestamp, server).
# Timestamps are visited in order of first appearance and servers are sorted
# within each timestamp. The pieces are collected in a list and concatenated
# once, instead of growing a DataFrame with pd.concat on every iteration.
server_means = []
for date, subdata in order_201.groupby('DATETIME', sort=False):
    server_means.append(subdata.groupby('SERVER')['CPULOAD'].mean().reset_index())
merged_df = pd.concat(server_means, ignore_index=True)

# Only the CPULOAD column is written; index=True adds the 0..n-1 row position
# as the first column of the file.
merged_df['CPULOAD'].to_csv('submission_201.csv', index=True)

DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-0 - Prediction: 7.702138179779564
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-1 - Prediction: 12.75962933330479
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-10 - Prediction: 8.424439045072504
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-11 - Prediction: 7.777795956930862
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-12 - Prediction: 0.8002546155483239
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-13 - Prediction: 1.7018604920615938
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-14 - Prediction: 0.9090307893355187
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-15 - Prediction: 0.48259032772197624
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-16 - Prediction: 0.6504314243290393
DATETIME: 2020-03-31 00:00:00+03:00 - Server: CIHAZ_301 - CPU: CPU-17 - Predic

In [318]:
"""
def overrange_data_prediction(data):
    #forecast steps is rising
    new_data = []
    index = 0

    for server in data['SERVER'].unique():
        for cpu in data['CPU'].unique():
                datetime = pd.to_datetime('2020-03-31 00:00:00+03:00')
                subdata = data[(data['SERVER']==server) & (data['CPU']==cpu)]
                X= subdata['CPULOAD'].values
                
                model = ARIMA(X, order=(2,1,2))
                model_fit = model.fit()
                next_prediction = model_fit.forecast(steps=96)
                for pred in next_prediction:
                    new_row = {'DATETIME': datetime, 'SERVER': server, 'CPU': cpu, 'CPULOAD': pred}
                    datetime += timedelta(minutes=15)
                    new_data.append(new_row)
                    
                    
    return new_data

overrange_data = overrange_data_prediction(train)
overrange_df= pd.DataFrame(overrange_data)

overrange_submission=[]
index = 0

for date in overrange_df.DATETIME.unique():
    for server in overrange_df.SERVER.unique():
        subdata = overrange_df[(overrange_df.DATETIME==date) & (overrange_df.SERVER==server)]
        mean = subdata['CPULOAD'].mean()
        new_row={'index':index,'CPULOAD':mean}
        index+=1
        overrange_submission.append(new_row)

overrange_submission       
overrange_submission= pd.DataFrame(overrange_submission)
overrange_submission.to_csv('overrange_submission.csv',index=False)

"""

Anomaly detection: flag unusual per-server time-of-day CPU-load averages with an Isolation Forest

In [344]:
"""
%Y: Yıl (örn. 2020)
%m: Ay (örn. 03)
%d: Gün (örn. 24)
%H: Saat (örn. 00)
%M: Dakika (örn. 00)
%S: Saniye (örn. 00)
%z: Saat dilimi ofseti (örn. +03:00)
"""
train

Unnamed: 0,DATETIME,SERVER,CPU,CPULOAD,HOUR
0,2020-03-24 00:00:00+03:00,CIHAZ_301,CPU-0,3.88,00:00
1,2020-03-24 00:00:00+03:00,CIHAZ_301,CPU-1,11.33,00:00
2,2020-03-24 00:00:00+03:00,CIHAZ_301,CPU-10,5.31,00:00
3,2020-03-24 00:00:00+03:00,CIHAZ_301,CPU-11,5.25,00:00
4,2020-03-24 00:00:00+03:00,CIHAZ_301,CPU-12,0.28,00:00
...,...,...,...,...,...
312691,2020-05-30 23:45:00+03:00,CIHAZ_305,CPU-5,16.17,23:45
312692,2020-05-30 23:45:00+03:00,CIHAZ_305,CPU-6,16.02,23:45
312693,2020-05-30 23:45:00+03:00,CIHAZ_305,CPU-7,14.14,23:45
312694,2020-05-30 23:45:00+03:00,CIHAZ_305,CPU-8,15.42,23:45


In [354]:
# Mean CPU load per HOUR label (HH:MM time-of-day slot), taken over every row
# of the selected server, for two of the servers.
is_cihaz_301 = train['SERVER'] == 'CIHAZ_301'
is_cihaz_305 = train['SERVER'] == 'CIHAZ_305'

cihaz_301_cpu_mean = train.loc[is_cihaz_301].groupby('HOUR')['CPULOAD'].mean()
# "ort" is presumably short for Turkish "ortalama" (mean) - same quantity as above.
cihaz_305_cpu_ort = train.loc[is_cihaz_305].groupby('HOUR')['CPULOAD'].mean()

cihaz_301_cpu_mean

array([ 5.21147727,  5.04603234,  4.45860294,  4.32734453,  4.01518995,
        3.83733456,  3.65904412,  3.56823529,  3.62936887,  3.48566176,
        3.57268995,  3.49552083,  3.71721393,  3.34720771,  3.21416667,
        3.22208955,  2.92942402,  2.95235907,  2.75362745,  2.94449142,
        2.67595588,  2.94738358,  2.7181924 ,  2.90250613,  2.89281863,
        2.96953431,  3.19457721,  3.42072761,  3.63862132,  3.82196691,
        4.25427083,  4.39418505,  4.67167289,  4.87941789,  5.01070466,
        5.53123162,  6.02799632,  6.49919118,  6.65708946,  7.34917279,
        7.64259804,  8.13945466,  8.41647672,  8.88686887,  9.33449142,
        9.59621324, 10.02731618, 10.36458333, 10.78799632, 10.68137255,
       11.06690564, 11.08233456, 11.26226103, 11.42026348, 11.49036765,
       11.81608456, 11.8545527 , 11.93134804, 11.72728554, 12.08501244,
       11.76314338, 11.78130515, 11.70302083, 11.69304534, 11.75139093,
       11.5226348 , 11.51004289, 11.31681985, 11.44877451, 11.07

In [359]:
def anomaly_detection(Data, contamination=0.2, n_estimators=200, random_state=42):
    """Print an Isolation Forest anomaly report of time-of-day load means per server.

    For every server in ``Data`` the mean CPULOAD per HOUR label is computed,
    an IsolationForest is fitted on those means (one feature), and one line is
    printed per mean with its decision-function score and its classification.

    Parameters
    ----------
    Data : pandas.DataFrame
        Must contain the columns 'SERVER', 'HOUR' and 'CPULOAD'. The frame that
        is passed in is the one analysed (the function previously ignored this
        argument and read the notebook-level ``train`` instead).
    contamination : float, default 0.2
        Expected share of anomalies, forwarded to IsolationForest.
    n_estimators : int, default 200
        Number of trees, forwarded to IsolationForest.
    random_state : int or None, default 42
        Seed forwarded to IsolationForest so repeated runs give the same
        report; pass None for an unseeded forest.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    for server in Data['SERVER'].unique():
        cpu_mean = Data[Data['SERVER'] == server].groupby('HOUR')['CPULOAD'].mean()
        # The estimator expects a 2-D (n_samples, n_features) array.
        X = np.reshape(cpu_mean.values, (-1, 1))

        clf = IsolationForest(
            n_estimators=n_estimators,
            max_samples='auto',
            contamination=contamination,  # sets the expected anomaly rate
            random_state=random_state,
        )
        clf.fit(X)
        scores = clf.decision_function(X)
        predictions = clf.predict(X)

        print(f'Server: {server}\n')
        # A prediction of -1 is reported as anomalous, anything else as normal.
        for point, score, pred in zip(X, scores, predictions):
            print(f"Data point {point} - Anomaly score: {score:.2f} - Classification: {'Anormal' if pred == -1 else 'Normal'}")

In [360]:
# Print the per-server anomaly report (one line per HOUR mean) for the training frame.
anomaly_detection(train)

Server: CIHAZ_301

Data point [5.21147727] - Anomaly score: -0.01 - Classification: Anormal
Data point [5.04603234] - Anomaly score: 0.02 - Classification: Normal
Data point [4.45860294] - Anomaly score: 0.01 - Classification: Normal
Data point [4.32734453] - Anomaly score: 0.02 - Classification: Normal
Data point [4.01518995] - Anomaly score: -0.01 - Classification: Anormal
Data point [3.83733456] - Anomaly score: 0.03 - Classification: Normal
Data point [3.65904412] - Anomaly score: 0.06 - Classification: Normal
Data point [3.56823529] - Anomaly score: 0.06 - Classification: Normal
Data point [3.62936887] - Anomaly score: 0.07 - Classification: Normal
Data point [3.48566176] - Anomaly score: 0.05 - Classification: Normal
Data point [3.57268995] - Anomaly score: 0.06 - Classification: Normal
Data point [3.49552083] - Anomaly score: 0.05 - Classification: Normal
Data point [3.71721393] - Anomaly score: 0.04 - Classification: Normal
Data point [3.34720771] - Anomaly score: 0.01 - Classi