In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.metrics import mean_absolute_error 
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm
import pickle

In [None]:
# Mount Google Drive so the CSV below can be read from /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Read the training table and discard the 'DATE (MM/DD)' column in one expression.
data = pd.read_csv(
    "/content/drive/My Drive/shell_dataset/train/train/train.csv"
).drop(columns=['DATE (MM/DD)'])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# preprocess

In [None]:
def for_minus(data,remove_min1=True):
  """Rename the weather columns and repair negative ``cloud_cover`` readings.

  A negative ``cloud_cover`` is treated as a missing reading. Rows are
  repaired from top to bottom, so a value repaired in one row can serve as
  the "previous" neighbour of the row below it:

  * previous and next valid readings both exist -> their average;
  * only one of them exists (start or end of the frame) -> that value;
  * neither exists (no non-negative reading at all) -> left unchanged.

  NaN entries are neither repaired nor used as neighbours.

  Parameters
  ----------
  data : pandas.DataFrame
    Frame with exactly 16 columns in the order listed in ``cols`` below.
    It is not modified; all work happens on a copy.
  remove_min1 : bool, default True
    When true, the index is reset and the old index is kept as a column
    (named ``index`` unless that name is already taken).

  Returns
  -------
  pandas.DataFrame
    The renamed copy. ``cloud_cover`` is converted to float only when at
    least one negative reading was present.

  Raises
  ------
  ValueError
    Raised by pandas if ``data`` does not have exactly 16 columns.
  """
  df=data.copy()
  cols = ['time', 'global_cmp22', 'direct_snip', 'azimuth', 'dry_bulb', 'wet_bulb', 'dew_point', 'rh', 'cloud_cover',
        'peak_wind_speed', 'avg_wind_direction', 'station_pressure', 'accum_precipitation', 'snow_depth', 'moisture', 'albedo']

  df.columns = cols
  if remove_min1:
    df = df.reset_index()

  # Work on a plain float array: this sidesteps chained assignment on the
  # frame and makes the repair positional, whatever the index labels are.
  values = np.array(df['cloud_cover'], dtype=float)
  missing = values < 0
  if not missing.any():
    return df

  # For a missing row, back-filling yields the closest valid reading below it
  # (NaN when there is none).
  next_valid = pd.Series(values).where(values >= 0).bfill().to_numpy()

  # The pass must be sequential because each repaired value feeds the next row.
  for i in np.flatnonzero(missing):
    previous = values[i - 1] if i > 0 and values[i - 1] >= 0 else np.nan
    following = next_valid[i]
    has_previous = not np.isnan(previous)
    has_following = not np.isnan(following)
    if has_previous and has_following:
      values[i] = (previous + following) / 2
    elif has_following:
      values[i] = following
    elif has_previous:
      values[i] = previous
    # Otherwise there is nothing to interpolate from; keep the raw value.

  df['cloud_cover'] = values
  return df

In [None]:
def shift_feat(data,periods,n_lags=None):
  """Append lagged copies of every feature column.

  For each column other than ``date``, ``time`` and ``index``, a new column
  named ``<column><lag>`` is added holding that column shifted down by
  ``lag`` rows. The lags are ``step, 2*step, ...`` up to ``periods``
  (inclusive), where ``step = int(periods / n_lags)``.

  Rows whose lagged values are NaN (the first ``lag`` rows) are kept;
  callers that need complete rows must drop them themselves.

  Parameters
  ----------
  data : pandas.DataFrame
    Input frame; it is copied, not modified.
  periods : int or float
    Largest lag, in rows.
  n_lags : int, optional
    Number of lag steps. When omitted, the notebook-level variable ``back``
    is used, which is what this function has always read.

  Returns
  -------
  pandas.DataFrame
    Copy of ``data`` with the lagged columns appended.

  Raises
  ------
  ValueError
    If ``int(periods / n_lags)`` is 0, i.e. no usable lag step exists.
  NameError
    If ``n_lags`` is omitted and ``back`` has not been defined yet.
  """
  if n_lags is None:
    n_lags = back  # notebook-level setting, defined in the configuration cell

  step = int(periods/n_lags)
  if step == 0:
    raise ValueError(
        f'periods={periods!r} is smaller than n_lags={n_lags!r}; the lag step would be 0'
    )

  df=data.copy()
  # Bookkeeping columns are never lagged.
  feats=[col for col in df.columns if col not in ('date', 'time', 'index')]

  for feat in feats:
    for num in range(step,int(periods+1),step):
      df[feat+str(num)]=df[feat].shift(periods=num)
  return df

In [None]:
def resample_mean(df, freq):
  """Average consecutive blocks of ``freq`` rows.

  Rows are grouped by position (rows ``0..freq-1`` form group 0, the next
  ``freq`` rows group 1, and so on); a final, shorter block is averaged over
  the rows it has. Non-numeric columns (for example a text time-of-day
  column) are left out of the result instead of making the mean fail.

  Parameters
  ----------
  df : pandas.DataFrame
    Frame to down-sample; its index labels are ignored.
  freq : int
    Number of rows per block; must be at least 1.

  Returns
  -------
  pandas.DataFrame
    One row per block, indexed ``0, 1, 2, ...``, numeric columns only.

  Raises
  ------
  ValueError
    If ``freq`` is smaller than 1.
  """
  if freq < 1:
    raise ValueError(f'freq must be at least 1, got {freq!r}')
  block_ids = np.arange(len(df)) // freq
  return df.groupby(block_ids).mean(numeric_only=True)

# Training

data

In [None]:
# Display the training frame loaded above to check its columns and value ranges.
data

Unnamed: 0,MST,Global CMP22 (vent/cor) [W/m^2],Direct sNIP [W/m^2],Azimuth Angle [degrees],Tower Dry Bulb Temp [deg C],Tower Wet Bulb Temp [deg C],Tower Dew Point Temp [deg C],Tower RH [%],Total Cloud Cover [%],Peak Wind Speed @ 6ft [m/s],Avg Wind Direction @ 6ft [deg from N],Station Pressure [mBar],Precipitation (Accumulated) [mm],Snow Depth [cm],Moisture,Albedo (CMP11)
0,00:00,-0.962276,0.000000,356.85640,7.216,0.988,-7.312,32.33,-1,9.95,271.3,806.779,0.0,0.219,0.0,0.0
1,00:01,-0.937921,0.000000,357.65505,7.251,1.040,-7.260,32.40,-1,8.20,272.9,806.840,0.0,0.206,0.0,0.0
2,00:02,-0.944395,0.000000,358.45438,7.256,1.093,-7.207,32.54,-1,6.70,288.8,806.876,0.0,0.148,0.0,0.0
3,00:03,-0.951350,-0.029673,359.25416,7.254,1.060,-7.440,31.89,-1,7.70,294.0,806.823,0.0,0.235,0.0,0.0
4,00:04,-0.934976,-0.054401,0.05415,7.331,1.081,-7.419,31.78,-1,7.20,285.5,806.762,0.0,0.182,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527035,23:55,-1.360910,-0.340704,352.62902,-0.469,-3.940,-10.140,43.61,-1,0.00,0.0,816.186,0.0,2.899,0.0,0.0
527036,23:56,-1.342520,-0.325891,353.41779,-0.499,-3.927,-10.127,43.77,-1,0.00,0.0,816.185,0.0,2.866,0.0,0.0
527037,23:57,-1.341260,-0.320952,354.20842,-0.522,-3.958,-10.158,43.73,-1,0.00,0.0,816.198,0.0,2.882,0.0,0.0
527038,23:58,-1.334130,-0.320953,355.00071,-0.558,-3.979,-10.079,44.17,-1,0.00,0.0,816.194,0.0,2.805,0.0,0.0


In [None]:
# Parameters to pay attention to.
back = 3             # number of lag steps; read by shift_feat
interval_shift = 10  # multiplied by `back` to get the lag span used at prediction time
freq = 20            # number of rows averaged into one sample by resample_mean

models = []

# Store the repaired frame under its own name and leave `data` untouched.
# for_minus assigns a fixed list of 16 column names, so feeding its own output
# (which carries an extra index column) back in on a re-run would fail.
cleaned_data = for_minus(data,remove_min1=True)

sampled_data = resample_mean(cleaned_data,freq)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# filename = f'{folder_path}/model_{horizon}_back{back}_freq{freq}_intshiftnyasetengahhorizon.sav'
# pickle.dump(model, open(filename, 'wb'))
# models.append(model)

In [None]:
folder_path=('/content/drive/MyDrive/alexandra_Shell.ai')

horizons=[30,60,90,120]
models=[]

# Train and save one SVR per forecast horizon.
for horizon in horizons:
  # Add lagged feature columns; the lag span grows with the horizon.
  all_training_data=shift_feat(sampled_data,back*horizon/2)
  # Discard samples whose current or horizon-lagged cloud cover is exactly -1,
  # then any row with a missing lag.
  all_training_data=all_training_data[all_training_data['cloud_cover']!=-1]
  all_training_data=all_training_data[all_training_data[f'cloud_cover{horizon}']!=-1]
  all_training_data=all_training_data.dropna()

  # The feature list is fixed before 'target' is added, so the label can never
  # end up among the inputs. Bookkeeping columns are excluded.
  features_to_train = [
      col for col in all_training_data.columns
      if col not in ('level_0','date', 'time','index')
  ]

  # NOTE(review): int(-horizon/freq) truncates toward zero (30/20 -> -1,
  # 90/20 -> -4), and the shift runs after rows were filtered out, so the
  # target is not guaranteed to sit a fixed distance ahead - confirm intent.
  all_training_data['target'] = all_training_data['cloud_cover'].shift(int(-horizon/freq))
  all_training_data = all_training_data.dropna()

  model = svm.SVR()
  model.fit(all_training_data[features_to_train], all_training_data['target'])

  filename = f'{folder_path}/baru_model_{horizon}_back{back}_freq{freq}.sav'
  # Context manager guarantees the file is flushed and closed after the dump.
  with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
  models.append(model)

In [None]:
# Inspect the list of fitted models; it stays empty until the training cell has run.
models

[]

# save and load models

In [None]:
# load
import pickle
folder_path=('/content/drive/MyDrive/alexandra_Shell.ai')

loaded_models=[]
# The loop variable must be `horizon`: that is the name the file-name template
# interpolates, so each pass reads the file saved for its own horizon.
for horizon in horizons:
  filename = f'{folder_path}/baru_model_{horizon}_back{back}_freq{freq}.sav'
  # pickle.load can execute arbitrary code; only load files written by this notebook.
  with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
  loaded_models.append(loaded_model)

*models* holds the models trained during this session; *loaded_models* holds the models loaded back from disk.

In [None]:
# models=loaded_models

[SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
     kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
 SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
     kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
 SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
     kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
 SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
     kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)]

# Predict

In [None]:
# One independent list per horizon. Building the values with `[[]]*4` would make
# all four keys point at the same list object.
t={horizon: [] for horizon in range(30,121,30)}
folders = [i+1 for i in range(300)]

for i in tqdm(folders):
  dat=pd.read_csv(f"/content/drive/My Drive/shell_dataset/test/test/{i}/weather_data.csv")
  dat=for_minus(dat,remove_min1=True)

  # The feature row does not depend on the horizon, so it is built and scaled
  # once per scenario and reused for all four models.
  test_data = resample_mean(dat,freq = freq)
  # NOTE(review): the lag span here is back*interval_shift, while the training
  # cell uses back*horizon/2, so the lagged column names may not line up with
  # features_to_train - confirm before trusting the predictions.
  test_data=shift_feat(test_data,back*interval_shift)
  test_data = test_data.tail(1)
  test_data = test_data[features_to_train].copy()

  # Columns that are passed to the model unscaled.
  feats=[
      col for col in test_data.columns
      if col not in ('level_0','date', 'time', 'cloud_cover','index','cloud_cover1','cloud_cover2','cloud_cover3')
  ]

  # NOTE(review): `scalers` is not defined in any visible cell, and the visible
  # training cell fits on unscaled features - confirm where it comes from.
  for f in feats: #not including cloud cover
    s_s = scalers['scaler_'+ f].transform(test_data[f].values.reshape(-1,1))
    s_s=np.reshape(s_s,len(s_s))
    test_data[f]=s_s

  for horizon in [30, 60, 90, 120]:
    # Integer division: list indices must be ints (30 -> 0, ..., 120 -> 3).
    t[horizon].append(models[horizon//30-1].predict(test_data)[0])

In [None]:
# Collect the per-horizon predictions stored in `t` into a single table,
# one row per scenario number (1..300), and use that number as the index.
result = pd.DataFrame(
    data={
        'scenario_set': list(range(1, 301)),
        '30_min_horizon': t[30],
        '60_min_horizon': t[60],
        '90_min_horizon': t[90],
        '120_min_horizon': t[120],
    }
).set_index('scenario_set')

In [None]:
# Write the predictions into folder_path; the file name records the back, freq
# and interval_shift settings in effect.
result.to_csv(folder_path+f'/scaled_back{back}_freq{freq}_int{interval_shift}.csv')

selesai

In [None]:



  # Instantiate one estimator of each imported family with default settings.
  # None of these four names is referenced by any other cell visible in this notebook.
  linear = LinearRegression()
  tree = DecisionTreeRegressor()
  forest = RandomForestRegressor()
  svr = svm.SVR()

In [None]:
# Save a second copy of the predictions in the current working directory.
# NOTE(review): the file name hard-codes "back6_freq30", whereas the configuration
# cell in this notebook sets back = 3 and freq = 20 - confirm the name matches the run.
result.to_csv('./svr_back6_freq30_BENER.csv',)