#Import libraries

In [1]:
import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sb
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import gc

#Load data

In [2]:
train=pd.read_table('/Users/nguyenhoangngocha21/Documents/GitHub//WiDS-2023/train_data.csv', delimiter=',')
test=pd.read_table('/Users/nguyenhoangngocha21/Documents/GitHub//WiDS-2023/test_data.csv', delimiter=',')

In [3]:
train=train.drop('index', axis=1) #drop index column
test=test.drop('index', axis=1)

In [4]:
train.head()

Unnamed: 0,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,0.0,0.833333,9/1/14,237.0,29.02,31.64,29.57,30.73,29.71,31.52,...,-27.68,-37.21,8.32,9.56,-2.03,48.13,28.09,-13.5,11.9,4.58
1,0.0,0.833333,9/2/14,228.9,29.02,31.64,29.57,30.73,29.71,31.52,...,-21.13,-36.57,8.77,21.17,4.44,48.6,27.41,-23.77,15.44,3.42
2,0.0,0.833333,9/3/14,220.69,29.02,31.64,29.57,30.73,29.71,31.52,...,-10.72,-34.16,6.99,32.16,5.01,48.53,19.21,-33.16,15.11,4.82
3,0.0,0.833333,9/4/14,225.28,29.02,31.64,29.57,30.73,29.71,31.52,...,0.33,-31.04,6.17,39.66,-1.41,50.59,8.29,-37.22,18.24,9.74
4,0.0,0.833333,9/5/14,237.24,29.02,31.64,29.57,30.73,29.71,31.52,...,9.83,-31.8,7.47,38.62,-5.21,54.73,-2.58,-42.3,21.91,10.95


In [5]:
print('Train:', train.shape, "| Test:", test.shape)

Train: (375734, 245) | Test: (31354, 244)


In [6]:
print(train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375734 entries, 0 to 375733
Columns: 245 entries, lat to wind-vwnd-925-2010-20
dtypes: float64(240), int64(3), object(2)
memory usage: 702.3+ MB
None


#EDA

In [7]:
#print columns with categorical data
train.select_dtypes(include='object').columns

Index(['startdate', 'climateregions__climateregion'], dtype='object')

# Data preprocessing

## Fill missing values

In [8]:
#Find the Null-containing columns  
null_cols = train.columns[train.isna().any()].tolist()
null_cols

['nmme0-tmp2m-34w__ccsm30',
 'nmme-tmp2m-56w__ccsm3',
 'nmme-prate-34w__ccsm3',
 'nmme0-prate-56w__ccsm30',
 'nmme0-prate-34w__ccsm30',
 'nmme-prate-56w__ccsm3',
 'nmme-tmp2m-34w__ccsm3',
 'ccsm30']

In [9]:
null_cols_test = test.columns[test.isna().any()].tolist()

In [10]:
#impute (fill) missing values using sklearn's Simple Imputer (fill the mean value along each column)
for col in null_cols: 
    train[col] = train[col].interpolate()
train.columns[train.isna().any()].tolist()

[]

# Feature engineering

## Encoding regions

In [11]:
#rewrite as function later
def encode_regions(train, test): 
    encoder = LabelEncoder()
    new_climate_regions = encoder.fit_transform(train['climateregions__climateregion'])
    train['climateregions__climateregion'] = new_climate_regions
    test['climateregions__climateregion'] = encoder.transform(test['climateregions__climateregion'])
    cli_reg_map = {index: new_climate_regions for index, new_climate_regions in 
                      enumerate(encoder.classes_)}
    print(cli_reg_map)
    #Check for difference climate regions between train and test sets
    train_regions = train.climateregions__climateregion.value_counts
    test_regions = train.climateregions__climateregion.value_counts
    #print(train_regions == test_regions) #train and test on same climate regions
    #train and test set cover resembling climate regions
    return train, test

In [12]:
encode_regions(train, test)
train['climateregions__climateregion'].unique()

{0: 'BSh', 1: 'BSk', 2: 'BWh', 3: 'BWk', 4: 'Cfa', 5: 'Cfb', 6: 'Csa', 7: 'Csb', 8: 'Dfa', 9: 'Dfb', 10: 'Dfc', 11: 'Dsb', 12: 'Dsc', 13: 'Dwa', 14: 'Dwb'}


array([ 0,  4,  1,  3,  2,  6,  7,  5,  9, 12, 10,  8, 11, 13, 14])

## Location encoding

Encode location using latitude and longitude, round values to 14th decimal digits.
Reference: `https://www.kaggle.com/code/flaviafelicioni/wids-2023-different-locations-train-test-solved`

In [13]:
def group_loc(train, test):
    #Scale lat and lon to 14th decimal digit
    scale = 14
    train.loc[:,'lat'] = round(train.lat, scale)
    train.loc[:,'lon'] = round(train.lon, scale)
    test.loc[:,'lat'] = round(test.lat, scale)
    test.loc[:,'lon'] = round(test.lon, scale)
    # Concatenate train and test data
    all_df = pd.concat([train, test], axis=0)
    # Create new feature
    all_df['loc_group'] = all_df.groupby(['lat','lon']).ngroup()
    print(f'{all_df.loc_group.nunique()} unique locations')
    
    # Split back up
    train = all_df.iloc[:len(train)]
    test = all_df.iloc[len(train):]

    #Check result
    print('Locations in train that are not in test')
    locations_train=list(train.loc_group.unique())
    locations_test=list(test.loc_group.unique())
    result_1 = list(set(locations_train).difference(locations_test))
    print(result_1)

    print('Locations in test that are not in train')
    result_2=list(set(locations_test).difference(locations_train))
    print(result_2)
    
    return train, test

In [14]:
group_loc(train, test)

514 unique locations
Locations in train that are not in test
[]
Locations in test that are not in train
[]


(        lat       lon startdate  contest-pevpr-sfc-gauss-14d__pevpr  \
 0       0.0  0.833333    9/1/14                              237.00   
 1       0.0  0.833333    9/2/14                              228.90   
 2       0.0  0.833333    9/3/14                              220.69   
 3       0.0  0.833333    9/4/14                              225.28   
 4       0.0  0.833333    9/5/14                              237.24   
 ...     ...       ...       ...                                 ...   
 375729  1.0  0.866667   8/27/16                              312.05   
 375730  1.0  0.866667   8/28/16                              305.82   
 375731  1.0  0.866667   8/29/16                              311.62   
 375732  1.0  0.866667   8/30/16                              304.54   
 375733  1.0  0.866667   8/31/16                              295.29   
 
         nmme0-tmp2m-34w__cancm30  nmme0-tmp2m-34w__cancm40  \
 0                          29.02                     31.64   
 1      

## Cyclical date conversion

In [15]:
#add season
def add_season(df):
  month_to_season = {
      1: 0,
      2: 0,
      3: 1,
      4: 1,
      5: 1,
      6: 2,
      7: 2,
      8: 2, 
      9: 3, 
      10: 3,
      11: 3,
      12: 0
  }
  df['season'] = df['month'].apply(lambda x: month_to_season[x])

In [16]:
def convert_time(df):
    df['year'] = df['DATE'].dt.year
    df['month'] = df['DATE'].dt.month
    df['day'] = df['DATE'].dt.day_of_year
    df['week'] = df['DATE'].dt.isocalendar().week
    df['quarter'] = df['DATE'].dt.quarter
    return df

In [17]:
from sklearn.preprocessing import FunctionTransformer

def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))

def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

def encode_cyclical(df):
    # encode the day with a period of 365
    df['day_of_year_sin'] = sin_transformer(365).fit_transform(df['day'])
    df['day_of_year_cos'] = cos_transformer(365).fit_transform(df['day'])

    # encode the week with a period of 52
    df['week_sin'] = sin_transformer(52).fit_transform(df['week'])
    df['week_cos'] = cos_transformer(52).fit_transform(df['week'])

    # encode the month with a period of 12
    df['month_sin'] = sin_transformer(12).fit_transform(df['month'])
    df['month_cos'] = cos_transformer(12).fit_transform(df['month'])

    # encode the season with a period of 4
    df['season_sin'] = sin_transformer(4).fit_transform(df['season'])
    df['season_cos'] = cos_transformer(4).fit_transform(df['season'])
    
    # encode the quarter with a period of 4
    df['quarter_sin'] = sin_transformer(4).fit_transform(df['quarter'])
    df['quarter_cos'] = cos_transformer(4).fit_transform(df['quarter'])

In [18]:
#Encode start date
train['DATE'] = pd.to_datetime(train['startdate'])
test['DATE'] = pd.to_datetime(test['startdate'])

convert_time(train)
add_season(train)

convert_time(test)
add_season(test)

encode_cyclical(train)
encode_cyclical(test)

In [19]:
train.head(10)

Unnamed: 0,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,...,day_of_year_sin,day_of_year_cos,week_sin,week_cos,month_sin,month_cos,season_sin,season_cos,quarter_sin,quarter_cos
0,0.0,0.833333,9/1/14,237.0,29.02,31.64,29.57,30.73,29.71,31.52,...,-0.871706,-0.490029,-0.935016,-0.354605,-1.0,-1.83697e-16,-1.0,-1.83697e-16,-1.0,-1.83697e-16
1,0.0,0.833333,9/2/14,228.9,29.02,31.64,29.57,30.73,29.71,31.52,...,-0.880012,-0.474951,-0.935016,-0.354605,-1.0,-1.83697e-16,-1.0,-1.83697e-16,-1.0,-1.83697e-16
2,0.0,0.833333,9/3/14,220.69,29.02,31.64,29.57,30.73,29.71,31.52,...,-0.888057,-0.459733,-0.935016,-0.354605,-1.0,-1.83697e-16,-1.0,-1.83697e-16,-1.0,-1.83697e-16
3,0.0,0.833333,9/4/14,225.28,29.02,31.64,29.57,30.73,29.71,31.52,...,-0.895839,-0.444378,-0.935016,-0.354605,-1.0,-1.83697e-16,-1.0,-1.83697e-16,-1.0,-1.83697e-16
4,0.0,0.833333,9/5/14,237.24,29.02,31.64,29.57,30.73,29.71,31.52,...,-0.903356,-0.428892,-0.935016,-0.354605,-1.0,-1.83697e-16,-1.0,-1.83697e-16,-1.0,-1.83697e-16
5,0.0,0.833333,9/6/14,237.87,29.02,31.64,29.57,30.73,29.71,31.52,...,-0.910605,-0.413279,-0.935016,-0.354605,-1.0,-1.83697e-16,-1.0,-1.83697e-16,-1.0,-1.83697e-16
6,0.0,0.833333,9/7/14,236.36,29.02,31.64,29.57,30.73,29.71,31.52,...,-0.917584,-0.397543,-0.935016,-0.354605,-1.0,-1.83697e-16,-1.0,-1.83697e-16,-1.0,-1.83697e-16
7,0.0,0.833333,9/8/14,233.36,29.02,31.64,29.57,30.73,29.71,31.52,...,-0.924291,-0.381689,-0.970942,-0.239316,-1.0,-1.83697e-16,-1.0,-1.83697e-16,-1.0,-1.83697e-16
8,0.0,0.833333,9/9/14,233.82,29.02,31.64,29.57,30.73,29.71,31.52,...,-0.930724,-0.365723,-0.970942,-0.239316,-1.0,-1.83697e-16,-1.0,-1.83697e-16,-1.0,-1.83697e-16
9,0.0,0.833333,9/10/14,229.74,29.02,31.64,29.57,30.73,29.71,31.52,...,-0.936881,-0.349647,-0.970942,-0.239316,-1.0,-1.83697e-16,-1.0,-1.83697e-16,-1.0,-1.83697e-16


In [32]:
#Split x, y 
target = 'contest-tmp2m-14d__tmp2m'
drop_cols = ['DATE', 'startdate', target, 'week']
y = train[target]
X = train.drop([x for x in drop_cols if x in train.columns], axis=1)
X_test = test.drop([x for x in drop_cols if x in test.columns], axis=1)

In [33]:
#defining metric
def rmse(actual, predicted):
    return mean_squared_error(actual, predicted, squared=False)

# Modeling & Cross Validation 

In [34]:
#Train tesst
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)
print(f'Train_shape: {X_train.shape}    |   Val_shape: {X_val.shape}    |   Test_shape: {X_test.shape}')

Train_shape: (251741, 258)    |   Val_shape: (123993, 258)    |   Test_shape: (31354, 258)


In [23]:
#pip install catboost

In [24]:
#pip install xgboost

In [25]:
'''
import lightgbm as lgb

# set up parameters for LightGBM
params = {'boosting_type': 'gbdt',
          'objective': 'regression',
          'metric': 'rmse',
          'max_depth': 4,
          'num_leaves': 31,
          'learning_rate': 0.05,
          'feature_fraction': 0.9,
          'bagging_fraction': 0.8,
          'bagging_freq': 5,
          'early_stopping_round': 50,
          'n_estimators': 15000}

reg_lgb = lgb.LGBMRegressor(**params)

reg_lgb.fit(X_train, y_train, eval_set=(X_test, y_val),verbose=100)

y_pred_cat = bst2.predict(X_test)

y_pred_lgb = reg_lgb.predict(X_test)
'''

"\nimport lightgbm as lgb\n\n# set up parameters for LightGBM\nparams = {'boosting_type': 'gbdt',\n          'objective': 'regression',\n          'metric': 'rmse',\n          'max_depth': 4,\n          'num_leaves': 31,\n          'learning_rate': 0.05,\n          'feature_fraction': 0.9,\n          'bagging_fraction': 0.8,\n          'bagging_freq': 5,\n          'early_stopping_round': 50,\n          'n_estimators': 15000}\n\nreg_lgb = lgb.LGBMRegressor(**params)\n\nreg_lgb.fit(X_train, y_train, eval_set=(X_test, y_val),verbose=100)\n\ny_pred_cat = bst2.predict(X_test)\n\ny_pred_lgb = reg_lgb.predict(X_test)\n"

In [36]:
import xgboost as xgb
'''
reg_xgb = xgb.XGBRegressor(base_score=0.5, 
                           n_estimators=15000, #chon so nao be thoiiii
                           objective='reg:linear',
                           max_depth=4,
                           early_stopping_rounds=100,
                           learning_rate=0.01)

reg_xgb.fit(X_train, y_train, eval_set=((X_val, y_val),), verbose=100)

y_pred_xgb = reg_xgb.predict(X_val)
'''

[0]	validation_0-rmse:14.88583
[100]	validation_0-rmse:5.89425
[200]	validation_0-rmse:2.81578
[300]	validation_0-rmse:1.86675
[400]	validation_0-rmse:1.55411
[500]	validation_0-rmse:1.40694
[600]	validation_0-rmse:1.32627
[700]	validation_0-rmse:1.26963
[800]	validation_0-rmse:1.22734
[900]	validation_0-rmse:1.18661
[1000]	validation_0-rmse:1.15332
[1100]	validation_0-rmse:1.12401
[1200]	validation_0-rmse:1.09827
[1300]	validation_0-rmse:1.07509
[1400]	validation_0-rmse:1.05302
[1500]	validation_0-rmse:1.03341
[1600]	validation_0-rmse:1.01332
[1700]	validation_0-rmse:0.99758
[1800]	validation_0-rmse:0.98000
[1900]	validation_0-rmse:0.96475
[2000]	validation_0-rmse:0.95201
[2100]	validation_0-rmse:0.93796
[2200]	validation_0-rmse:0.92439
[2300]	validation_0-rmse:0.91235
[2400]	validation_0-rmse:0.90092
[2500]	validation_0-rmse:0.88979


KeyboardInterrupt: 

# Ensembling prediction

# Hyperparameter tuning

# Train on full data

#Submission

#Reference