In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

In [None]:
# Read the cleaned Atlantic dataset from CSV
atlantic_data = pd.read_csv('/content/atlantic_cleaned2.0.csv')

# Prediction target: the maximum sustained wind column
y = atlantic_data['maximum_sustained_wind_knots']

# Predictors: every remaining column whose dtype is not `object`
atlantic_predictors = atlantic_data.drop(columns=['maximum_sustained_wind_knots'])
X = atlantic_predictors.select_dtypes(exclude=['object'])

# 80/20 train/validation split with a fixed seed so the split is reproducible
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=10, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, y_train :
        Features and target used to fit the model.
    X_valid, y_valid :
        Features and target used to evaluate the fitted model.
    n_estimators : int, default 10
        Number of trees in the forest. The default matches the value that
        was previously hard-coded, so existing calls score identically.
    random_state : int, default 0
        Seed passed to the forest so repeated calls give the same score.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid`` (lower is better).
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [None]:
# Columns of the training split that contain at least one missing value.
# Informational only: the imputer below is fitted on, and applied to, every column.
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]

# Imputation with SimpleImputer's default settings (mean strategy per the
# scikit-learn docs). The imputer is fitted on the training split only and then
# re-used on the validation split, so no validation statistics leak into training.
my_imputer = SimpleImputer()

# The imputer returns plain arrays, so rebuild the DataFrames with the original
# column names AND the original row index. Keeping the index means the imputed
# frames still align label-for-label with y_train / y_valid; a fresh RangeIndex
# would silently misalign any later pandas operation that joins them.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train),
                               columns=X_train.columns,
                               index=X_train.index)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid),
                               columns=X_valid.columns,
                               index=X_valid.index)

print("MAE:")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))


MAE from Approach 2 (Imputation):
2.7506024096385544


In [None]:
# Gradient-boosted regressor trained on the un-imputed features (X_train is the
# frame produced by the split, not the imputed copy).
# `early_stopping_rounds` is configured on the estimator: passing it to `fit()`
# was deprecated in XGBoost 1.6 and removed in 2.0, where it raises a TypeError.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4,
                        early_stopping_rounds=5)

# Training stops once the eval-set metric has not improved for 5 consecutive
# rounds, so fewer than the 1000 requested trees may actually be built.
# NOTE(review): the early-stopping set is the same split used for validation,
# so any score computed on X_valid afterwards is optimistically biased.
my_model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)

# Last expression of the cell: display the validation predictions
my_model.predict(X_valid)



array([25.983599, 27.17533 , 40.614277, ..., 41.820347, 28.336857,
       42.328716], dtype=float32)