# LSC Project 2021 Land Sat Temp Machine Learning (oabreu_sjaisha_gdmorrison)

In [9]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, \
                            precision_score, recall_score
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import KFold, cross_val_score
import datetime
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Let's start by reading our Landsat Images Dataframe composed of all the features we extracted/engineered using parallelization from the band data into the environment.

In [10]:
df = pd.read_csv('df.csv')
display(df.head())
display(df.dtypes)
display(df.describe)

Unnamed: 0.1,Unnamed: 0,ndvi,ndsi,ndbi,albedo,awei,gemi,LST,ndvi_lag,ndsi_lag,ndbi_lag,albedo_lag,awei_lag,gemi_lag,LST_lag,community,period,year
0,0,-0.001353,0.160216,-0.16208,35211.452719,-39322.297037,-8950.6,60442.880642,0.023026,0.10062,-0.126802,29398.857511,-46058.546948,-2635934.0,58471.296655,1.0,1.0,2013.0
1,1,0.052082,0.027594,-0.081344,20396.886124,-54411.458403,-5064028.0,55952.798809,0.056454,0.074444,-0.13258,27708.233651,-45450.828152,-8235884.0,55530.927603,2.0,1.0,2013.0
2,2,-0.000755,0.145312,-0.147978,33458.12378,-42316.319196,-2537.676,60335.224238,0.010063,0.126829,-0.140341,31230.287639,-44155.25566,-792326.3,59912.212295,3.0,1.0,2013.0
3,3,0.023069,0.083137,-0.108543,26713.035479,-52164.178781,-1567635.0,58195.901687,0.059352,0.049161,-0.111084,24086.595185,-48213.85625,-7546935.0,55973.688755,4.0,1.0,2013.0
4,4,0.021965,0.091599,-0.11681,26258.111186,-48977.51528,-1387080.0,59745.383688,0.055677,0.036199,-0.09629,22838.399906,-49854.421375,-5900835.0,56810.798083,5.0,1.0,2013.0


Unnamed: 0      int64
ndvi          float64
ndsi          float64
ndbi          float64
albedo        float64
awei          float64
gemi          float64
LST           float64
ndvi_lag      float64
ndsi_lag      float64
ndbi_lag      float64
albedo_lag    float64
awei_lag      float64
gemi_lag      float64
LST_lag       float64
community     float64
period        float64
year          float64
dtype: object

<bound method NDFrame.describe of       Unnamed: 0      ndvi      ndsi      ndbi        albedo          awei  \
0              0 -0.001353  0.160216 -0.162080  35211.452719 -39322.297037   
1              1  0.052082  0.027594 -0.081344  20396.886124 -54411.458403   
2              2 -0.000755  0.145312 -0.147978  33458.123780 -42316.319196   
3              3  0.023069  0.083137 -0.108543  26713.035479 -52164.178781   
4              4  0.021965  0.091599 -0.116810  26258.111186 -48977.515280   
...          ...       ...       ...       ...           ...           ...   
1227        1227  0.150078 -0.089786 -0.064232  13437.457902 -60905.720894   
1228        1228  0.182797 -0.104233 -0.078286  11414.670254 -55002.990956   
1229        1229  0.140152 -0.058311 -0.082249  14477.054750 -57817.026513   
1230        1230  0.130767 -0.086133 -0.051520  14604.813826 -65351.372919   
1231        1231  0.096510 -0.052797 -0.042047  13599.960836 -53408.709878   

              gemi           

In [11]:
default_split = {0: [[2015], 2016],
                 1: [[2015, 2016], 2017],
                 2: [[2015, 2016, 2017], 2018],
                 3: [[2015, 2016, 2017, 2018], 2019],
                 4: [[2015, 2016, 2017, 2018, 2019], 2020]}
                 
test_year = 2020
default_ycol = "LST"
default_selection_param = "RMSE"

In [12]:
def pipe_normalize(df, scaler=None, outputinc=False, outputcol=None):
    '''
    Normalizes dataframe (adapted from Nick Feamster's normalize function)
    Inputs:
        df (Pandas Dataframe)
        scaler (Scaler) :If scaler is not none, use given scaler's means and sds
                         to normalize (input for test set case); else, set 
                         scaler in function
        outputinc (bool): If output is included, set aside to ensure it does not
                          get normalized, default False
        outputcol (str): If output is included, name of output column, default
                         None
    Returns tuple of:
        Normalized DataFrame and scaler used to normalize DataFrame
    '''
    columns = df.columns
    if outputinc:
        outcomes = df.loc[:,outputcol]
        df = pd.DataFrame(df.drop(outputcol, axis=1))
    if scaler is None:
        scaler = StandardScaler()
        normalized_features = scaler.fit_transform(df) 
    else:
        normalized_features = scaler.transform(df)

    normalized_df = pd.DataFrame(normalized_features)
    if outputinc:
        normalized_df[outputcol] = outcomes.tolist()

    normalized_df.index=df.index
    normalized_df.columns= columns

    return normalized_df, scaler

In [13]:
def train_val_test_split(df, split = default_split, ycol = default_ycol):
    k = len(split)

    df_train = [pd.DataFrame(columns = list(df.columns))]*k
    df_val = [pd.DataFrame(columns = list(df.columns))]*k
    df_test = df[df["year"] == test_year]

    for i in range(k):
        for train_yr in split[i][0]:
            df_train[i] = df_train[i].append(df[df["year"] == train_yr])
        df_val[i] = df_val[i].append(df[df["year"] == split[i][1]])

    df_train_y = [None]*k
    df_train_x = [None]*k
    df_val_y = [None]*k
    df_val_x = [None]*k

    for i in range(k):
        df_train_y[i] = df_train[i][ycol]
        df_train_x[i] = df_train[i].drop(columns = [ycol, "year"])
        df_val_y[i] = df_val[i][ycol]
        df_val_x[i] = df_val[i].drop(columns = [ycol, "year"])
        df_test_y = df_test[ycol]
        df_test_x = df_test.drop(columns = [ycol, "year"])

    return df_train_y, df_train_x, df_val_y, df_val_x, df_test_y, df_test_x

def normalize(df_train_x, df_val_x, df_test_x):
    k = len(df_train_x)
    train_norm = []
    valid_norm = []
    for n in range(k):
        df = pd.concat((df_train_x[n], df_val_x[n]))
        df_norm, scaler = pipe_normalize(df)
        tr_norm = df_norm.loc[df_train_x[n].index,:]
        val_norm = df_norm.loc[df_val_x[n].index,:]
        train_norm.append(tr_norm)
        valid_norm.append(val_norm)
    te_norm, _ = pipe_normalize(df_test_x, scaler=scaler)
    test_norm = te_norm
    return train_norm, valid_norm, test_norm

def grid_search_time_series_cv(df_train_y, df_train_x, df_val_y, df_val_x,
                               models, p_grid, ret_int_results = False, print = False):
    k = len(df_train_y)
    val_results = [pd.DataFrame(columns = ["Model", "Params", "RMSE", "MAE", "R^2"])]*k

    for i in range(k):
        for model_key in models.keys():
            for params in p_grid[model_key]:
                if print == True:
                    print("Training model:", model_key, "|", params)
                model = models[model_key]
                model.set_params(**params)
                fitted_model = model.fit(df_train_x[i], df_train_y[i])
                test_predictions = fitted_model.predict(df_val_x[i])
                rmse = mean_squared_error(df_val_y[i], test_predictions, squared = False)
                mae = mean_absolute_error(df_val_y[i], test_predictions)
                r2 = r2_score(df_val_y[i], test_predictions)
                val_results[i] = val_results[i].append(pd.DataFrame([[model_key, params, rmse, mae, r2]],
                                                       columns = ["Model", "Params", "RMSE", "MAE", "R^2"]))

    avg_val_results = pd.DataFrame(columns = ["Model", "Params", "RMSE", "RMSE std dev", "MAE", "R^2"])
    avg_val_results["Model"] = val_results[0]["Model"]
    avg_val_results["Params"] = val_results[0]["Params"]
    avg_val_results["RMSE"] = [0]*len(val_results[0])
    avg_val_results["RMSE std dev"] = [0]*len(val_results[0])
    avg_val_results["MAE"] = [0]*len(val_results[0])
    avg_val_results["R^2"] = [0]*len(val_results[0])
    for i in range(k):
        avg_val_results["RMSE"] += val_results[i]["RMSE"]/k
        avg_val_results["MAE"] += val_results[i]["MAE"]/k
        avg_val_results["R^2"] += val_results[i]["R^2"]/k
    avg_val_results = avg_val_results.reset_index().drop(columns = ["index"])
    l0 = list(val_results[0]["RMSE"])
    l1 = list(val_results[1]["RMSE"])
    l2 = list(val_results[0]["RMSE"])
    for i in range(len(avg_val_results)):
        avg_val_results.iloc[i, [3]] = np.std([l0[i], l1[i], l2[i]])

    if ret_int_results == True:
        return avg_val_results, val_results
    else:
        return avg_val_results

def select_best_model(avg_val_results, selection_param = default_selection_param):
    best_model = avg_val_results[avg_val_results[selection_param] == avg_val_results[selection_param].min()].iloc[0]
    return best_model

def select_model(avg_val_results, row):
    chosen_model = avg_val_results.iloc[row]
    return chosen_model

def test_model(df_train_y, df_train_x, df_val_y, df_val_x, df_test_y, df_test_x,
               chosen_model, models):
    k = len(df_train_y)
    model = models[chosen_model["Model"]]
    model.set_params(**chosen_model["Params"])

    df_tv_x = pd.concat([df_train_x[k-1], df_val_x[k-1]])
    df_tv_y = pd.concat([df_train_y[k-1], df_val_y[k-1]])

    fitted_model = model.fit(df_tv_x, df_tv_y)
    test_predictions = fitted_model.predict(df_test_x)
    rmse = mean_squared_error(df_test_y, test_predictions, squared = False)
    mae = mean_absolute_error(df_test_y, test_predictions)
    r2 = r2_score(df_test_y, test_predictions)
    test_results = {"RMSE" : rmse, "MAE" : mae, "r^2" :r2}
    return test_results

def choose_and_test_model(df, models, p_grid, n_splits = 3, ycol = default_ycol, selection_param = default_selection_param):
    df_train_y, df_train_x, df_val_y, df_val_x, df_test_y, df_test_x = train_val_test_split(df, default_split, ycol)
    df_train_x, df_val_x, df_test_x = normalize(df_train_x, df_val_x, df_test_x)
    print(df_train_x)
    avg_val_results = grid_search_time_series_cv(df_train_y, df_train_x, df_val_y, df_val_x, models, p_grid)
    best = select_best_model(avg_val_results, selection_param)
    test_results = test_model(df_train_y, df_train_x, df_val_y, df_val_x, df_test_y, df_test_x, best, models)
    return test_results, best


def find_features(df, model_pd):
    dfs = grid_search.train_val_test_split(df)
    df_train_y, df_train_x, df_val_y, df_val_x, df_test_y, df_test_x = dfs
    df_train_x, df_val_x, df_test_x = grid_search.normalize(df_train_x,
                                                            df_val_x, df_test_x)
    k = len(df_train_x)
    df_tv_x = [pd.DataFrame(columns = list(df_train_x[0].columns))] * k
    df_tv_y = [pd.Series()]*k

    for i in range(k):
        df_tv_x[i] = df_tv_x[i].append(df_train_x[i]).append(df_val_x[i])
        df_tv_y[i] = df_tv_y[i].append(df_train_y[i]).append(df_val_y[i])

    model = models[model_pd["Model"]]
    params = model_pd["Params"]
    model.set_params(**params)
    model.fit(df_tv_x[k-1], df_tv_y[k-1])
    if model_pd["Model"] == "RandomForestRegressor":
        features = model.feature_importances_
    else:
        features = model.coef_
        print(model.intercept_)
    n = len(features)
    coefs = pd.DataFrame(np.round(features.reshape(n, 1), decimals=2),
                         index=df_tv_x[k-1].columns, columns=["coef"])
    predictions = model.predict(df_test_x)
    results = pipeline.evaluate(df_test_y, predictions)

    return coefs.sort_values(by="coef",axis=0, ascending=False), results

In [14]:
models = {"LinearRegression" : LinearRegression(),
          "Ridge" : Ridge(),
          "Lasso" : Lasso(),
          "ElasticNet" : ElasticNet()}

p_grid = {"LinearRegression" : [{}],
          "Ridge" : [{"alpha" : x} for x in [.1, .5, 1, 5, 10, 50, 100, 500, 1000]],
          "Lasso" : [{"alpha" : x} for x in [.1, .5, 1, 5, 10, 50, 100, 500, 1000]],
          "ElasticNet" : [{"alpha" : x,
                         "l1_ratio" : y} 
                          for x in [.1, 1, 10, 100, 1000] 
                          for y in [.1, .3, .5, .7, .9]]}

In [15]:
top_models = []
test_results, best = choose_and_test_model(df, models, p_grid)
top_models.append(test_results)
display(top_models)

[     Unnamed: 0      ndvi      ndsi      ndbi    albedo      awei      gemi  \
462   -1.235573 -0.127705  0.190835  0.017366 -0.344679  0.930189  0.376215   
463   -1.232360  0.718575 -0.461630 -0.622668 -0.650827  0.731863 -0.459134   
464   -1.229146  1.275649 -1.000442 -0.866223 -0.808674  0.077350 -1.226386   
465   -1.225933  1.124804 -0.797935 -0.855318 -0.809771  0.622364 -0.883591   
466   -1.222719 -0.044657 -0.230762  0.511884 -0.771469  1.542522  0.583847   
..          ...       ...       ...       ...       ...       ...       ...   
611   -0.756768  1.055102 -0.987760 -0.541602 -0.921439  0.580971 -0.695138   
612   -0.753555  1.649440 -1.318038 -1.216190 -0.919708 -0.058382 -1.714369   
613   -0.750341  1.675178 -1.165867 -1.477933 -0.989254  0.515362 -1.581334   
614   -0.747128  0.910303 -1.280157  0.305334 -0.371683 -2.106850 -1.302963   
615   -0.743915  0.213708 -0.520332  0.477203 -0.730386  0.764488  0.234275   

     ndvi_lag  ndsi_lag  ndbi_lag  albedo_lag  awe

[{'RMSE': 2234.5921950901215,
  'MAE': 1746.3930809121025,
  'r^2': 0.8055356557070152}]