<a href="https://colab.research.google.com/github/mateopolancecofficial/DeepLearningProjects/blob/main/Regression/Vineyard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display

from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import PCA

In [2]:
pip install -q -U keras-tuner


[?25l[K     |███▍                            | 10kB 16.3MB/s eta 0:00:01[K     |██████▉                         | 20kB 22.4MB/s eta 0:00:01[K     |██████████▏                     | 30kB 27.0MB/s eta 0:00:01[K     |█████████████▋                  | 40kB 29.1MB/s eta 0:00:01[K     |█████████████████               | 51kB 10.0MB/s eta 0:00:01[K     |████████████████████▍           | 61kB 10.3MB/s eta 0:00:01[K     |███████████████████████▊        | 71kB 8.3MB/s eta 0:00:01[K     |███████████████████████████▏    | 81kB 8.9MB/s eta 0:00:01[K     |██████████████████████████████▋ | 92kB 8.7MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 5.0MB/s 
[?25h  Building wheel for kt-legacy (setup.py) ... [?25l[?25hdone


In [6]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Input
)

import kerastuner as kt
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [7]:
# Choose the distribution strategy used by later cells.
# `tf.test.is_gpu_available()` is deprecated in TensorFlow 2.x; the supported
# check is `tf.config.list_physical_devices('GPU')`.
if tf.config.list_physical_devices('GPU'):
  strategy = tf.distribute.MirroredStrategy()
  print('Using GPU')
else:
  # Fall back to the default strategy so `strategy` is always defined,
  # even on a machine without a GPU.
  strategy = tf.distribute.get_strategy()
  print('Using CPU')

In [9]:
# Column names assigned to the CSV data; the last entry is used as the target.
columns = ['x0', 'x1', 'x2', 'x3', 'u']

# Declared range of every input feature.
feature_ranges = {
    "x0": [1, 17],
    "x1": [-9, -4],
    "x2": [0, 3],
    "x3": [0, 50],
}

# Path to the data source.
source_path = "./Data/podaci.csv"

# Split fractions for the train, test and validation subsets.
train_size = 0.8
test_size = 0.2
val_size = 0.2

# Number of new samples to generate.
f_range = 100

In [11]:
# Load the source CSV into a DataFrame, using `columns` as the column names.
# NOTE(review): when `names=` is given pandas reads the first row as data,
# so this assumes the file has no header row -- confirm against the file.
df = pd.read_csv(source_path, names=columns)

### Prepare datasets
All relevant functions are tested in the exploration notebook.

In [12]:
def split_data(df: pd.DataFrame, features: list, target: list, train_size: float,
               test_size: float, val_size: float):
  """
  Split dataset on train, test and validation subsets.

  The data is first split into train and test parts using train_size and
  test_size. The train part is then split again, and val_size is the
  fraction of that train part that becomes the validation subset.

  :param df:          input pandas DataFrame
  :param features:    list of input features names
  :param target:      list of target column names
  :param train_size:  fraction of train size (first split)
  :param test_size:   fraction of test size (first split)
  :param val_size:    fraction of the train part used for validation
  :return:            dictionary, keys=names of subsets
                      (x_train, y_train, x_val, y_val, x_test, y_test),
                      values=DataFrame
  """

  # shuffle dataset
  df = df.sample(frac=1)

  # split on test and train set
  x_train, x_test, y_train, y_test = train_test_split(
      df[features], df[target], test_size=test_size, train_size=train_size)

  # split train set on train and validation subsets.
  # Only the validation fraction is passed: the remainder stays in train.
  # Re-using the global train_size here would silently drop rows whenever
  # train_size + val_size < 1 and raise whenever the sum exceeds 1.
  x_train, x_val, y_train, y_val = train_test_split(
      x_train, y_train, test_size=val_size)

  dataset_dict = {
      'x_train': x_train,
      'y_train': y_train,
      'x_val': x_val,
      'y_val': y_val,
      'x_test': x_test,
      'y_test': y_test
  }

  return dataset_dict

In [13]:
def cause_relevant_features(df: pd.DataFrame, features: list, f_range: int):
  """
  Generate new data based on calculated statistics.

  For every feature, f_range values are drawn uniformly from the interval
  [mean - 1.5 * std, mean + 1.5 * std], where mean and std are computed
  from that feature's column in df.

  :param df:          input pandas DataFrame
  :param features:    list of input features names
  :param f_range:     int, num of new samples
  :return new_df:     pandas DataFrame with new data, one column per feature
  """

  new_data = {}

  for name in features:
    std = df[name].std()
    mean = df[name].mean()

    lower = mean - (1.5 * std)
    upper = mean + (1.5 * std)

    # Store the flat 1-D sample directly; wrapping every value in a
    # one-element list and unwrapping it afterwards is unnecessary.
    new_data[name] = np.random.uniform(lower, upper, f_range)

  new_df = pd.DataFrame(new_data)

  return new_df

In [14]:
def get_target_values(df, target):
  """
  Return values of target variable based on input feature values.

  The target is computed as x0 + x1**2 + x2 + 2*x3 and stored in the column
  named by target[0]. The column is added to df in place and the same
  object is returned.

  :param df:               pandas dataframe with input features; must contain
                           the columns 'x0', 'x1', 'x2' and 'x3'
  :param target:           list of target column names; only the first is used
  :return target_result:   pandas dataframe with input features and target values
  """

  # Vectorised column arithmetic instead of a row-wise apply: same formula,
  # no Python-level loop, and it also works for a frame with no rows.
  df[target[0]] = df['x0'] + (df['x1'] ** 2) + df['x2'] + (2 * df['x3'])

  return df

In [15]:
def create_extra_train_dataset(df, df_new, features=None, target=None):
  """
  Extend train dataset with new data.

  The rows of df_new are appended below the rows of df and the result gets a
  fresh 0..n-1 index.

  :param df:               pandas dataframe with input features
  :param df_new:           pandas dataframe with input augmented features
  :param features:         optional list of input features names
  :param target:           optional list of target variables names
  :return df_result:       pandas dataframe with input features and target values;
                           columns are features + target when given, otherwise
                           the columns of df in their original order
  """

  # The column selection used to read `features` and `target` from the
  # enclosing scope, where they are not guaranteed to exist. Take them as
  # optional arguments and default to the columns of the base frame.
  if features is None and target is None:
    selected = list(df.columns)
  else:
    selected = list(features or []) + list(target or [])

  # ignore_index renumbers the rows without creating a stray 'index' column
  return pd.concat([df, df_new], axis=0, ignore_index=True)[selected]

In [18]:
def normalize_features(df, features, target):
  """
  Use Yeo-Johnson transform to normalize input features and target variable.

  Two transformers (one for the features, one for the target) are fitted on
  the 'train_data' entry only. Every other entry of df is transformed with
  those fitted transformers.

  :param df:        dict with input pandas dataframes; must contain the key
                    'train_data' unless it is empty
  :param features:  list of input features names
  :param target:    list of target variables names
  :return out_data: dict, data = list of pandas dataframes with transformed
                          data, in the same order as the keys of df,
                          transformers = [features transformer, target transformer]
  :raises KeyError: if df is not empty and has no 'train_data' entry
  """

  out_data = {'data': [], 'transformers': []}

  # an empty dict yields an empty result
  if len(df) == 0:
    return out_data

  if 'train_data' not in df:
    raise KeyError("normalize_features requires a 'train_data' entry to fit the transformers")

  # Fit both transformers on the train subset before touching any other
  # subset, so the result does not depend on 'train_data' being the first key.
  transformer_f = PowerTransformer(method='yeo-johnson', standardize=True)
  train_x = transformer_f.fit_transform(df['train_data'][features])

  transformer_t = PowerTransformer(method='yeo-johnson', standardize=True)
  train_y = transformer_t.fit_transform(df['train_data'][target])

  for key in df:
    if key == 'train_data':
      # reuse the output of fit_transform
      x_trans, y_trans = train_x, train_y
    else:
      # validation, test and any other subset: transform only
      x_trans = transformer_f.transform(df[key][features])
      y_trans = transformer_t.transform(df[key][target])

    # concatenate transformed features with the transformed target variable
    df_x = pd.DataFrame(x_trans, columns=features)
    df_y = pd.DataFrame(y_trans, columns=target)
    out_data['data'].append(pd.concat([df_x, df_y], axis=1))

  out_data['transformers'] = [transformer_f, transformer_t]

  return out_data

In [21]:
def pca(x):
  """
  Derive new features from the input features with a PCA decomposition.
  :param x:                 pandas dataframe with input features
  :return train_pca, pca:   pandas dataframe with the principal components
                            (columns PC1, PC2, ...), fitted PCA object
  """

  # Fit the decomposition and project x onto its components
  decomposition = PCA()
  projected = decomposition.fit_transform(x)

  # Name the components PC1..PCn
  component_names = []
  for i in range(projected.shape[1]):
    component_names.append(f"PC{i+1}")

  # Wrap the projected values in a dataframe
  train_pca = pd.DataFrame(projected, columns=component_names)

  return train_pca, decomposition

In [None]:
def run_data_prep():
  """
  Prepare data sets.

  Splits the module-level `df` into train, validation and test subsets,
  normalizes them, then builds an extended train set with f_range generated
  samples and normalizes that as well.

  :return: tuple (out_data, out_new_data);
           out_data     = result of normalize_features for the train,
                          validation and test subsets,
           out_new_data = result of normalize_features for the extended
                          train subset
  """
  df_data = {}
  df_extend_data = {}

  # The last column is the target, the rest are features. Slice instead of
  # pop so the module-level `columns` list is not modified and the function
  # gives the same result when called again.
  target = [columns[-1]]
  features = columns[:-1]

  # split dataset on train, validation and test subsets
  dataset = split_data(df, features, target, train_size, test_size, val_size)

  # create train, validation and test datasets (features and target side by side)
  for subset in ('train', 'val', 'test'):
    x_part = dataset['x_' + subset].reset_index(drop=True)[features]
    y_part = dataset['y_' + subset].reset_index(drop=True)[target]
    df_data[subset + '_data'] = pd.concat([x_part, y_part], axis=1)

  train_data = df_data['train_data']

  # normalize input data
  out_data = normalize_features(df_data, features, target)

  # create new dataset with more input samples
  new_data = cause_relevant_features(dataset['x_train'], features, f_range)
  new_data = get_target_values(new_data, target)
  new_data = new_data.reset_index(drop=True)[features + target]
  train_new_data = create_extra_train_dataset(train_data, new_data)
  train_new_data = train_new_data.reset_index(drop=True)[features + target]
  df_extend_data['train_data'] = train_new_data

  # normalize the extended train data (only the train subset is included here)
  out_new_data = normalize_features(df_extend_data, features, target)

  # hand the prepared data back to the caller instead of discarding it
  return out_data, out_new_data
