In [None]:
import pandas as pd
import sys

sys.path.append('../../')

from src.modules.pipeline.balancing import Balancing
from src.modules.pipeline.cross_validation import CrossValidation
from src.modules.pipeline.finetunning import Finetunning
from src.modules.preprocess.preprocess import Preprocess
from src.modules.util.util import DataUtil as util
from src.modules.util.constant import Features, Model, ModelName as mn

## Loading Data

In [None]:
# Destination of the CSV written by the Balancing section further down.
BALANCE_PATH = '../data/balanced/balanced_data.csv'

# Train and test splits, read from CSV files addressed relative to the
# notebook's working directory.
TRAIN = pd.read_csv('../data/train_balnced_no_test.csv')
TEST = pd.read_csv('../data/test_umbalanced.csv')

In [None]:
# Placeholder for the raw dataset that the Preprocess section copies into
# `data`; nothing is loaded into it here.
data_raw = None

# Short aliases for the entries exposed by `Model`.
NB, LG, DT, RF, GB, CV = (
    Model.NB,
    Model.LG,
    Model.DT,
    Model.RF,
    Model.GB,
    Model.CV,
)

# Build the five artifacts that the model cells below consume, from the
# TRAIN and TEST frames loaded above.
(
    x_train,
    y_train,
    x_test,
    y_test,
    classes,
) = util.get_classification_artifacts(TRAIN, TEST)

## Preprocess

In [None]:
# Work on a copy so the preprocessing steps below operate on `data` rather
# than on `data_raw` itself.
# `data_raw` is initialised to None earlier in the notebook, and
# `pd.DataFrame.copy(None)` fails with an obscure AttributeError; fail with an
# actionable message instead.
if data_raw is None:
    raise ValueError(
        "data_raw is None: load the raw dataset into `data_raw` "
        "before running the Preprocess section."
    )
data = data_raw.copy()

In [None]:
# Remove the features listed in Features.train_test_dropped from both splits.
# NOTE(review): `train` and `test` are not referenced by any later cell visible
# in this notebook (the model cells use x_train/x_test built from TRAIN/TEST)
# -- confirm whether they are still needed.
dropped_features = Features.train_test_dropped
train = util.remove_features(TRAIN, dropped_features)
test = util.remove_features(TEST, dropped_features)

In [None]:
# Fill the null values of the free-text features with an empty string.
for text_feature in (Features.description, Features.summary):
    data = Preprocess.fill_null_values(data, text_feature)

# Remove the features that have mostly empty values.
# NOTE(review): the meaning of the trailing `True` flag is defined by
# DataUtil.remove_features -- confirm there.
data = util.remove_features(data, Features.features, True)

# Turn the non-numeric features into dummy features.
data = Preprocess.get_dummy_feature(data)

## Balancing 

##### To balance the data, the following cells must be executed.

In [None]:
# Oversample the preprocessed `data` frame via Balancing.oversample and keep
# the result for the save cell below.
balanced_data = Balancing.oversample(data)

##### Saving balanced data into a CSV file

In [None]:
# Write the balanced data to BALANCE_PATH as CSV, without the index column.
balanced_data.to_csv(BALANCE_PATH, index=False)

## Machine Learning Models

#### Gaussian Naive Bayes

In [None]:
# Run DataUtil.get_metrics for the Naive Bayes model; the returned pair is
# unpacked into NB_metrics and NB_time, which the final cell hands to
# save_result.
NB_metrics, NB_time = util.get_metrics(
    NB, mn.NB,
    x_train, y_train,
    x_test, y_test,
    classes,
)

#### Logistic Regression

In [None]:
# Run DataUtil.get_metrics for the Logistic Regression model; the returned
# pair is unpacked into LG_metrics and LG_time, which the final cell hands to
# save_result.
LG_metrics, LG_time = util.get_metrics(
    LG, mn.LG,
    x_train, y_train,
    x_test, y_test,
    classes,
)

#### Decision Tree

In [None]:
# Run DataUtil.get_metrics for the Decision Tree model; the returned pair is
# unpacked into DT_metrics and DT_time, which the final cell hands to
# save_result.
DT_metrics, DT_time = util.get_metrics(
    DT, mn.DT,
    x_train, y_train,
    x_test, y_test,
    classes,
)

#### Random Forest

In [None]:
# Run DataUtil.get_metrics for the Random Forest model; the returned pair is
# unpacked into RF_metrics and RF_time, which the final cell hands to
# save_result.
RF_metrics, RF_time = util.get_metrics(
    RF, mn.RF,
    x_train, y_train,
    x_test, y_test,
    classes,
)

#### Gradient Boosting

In [None]:
# Run DataUtil.get_metrics for the Gradient Boosting model; the returned pair
# is unpacked into GB_metrics and GB_time, which the final cell hands to
# save_result.
GB_metrics, GB_time = util.get_metrics(
    GB, mn.GB,
    x_train, y_train,
    x_test, y_test,
    classes,
)

## Cross Validation

#### Gaussian Naive Bayes

In [None]:
# Cross-validation result for the Naive Bayes model; only x_train and y_train
# are passed, so the test split is not involved in this call.
NB_cv = CrossValidation.get_cross_validation_result(NB, x_train, y_train)

#### Logistic Regression

In [None]:
# Cross-validation result for the Logistic Regression model; only x_train and
# y_train are passed, so the test split is not involved in this call.
LG_cv = CrossValidation.get_cross_validation_result(LG, x_train, y_train)

#### Decision Tree

In [None]:
# Cross-validation result for the Decision Tree model; only x_train and
# y_train are passed, so the test split is not involved in this call.
DT_cv = CrossValidation.get_cross_validation_result(DT, x_train, y_train)

#### Random Forest

In [None]:
# Cross-validation result for the Random Forest model; only x_train and
# y_train are passed, so the test split is not involved in this call.
RF_cv = CrossValidation.get_cross_validation_result(RF, x_train, y_train)

#### Gradient Boosting

In [None]:
# Cross-validation result for the Gradient Boosting model; only x_train and
# y_train are passed, so the test split is not involved in this call.
GB_cv = CrossValidation.get_cross_validation_result(GB, x_train, y_train)

## Fine-tuning

#### Gaussian Naive Bayes

In [None]:
# Fine-tune the Naive Bayes model. The last two arguments are .pkl paths;
# judging by the file names they receive the tuned metrics and the predictions.
# NOTE(review): these paths start with './data' while the input CSVs are read
# from '../data' -- confirm the intended base directory.
Finetunning.model_finetuning(
    NB,
    mn.NB,
    x_train,
    y_train,
    x_test,
    y_test,
    classes,
    './data/models/naive_bayers/NB_tuned_metrics.pkl',
    './data/models/naive_bayers/NB_pred.pkl',
)

#### Logistic Regression

In [None]:
# Fine-tune the Logistic Regression model. The last two arguments are .pkl
# paths; judging by the file names they receive the tuned metrics and the
# predictions.
Finetunning.model_finetuning(
    LG,
    mn.LG,
    x_train,
    y_train,
    x_test,
    y_test,
    classes,
    './data/models/logistic_regression/LG_tuned_metrics.pkl',
    './data/models/logistic_regression/LG_pred.pkl',
)

#### Decision Tree

In [None]:
# Fine-tune the Decision Tree model. The last two arguments are .pkl paths;
# judging by the file names they receive the tuned metrics and the predictions.
Finetunning.model_finetuning(
    DT,
    mn.DT,
    x_train,
    y_train,
    x_test,
    y_test,
    classes,
    './data/models/decision_tree/DT_tuned_metrics.pkl',
    './data/models/decision_tree/DT_pred.pkl',
)

#### Random Forest

In [None]:
# Fine-tune the Random Forest model. The last two arguments are .pkl paths;
# judging by the file names they receive the tuned metrics and the predictions.
Finetunning.model_finetuning(
    RF,
    mn.RF,
    x_train,
    y_train,
    x_test,
    y_test,
    classes,
    './data/models/random_forest/RF_tuned_metrics.pkl',
    './data/models/random_forest/RF_pred.pkl',
)

#### Gradient Boosting

In [None]:
# Fine-tune the Gradient Boosting model. The last two arguments are .pkl
# paths; judging by the file names they receive the tuned metrics and the
# predictions.
Finetunning.model_finetuning(
    GB,
    mn.GB,
    x_train,
    y_train,
    x_test,
    y_test,
    classes,
    './data/models/gradient_boosting/GB_tuned_metrics.pkl',
    './data/models/gradient_boosting/GB_pred.pkl',
)

## Saving results

In [None]:
# Hand each model's metrics and timing from the "Machine Learning Models"
# section to DataUtil.save_result, together with its ModelName entry.
# One call per model, in the same order the models were evaluated; every
# *_metrics / *_time pair must already exist, so all five model cells have to
# run before this one.
util.save_result(NB_metrics, NB_time, mn.NB)
util.save_result(LG_metrics, LG_time, mn.LG)
util.save_result(DT_metrics, DT_time, mn.DT)
util.save_result(RF_metrics, RF_time, mn.RF)
util.save_result(GB_metrics, GB_time, mn.GB)