In [1]:
# Imports
import warnings
warnings.filterwarnings("ignore")

import numpy
import pandas
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, average_precision_score
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score

import import_ipynb
import data_analysis

importing Jupyter notebook from data_analysis.ipynb


#### SVM

In [2]:
def results_summary_clf(ytrain_true, yval_true, ytest_true, ytrain_pred, yval_pred, ytest_pred):
    """Tabulate classification metrics for the train / validation / test splits.

    Parameters
    ----------
    ytrain_true, yval_true, ytest_true : array-like
        Ground-truth labels of each split.
    ytrain_pred, yval_pred, ytest_pred : array-like
        Labels predicted by the model for the matching split.

    Returns
    -------
    pandas.DataFrame
        Rows 'Training set', 'Validation set', 'Testing set'; columns
        'Accuracy', 'F1_score', 'Precision_score'. Every cell is the metric
        multiplied by 100 and rounded to 2 decimals.
    """
    # Each row of the table is one (true, predicted) pair.
    labelled_splits = {
        'Training set': (ytrain_true, ytrain_pred),
        'Validation set': (yval_true, yval_pred),
        'Testing set': (ytest_true, ytest_pred),
    }
    # Each column is one sklearn metric.
    # NOTE(review): 'Precision_score' is filled with average_precision_score
    # evaluated on the predicted labels -- confirm this (rather than
    # precision_score) is the intended metric.
    metric_functions = {
        'Accuracy': accuracy_score,
        'F1_score': f1_score,
        'Precision_score': average_precision_score,
    }

    Summary_results = pandas.DataFrame(index=list(labelled_splits), columns=list(metric_functions))

    for split_name, (y_true, y_pred) in labelled_splits.items():
        for metric_name, metric in metric_functions.items():
            percent = round(metric(y_true, y_pred)*100, 2)
            Summary_results.loc[split_name, metric_name] = percent

    return Summary_results

In [3]:
def C_validation_clf(X_train, X_val, y_train, y_val, kernel, class_weights=None):
    """Pick the SVC regularisation strength `C` that scores best on the validation set.

    The candidates are multiples of the number of training rows, generated by
    ``numpy.arange(1, 5.25, 0.25) * n_samples``. One SVC is fitted per
    candidate and scored with ``SVC.score`` on ``(X_val, y_val)``.

    Parameters
    ----------
    X_train, X_val : 2-D array-like exposing ``.shape``
        Encoded feature matrices.
    y_train, y_val : array-like
        Matching labels.
    kernel : str
        Kernel name forwarded to ``SVC``.
    class_weights : dict, str or None
        Forwarded to ``SVC(class_weight=...)``.

    Returns
    -------
    The candidate `C` with the highest validation score (first one on ties).
    """
    n_samples, _ = X_train.shape
    # A fixed grid [0.001, 0.01, ..., 10000] was used previously; the grid is
    # now scaled by the training-set size.
    candidate_Cs = numpy.arange(1, 5.25, 0.25) * n_samples

    validation_scores = [
        SVC(kernel=kernel, C=candidate, gamma='scale', class_weight=class_weights, random_state=40)
        .fit(X_train, y_train)
        .score(X_val, y_val)
        for candidate in candidate_Cs
    ]

    best_position = numpy.argmax(validation_scores)
    return candidate_Cs[best_position]

In [None]:
def classifier_svm(data, target_name, synthetic_data_flag=False, train_size=0.70, kernel='linear', C_param=100, C_validation=False, times=1, return_flag='simple'):
    """Fit and evaluate an SVC over `times` random train/validation/test splits.

    For every seed in ``range(times)`` the data is split (stratified) into a
    training part of size `train_size` and a held-out part that is halved into
    validation and test sets, encoded, fitted and scored with
    ``results_summary_clf``. The per-split summaries are averaged.

    Parameters
    ----------
    data : passed (as a copy) to ``data_analysis.uncouping_x_y_clf``.
    target_name : name of the target, forwarded to the same helper.
    synthetic_data_flag : bool
        When true, the raw ``.values`` of the splits are used; otherwise the
        splits go through ``data_analysis.data_processing``.
    train_size : float
        Fraction of the data used for training.
    kernel : str
        Kernel forwarded to ``SVC``.
    C_param : float
        Regularisation strength; replaced on every split by the result of
        ``C_validation_clf`` when `C_validation` is truthy.
    C_validation : bool
        Whether to select `C` on the validation set.
    times : int
        Number of random splits; must be at least 1.
    return_flag : str
        ``'simple'`` returns the short tuple; any other value also returns the
        encoded data and predictions of the last split (linear kernel only).

    Returns
    -------
    linear kernel, ``return_flag == 'simple'``: ``(C_param, w, b, summary)``
    linear kernel, otherwise: ``(C_param, w, b, summary, X_train_enc,
        X_test_enc, y_train, y_test, y_train_preds, y_test_preds)``
    other kernels: ``(C_param, summary)``

    ``w`` / ``b`` are the first-class-pair coefficients and intercept averaged
    over the splits whose coefficient vector has the same shape as the first
    split's. ``summary`` holds the averaged metrics plus a 'Std_accuracy'
    column. ``C_param`` is the value used on the last split.

    Raises
    ------
    ValueError
        If `times` is smaller than 1.
    """
    # Fail early: with zero iterations nothing below would ever be defined.
    if times < 1:
        raise ValueError(f'times must be >= 1, got {times}')

    print('*********************************************** The SVMs ***********************************************')
    print(f'Training_set = {round((train_size * 100))}%, Validation_set = {round(((1 - train_size)/2) * 100)}%, Test_set = {round(((1 - train_size)/2) * 100)}%, kernel = {kernel}, C_validation = {C_validation}, times = {times}')

    # separate the features X from the target y
    X, y = data_analysis.uncouping_x_y_clf(data.copy(), target_name)

    is_linear = kernel == 'linear'
    All_acc_score = []
    summary_sum = None
    w_sum, b_sum, n_weight_runs = None, None, 0

    for random_state in numpy.arange(times):
        # split the dataset X into the training set X_train and temporary set X_temp
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=train_size, stratify=y, random_state=random_state)
        # split X_temp into validation and test halves (fixed seed for this second split)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, train_size=0.5, stratify=y_temp, random_state=0)
        X_train, X_val, X_test = data_analysis.reset_index_data(data_1=X_train, data_2=X_val, data_3=X_test, data_4=None)

        # data encoding and scaling (skipped for synthetic data)
        if synthetic_data_flag:
            X_train_enc, X_val_enc, X_test_enc = X_train.values.copy(), X_val.values.copy(), X_test.values.copy()
        else:
            X_train_enc, X_val_enc, X_test_enc = data_analysis.data_processing(xtrain=X_train.copy(), ytrain=y_train.copy(), xtest_1=X_val.copy(), xtest_2=X_test.copy(), xtest_3=None, check_multicollinearity=True)

        class_weights = None

        # optionally tune C on the validation set, then fit the model
        if C_validation:
            C_param = C_validation_clf(X_train_enc.copy(), X_val_enc.copy(), y_train.copy(), y_val.copy(), kernel, class_weights=class_weights)
        clf_global = SVC(C=C_param, kernel=kernel, class_weight=class_weights, random_state=40)
        clf_global.fit(X_train_enc, y_train)

        # predictions
        y_train_preds = clf_global.predict(X_train_enc)
        y_val_preds = clf_global.predict(X_val_enc)
        y_test_preds = clf_global.predict(X_test_enc)

        # accumulate the metrics of this split
        summary_random = results_summary_clf(y_train, y_val, y_test, y_train_preds, y_val_preds, y_test_preds)
        All_acc_score.append(summary_random['Accuracy'].values)
        summary_sum = summary_random if summary_sum is None else summary_sum + summary_random

        if is_linear:
            coef = clf_global.coef_[0]
            intercept = numpy.round(clf_global.intercept_[0], 6)
            if w_sum is None:
                w_sum, b_sum, n_weight_runs = coef, intercept, 1
            elif coef.shape == w_sum.shape:
                w_sum = w_sum + coef
                b_sum = b_sum + intercept
                n_weight_runs += 1
            # else: the encoded feature set of this split differs from the
            # first one (presumably data_processing dropped other columns --
            # same guard as in linear_regression); its weights cannot be
            # averaged with the others, so only its metrics are kept.

    # average over the splits
    summary = (summary_sum / times).astype('float64')
    summary['Std_accuracy'] = numpy.std(numpy.array(All_acc_score, dtype='float64'), axis=0) # add column for accuracy standard deviation
    summary = summary.round(2)

    if not is_linear:
        print(f'C_param ={C_param}')
        return C_param, summary

    w = w_sum / n_weight_runs
    b = b_sum / n_weight_runs
    print(f'w = {w}, b = {numpy.round(b, 6)}, C_param ={C_param}')

    if return_flag=='simple':
        return C_param, w, b, summary
    return C_param, w, b, summary, X_train_enc, X_test_enc, y_train.copy(), y_test.copy(), y_train_preds.copy(), y_test_preds.copy()



#### Kernel regression

In [1]:
def results_summary_reg(ytrain_true, yval_true, ytest_true, ytrain_pred, yval_pred, ytest_pred):
    """Tabulate regression metrics for the train / validation / test splits.

    Parameters
    ----------
    ytrain_true, yval_true, ytest_true : array-like
        Ground-truth targets of each split.
    ytrain_pred, yval_pred, ytest_pred : array-like
        Model predictions for the matching split.

    Returns
    -------
    pandas.DataFrame
        Rows 'Training set', 'Validation set', 'Testing set'; columns
        'R2_score', 'RMSE', 'MSE'. Every cell is rounded to 4 decimals.
    """
    # Each row of the table is one (true, predicted) pair.
    target_pairs = {
        'Training set': (ytrain_true, ytrain_pred),
        'Validation set': (yval_true, yval_pred),
        'Testing set': (ytest_true, ytest_pred),
    }
    # Each column is one sklearn metric.
    metric_functions = {
        'R2_score': r2_score,
        'RMSE': root_mean_squared_error,
        'MSE': mean_squared_error,
    }

    Summary_results = pandas.DataFrame(index=list(target_pairs), columns=list(metric_functions))

    for split_name, (y_true, y_pred) in target_pairs.items():
        for metric_name, metric in metric_functions.items():
            Summary_results.loc[split_name, metric_name] = round(metric(y_true, y_pred), 4)

    return Summary_results

In [2]:
def C_validation_reg(X_train, X_val, y_train, y_val, kernel):
    """Pick the SVR regularisation strength `C` with the best validation R2.

    The candidates are multiples of the number of training rows, generated by
    ``numpy.arange(1, 5.25, 0.25) * n_samples``. One SVR (``gamma='scale'``,
    ``epsilon=0.1``, ``coef0=1``) is fitted per candidate and scored with
    ``SVR.score`` on ``(X_val, y_val)``.

    Parameters
    ----------
    X_train, X_val : 2-D array-like exposing ``.shape``
        Encoded feature matrices.
    y_train, y_val : array-like
        Matching targets.
    kernel : str
        Kernel name forwarded to ``SVR``.

    Returns
    -------
    The candidate `C` with the highest validation score (first one on ties).
    """
    n_samples, _ = X_train.shape
    # A fixed grid [0.001, 0.01, ..., 10000] was used previously; the grid is
    # now scaled by the training-set size.
    candidate_Cs = numpy.arange(1, 5.25, 0.25) * n_samples

    validation_r2 = [
        SVR(kernel=kernel, C=candidate, gamma='scale', epsilon=0.1, coef0=1)
        .fit(X_train, y_train)
        .score(X_val, y_val)
        for candidate in candidate_Cs
    ]

    best_position = numpy.argmax(validation_r2)
    return candidate_Cs[best_position]

In [None]:
def kernel_regression(data, target_name, synthetic_data_flag=False, train_size=0.70, kernel='linear', C_param=100, C_validation=False, times=1, return_flag='simple'):
    """Fit and evaluate an SVR over `times` random train/validation/test splits.

    For every seed in ``range(times)`` the data is split into a training part
    of size `train_size` and a held-out part that is halved into validation
    and test sets, encoded, fitted with ``SVR(gamma='scale', epsilon=0.1,
    coef0=1)`` and scored with ``results_summary_reg``. The per-split
    summaries are averaged.

    Parameters
    ----------
    data : passed to ``data_analysis.uncouping_x_y_reg``.
    target_name : name of the target, forwarded to the same helper.
    synthetic_data_flag : bool
        When true, the raw ``.values`` of the splits are used; otherwise the
        splits go through ``data_analysis.data_processing``.
    train_size : float
        Fraction of the data used for training.
    kernel : str
        Kernel forwarded to ``SVR``.
    C_param : float
        Regularisation strength; replaced on every split by the result of
        ``C_validation_reg`` when `C_validation` is truthy.
    C_validation : bool
        Whether to select `C` on the validation set.
    times : int
        Number of random splits; must be at least 1.
    return_flag : str
        ``'simple'`` returns only the summary; any other value also returns
        the encoded data and predictions of the last split.

    Returns
    -------
    ``summary`` (averaged metrics, rounded to 4 decimals), or
    ``(summary, X_train_enc, X_test_enc, y_train, y_test, y_train_preds,
    y_test_preds)`` when ``return_flag != 'simple'``.

    Raises
    ------
    ValueError
        If `times` is smaller than 1.
    """
    # Fail early: with zero iterations no summary would ever be produced.
    if times < 1:
        raise ValueError(f'times must be >= 1, got {times}')

    print('*********************************************** The Non Linear Regression ***********************************************')
    print(f'Training_set = {round((train_size * 100))}%, Validation_set = {round(((1 - train_size)/2) * 100)}%, Test_set = {round(((1 - train_size)/2) * 100)}%, kernel = {kernel}, C_validation = {C_validation}, times = {times}')

    # separate the features X from the target y
    X, y = data_analysis.uncouping_x_y_reg(data, target_name)

    summary_sum = None
    for random_state in numpy.arange(times):
        # split the dataset X into the training set X_train and temporary set X_temp
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=train_size, random_state=random_state)
        # split X_temp into validation and test halves (fixed seed for this second split)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, train_size=0.5, random_state=0)
        X_train, X_val, X_test = data_analysis.reset_index_data(data_1=X_train, data_2=X_val, data_3=X_test, data_4=None)

        # data encoding and scaling (skipped for synthetic data)
        if synthetic_data_flag:
            X_train_enc, X_val_enc, X_test_enc = X_train.values.copy(), X_val.values.copy(), X_test.values.copy()
        else:
            X_train_enc, X_val_enc, X_test_enc = data_analysis.data_processing(xtrain=X_train.copy(), ytrain=y_train.copy(), xtest_1=X_val.copy(), xtest_2=X_test.copy(), xtest_3=None, check_multicollinearity=True)

        # optionally tune C on the validation set, then fit the model
        if C_validation:
            C_param = C_validation_reg(X_train_enc.copy(), X_val_enc.copy(), y_train.copy(), y_val.copy(), kernel)
        reg_global = SVR(kernel=kernel, C=C_param, gamma='scale', epsilon=0.1, coef0=1)
        reg_global.fit(X_train_enc, y_train)

        # predictions
        y_train_preds = reg_global.predict(X_train_enc)
        y_val_preds = reg_global.predict(X_val_enc)
        y_test_preds = reg_global.predict(X_test_enc)

        # accumulate the metrics of this split without mutating earlier frames
        summary_random = results_summary_reg(y_train, y_val, y_test, y_train_preds, y_val_preds, y_test_preds)
        summary_sum = summary_random if summary_sum is None else summary_sum + summary_random

    # average over the splits
    summary = (summary_sum / times).astype('float64')
    summary = summary.round(4)

    if return_flag=='simple':
        return summary
    return summary, X_train_enc, X_test_enc, y_train.copy(), y_test.copy(), y_train_preds.copy(), y_test_preds.copy()

#### Linear regression

In [None]:
def linear_regression(data, target_name, synthetic_data_flag=False, train_size=0.70, times=1, return_flag='simple'):
    """Fit and evaluate ordinary least squares over `times` random splits.

    For every seed in ``range(times)`` the data is split into a training part
    of size `train_size` and a held-out part that is halved into validation
    and test sets, encoded, fitted with ``LinearRegression`` and scored with
    ``results_summary_reg``. Metrics are averaged over all splits;
    coefficients are averaged over the splits whose coefficient array has the
    same shape as the first split's.

    Parameters
    ----------
    data : passed to ``data_analysis.uncouping_x_y_reg``.
    target_name : name of the target, forwarded to the same helper.
    synthetic_data_flag : bool
        When true, the raw ``.values`` of the splits are used; otherwise the
        splits go through ``data_analysis.data_processing``.
    train_size : float
        Fraction of the data used for training.
    times : int
        Number of random splits; must be at least 1.
    return_flag : str
        ``'simple'`` returns the short tuple; any other value also returns the
        encoded data and predictions of the last split.

    Returns
    -------
    ``(w, b, summary)`` or ``(w, b, summary, X_train_enc, X_test_enc, y_train,
    y_test, y_train_preds, y_test_preds)`` when ``return_flag != 'simple'``.

    Raises
    ------
    ValueError
        If `times` is smaller than 1.
    """
    # Fail early: with zero iterations nothing below would ever be defined.
    if times < 1:
        raise ValueError(f'times must be >= 1, got {times}')

    print('*********************************************** The Linear Regression ***********************************************')
    print(f'Training_set = {round((train_size * 100))}%, Validation_set = {round(((1 - train_size)/2) * 100)}%, Test_set = {round(((1 - train_size)/2) * 100)}%, times = {times}')

    # separate the features X from the target y
    X, y = data_analysis.uncouping_x_y_reg(data, target_name)

    summary_sum = None
    w_sum, b_sum, n_weight_runs = None, None, 0

    for random_state in numpy.arange(times):
        # split the dataset X into the training set X_train and temporary set X_temp
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=train_size, random_state=random_state)
        # split X_temp into validation and test halves (fixed seed for this second split)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, train_size=0.5, random_state=0)
        X_train, X_val, X_test = data_analysis.reset_index_data(data_1=X_train, data_2=X_val, data_3=X_test, data_4=None)

        # data encoding and scaling (skipped for synthetic data)
        if synthetic_data_flag:
            X_train_enc, X_val_enc, X_test_enc = X_train.values.copy(), X_val.values.copy(), X_test.values.copy()
        else:
            X_train_enc, X_val_enc, X_test_enc = data_analysis.data_processing(xtrain=X_train.copy(), ytrain=y_train.copy(), xtest_1=X_val.copy(), xtest_2=X_test.copy(), xtest_3=None, check_multicollinearity=True)

        # fitting model
        reg_global = LinearRegression()
        reg_global.fit(X_train_enc, y_train)

        # predictions
        y_train_preds = reg_global.predict(X_train_enc)
        y_val_preds = reg_global.predict(X_val_enc)
        y_test_preds = reg_global.predict(X_test_enc)

        # metrics are accumulated for every split
        summary_random = results_summary_reg(y_train, y_val, y_test, y_train_preds, y_val_preds, y_test_preds)
        summary_sum = summary_random if summary_sum is None else summary_sum + summary_random

        # Weights are accumulated only when comparable with the first split.
        # Non-in-place addition keeps the fitted estimator's coef_ untouched.
        coef = reg_global.coef_
        intercept = numpy.round(reg_global.intercept_, 6)
        if w_sum is None:
            w_sum, b_sum, n_weight_runs = coef, intercept, 1
        elif numpy.shape(coef) == numpy.shape(w_sum):
            w_sum = w_sum + coef
            b_sum = b_sum + intercept
            n_weight_runs += 1
        # else: this split ended up with a different encoded feature set
        # (presumably columns dropped by data_processing -- TODO confirm),
        # so only its metrics contribute.

    # average over the splits
    w, b = (w_sum / n_weight_runs), (b_sum / n_weight_runs)
    summary = (summary_sum / times).astype('float64')
    summary = summary.round(4)

    print(f'w = {w}, b = {numpy.round(b, 6)}')

    if return_flag=='simple':
        return w, b, summary
    return w, b, summary, X_train_enc, X_test_enc, y_train.copy(), y_test.copy(), y_train_preds.copy(), y_test_preds.copy()