## Imports

In [None]:
# Mount Google Drive at /content/drive so the dataset CSV read below is reachable.
# The google.colab module is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%matplotlib inline
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
warnings.filterwarnings("ignore")

## Read dataset

In [None]:
# Load the link-level dataset from the mounted Drive folder and preview its first rows.
data = pd.read_csv("/content/drive/MyDrive/Dataset/Link_Level/DS_SMF_LinkLevel_1e4_uniform_211204.csv")
data.head()

## Data Analysis

## Data Visualization

In [None]:
# One histogram per column. Despite its name, `fig` holds the array of Axes
# returned by DataFrame.hist, not a Figure.
fig = data.hist(bins=50, figsize=(50, 30), xlabelsize=10, ylabelsize=10)
# Enlarge every subplot title with a plain loop: the former list comprehension was
# used only for its side effect and made the cell echo a list of None values.
for hist_ax in fig.ravel():
    hist_ax.title.set_size(32)

In [None]:
# Box plot of GSNRSpan(dB) grouped by Lspan(KM).
# plt.subplots already creates the figure; the earlier extra plt.figure() call
# opened a second, empty figure that showed up as a blank output.
fig, ax = plt.subplots(figsize=(30, 15))
sns.boxplot(x='Lspan(KM)', y='GSNRSpan(dB)', data=data, ax=ax)
ax.set_title('Lspan(KM) vs GSNRSpan(dB)', fontdict={'fontsize': 30})
ax.set_xlabel('Lspan(KM)', fontsize=18)
ax.set_ylabel('GSNRSpan(dB)', fontsize=16)
# Rotate the x tick labels to vertical.
ticks = plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# The data set doesn't need cleaning. Note that this binds a second name to the
# same DataFrame object; no copy is made.
clean_data = data

In [None]:
# Features are every column except the last one; the last column is the target.
X = clean_data.iloc[:,:-1]
y = clean_data.iloc[:,-1]

## Data Preprocessing

In [None]:
# Hold out 20% of the rows as the test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Pipeline

In [None]:
# Every feature column goes through the numerical pipeline.
# NOTE(review): list(X_train) yields column names only while X_train is still a
# DataFrame, so this cell has to run before X_train is overwritten with the
# transformed data further down in the notebook.
numerical_columns = list(X_train)
# Based on trial and error and on the histograms, "Rs(GBu)" can be treated as a
# categorical feature, so it is one-hot encoded as well.
# NOTE(review): the column is presumably also part of numerical_columns, i.e. it is
# both scaled and one-hot encoded -- confirm this duplication is intended.
categorical_columns = ["Rs(GBu)"]

In [None]:
# Pipeline for numerical features: fill missing values with the column median
# (not really necessary for this data set), then standardize every feature.
numerical_pipeline = Pipeline([
        ('data_filler', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
# Fit the pipeline on the training features and keep the transformed result.
# NOTE(review): X_train_numerical is not used by any other cell visible here.
X_train_numerical = numerical_pipeline.fit_transform(X_train)

In [None]:
# Full preprocessor: the numerical pipeline is applied to numerical_columns and a
# one-hot encoder to categorical_columns; handle_unknown="ignore" makes the encoder
# ignore categories at transform time that were not seen during fit.
pipeline = ColumnTransformer([
        ("numerical", numerical_pipeline, numerical_columns),
        ("categorical", OneHotEncoder(handle_unknown = "ignore"), categorical_columns),
    ])

## Models

In [None]:
# Shared metric printer, so the evaluation code is not duplicated in the next parts.
def rmse_r2_mae(model, y, y_predict):
    """Print RMSE, R2 and MAE of the predictions ``y_predict`` against ``y``.

    ``model`` is accepted for call-site symmetry but is not used here.
    """
    report = (
        ('RMSE is {}', np.sqrt(mean_squared_error(y, y_predict))),
        ('R2 score is {}', r2_score(y, y_predict)),
        ('MAE score is {}', mean_absolute_error(y, y_predict)),
    )
    for template, value in report:
        print(template.format(value))

In [None]:
# Cross-validated counterpart of rmse_r2_mae, for when no predictions are available yet.
def score_rmse_r2_mae(model, X, y):
    """Print the 5-fold cross-validated RMSE, R2 and MAE of ``model`` on ``X``/``y``.

    scikit-learn exposes error metrics as "neg_*" scorers (higher is better), so the
    mean RMSE and MAE scores are negated before printing to report positive errors.
    """
    # 'mean_absolute_error' is not an accepted scorer name; the MAE scorer is
    # 'neg_mean_absolute_error'.
    rmse = -cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error').mean()
    r2 = cross_val_score(model, X, y, cv=5, scoring='r2').mean()
    mae = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error').mean()
    print('RMSE is {}'.format(rmse))
    print('R2 score is {}'.format(r2))
    print('MAE score is {}'.format(mae))

In [None]:
def get_score_predict(model,X_train,y_train,X_test,y_test):
    """Print cross-validated scores of ``model`` for the training data, then for the data passed as the test split."""
    separator = "--------------------------------------"
    sections = (
        ("\nThe model performance for training set", X_train, y_train),
        ("\nThe model performance for validation set", X_test, y_test),
    )
    for heading, features, target in sections:
        print(heading)
        print(separator)
        score_rmse_r2_mae(model, features, target)

In [None]:
def test_score(model,X,y):
    """Print a "testing set" header followed by the cross-validated scores of ``model`` on ``X``/``y``."""
    print("\nThe model performance for testing set",
          "--------------------------------------",
          sep="\n")
    score_rmse_r2_mae(model, X, y)

In [None]:
def get_model_grid_search(model, parameters, X, y, pipeline, n_iter=1000, random_state=42):
    """Tune ``model`` with randomized search and print the best R2, MAE and RMSE.

    NOTE: despite its name, this function runs ``RandomizedSearchCV``.

    Parameters
    ----------
    model : estimator to tune.
    parameters : dict passed as ``param_distributions``.
    X, y : untransformed features and target used for the search.
    pipeline : preprocessor; it is fitted on ``X`` and used to transform it.
    n_iter : number of sampled parameter settings per search (default 1000, as before).
    random_state : seed for the parameter sampling, so repeated runs agree.

    Returns
    -------
    The ``best_estimator_`` of the search scored with R2.
    """
    # NOTE(review): the preprocessor is fitted on all of X before cross-validation,
    # so every validation fold has contributed to the imputation/scaling statistics.
    X = pipeline.fit_transform(X)

    # (label, scorer name, sign). scikit-learn negates error metrics so that higher is
    # better; sign=-1 turns them back into ordinary positive errors for printing.
    searches = (
        ('R2', 'r2', 1),
        ('MAE', 'neg_mean_absolute_error', -1),
        ('RMSE', 'neg_root_mean_squared_error', -1),
    )

    best_by_r2 = None
    for label, scoring, sign in searches:
        search = RandomizedSearchCV(model,
                                    param_distributions=parameters,
                                    scoring=scoring,
                                    verbose=1, n_jobs=-1,
                                    n_iter=n_iter,
                                    random_state=random_state)
        search.fit(X, y)

        print('Best {}: '.format(label), sign * search.best_score_)
        print('Best Params: ', search.best_params_)

        # The R2 search runs first; its best estimator is what callers receive.
        if best_by_r2 is None:
            best_by_r2 = search.best_estimator_

    return best_by_r2

In [None]:
def get_model_random_search(model, parameters, X, y, pipeline):
    """Tune ``model`` with an exhaustive 5-fold grid search and print the best R2, MAE and RMSE.

    NOTE: despite its name, this function runs ``GridSearchCV``.

    Parameters
    ----------
    model : estimator to tune.
    parameters : dict passed as the parameter grid.
    X, y : untransformed features and target used for the search.
    pipeline : preprocessor; it is fitted on ``X`` and used to transform it.

    Returns
    -------
    The ``best_estimator_`` of the search scored with R2.
    """
    # NOTE(review): the preprocessor is fitted on all of X before cross-validation,
    # so every validation fold has contributed to the imputation/scaling statistics.
    X = pipeline.fit_transform(X)

    # (label, scorer name, sign). scikit-learn negates error metrics so that higher is
    # better; sign=-1 turns them back into ordinary positive errors for printing.
    searches = (
        ('R2', 'r2', 1),
        ('MAE', 'neg_mean_absolute_error', -1),
        ('RMSE', 'neg_root_mean_squared_error', -1),  # was mislabelled 'MSAE'
    )

    best_by_r2 = None
    for label, scoring, sign in searches:
        search = GridSearchCV(model, parameters, scoring=scoring, cv=5, verbose=1, n_jobs=-1)
        search.fit(X, y)

        print('Best {}: '.format(label), sign * search.best_score_)
        print('Best Params: ', search.best_params_)

        # The R2 search runs first; its best estimator is what callers receive.
        if best_by_r2 is None:
            best_by_r2 = search.best_estimator_

    return best_by_r2

In [None]:
# K-fold cross-validation of the input model; prints the mean of the per-fold scores.
def k_fold_score(model, X, y, n_splits=5):
    """Cross-validate ``model`` and print the mean RMSE, R2 and MAE over the folds.

    Parameters
    ----------
    model : estimator; it is re-fitted in place on every training fold.
    X, y : untransformed features (DataFrame) and target (Series), indexed with ``.iloc``.
    n_splits : number of folds (default 5, the value that used to be hard-coded).

    The module-level ``pipeline`` preprocessor is fitted on each training fold and then
    applied to the matching validation fold, so it is left fitted on the last fold.
    """
    kf = KFold(n_splits=n_splits)
    rmse_list = []
    r2_list = []
    mae_list = []
    for train_index, test_index in kf.split(X, y):
        X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
        y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

        # Fit the preprocessing on the training fold only, then reuse it on the held-out fold.
        X_fold_train = pipeline.fit_transform(X_fold_train)
        X_fold_test = pipeline.transform(X_fold_test)

        model.fit(X_fold_train, y_fold_train)
        y_predict = model.predict(X_fold_test)

        rmse_list.append(np.sqrt(mean_squared_error(y_fold_test, y_predict)))
        r2_list.append(r2_score(y_fold_test, y_predict))
        mae_list.append(mean_absolute_error(y_fold_test, y_predict))

    print("--------------------------------------")
    print('RMSE is {}'.format(np.array(rmse_list).mean()))
    print('R2 score is {}'.format(np.array(r2_list).mean()))
    print('MAE score is {}'.format(np.array(mae_list).mean()))

In [None]:
# Transform the data set as defined in the pipeline: fit on the training features
# only, then apply the fitted preprocessor to the test features.
# NOTE(review): this overwrites X_train / X_test with their transformed versions, so
# the cell is not idempotent -- running it a second time feeds already-transformed
# data back into the pipeline.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [None]:
# Separate the full, untransformed data set into two parts: with test_size=0.95,
# 5% (data_gs / target_gs) is used for the hyperparameter search and the remaining
# 95% (data_cv / target_cv) for cross-validation.
data_gs, data_cv, target_gs, target_cv = train_test_split(X, y, test_size=0.95, random_state=42)

### LinearRegression

In [None]:
# final evaluation and plots of the difference between real and predicted labels
from matplotlib.ticker import PercentFormatter
def get_predict(model,X_train,y_train,X_test,y_test):
    """Print train/test metrics of a fitted ``model`` and plot its test prediction error.

    Two figures are drawn from ``y_test - model.predict(X_test)``:
    a 25-bin histogram whose bar heights are fractions of the test samples
    (shown as percentages, x axis limited to [-1, 1]), and a line plot of the
    per-bin fraction ("PDF") together with its cumulative sum ("CDF").

    ``X_train`` / ``X_test`` must already be in the form ``model`` was fitted on.
    """
    print("\nThe model performance for training set")
    print("--------------------------------------")
    rmse_r2_mae(model, y_train, model.predict(X_train))
    print("\nThe model performance for testing set")
    print("--------------------------------------")
    y_predict = model.predict(X_test)
    rmse_r2_mae(model, y_test, y_predict)

    diff = y_test - y_predict

    # Histogram of the error; each sample is weighted 1/N so the bars sum to 1.
    fig_hist, ax_hist = plt.subplots(figsize=(5, 5))
    ax_hist.hist(diff, bins=25, weights=np.ones(len(diff)) / len(diff))
    ax_hist.yaxis.set_major_formatter(PercentFormatter(1))
    ax_hist.grid()
    ax_hist.set_xlim(-1, 1)
    ax_hist.set_xlabel('Prediction error', fontsize=18)
    ax_hist.set_ylabel('Frequency', fontsize=18)

    # Per-bin fraction of samples and its running total, plotted at the right bin edges.
    count, bins_count = np.histogram(diff, bins=25)
    pdf = count / count.sum()
    cdf = np.cumsum(pdf)

    fig_cdf, ax_cdf = plt.subplots()
    ax_cdf.plot(bins_count[1:], pdf, color="red", label="PDF")
    ax_cdf.plot(bins_count[1:], cdf, label="CDF")
    ax_cdf.set_xlabel('Prediction error')
    ax_cdf.set_ylabel('Fraction of test samples')
    ax_cdf.legend()

In [None]:
# plot-only evaluation: labelled histogram of the difference between real and predicted labels
from matplotlib.ticker import PercentFormatter
def get_predict2(model,X_train,y_train,X_test,y_test):
    """Plot a 12-bin histogram of the test prediction error with a percentage label on each bar.

    Bar heights are fractions of the test samples (shown as percentages) and the
    x axis is limited to [-1, 1]. No metrics are printed.

    ``X_train`` and ``y_train`` are accepted but not used.
    """
    y_predict = model.predict(X_test)
    diff = y_test - y_predict

    fig, ax = plt.subplots(figsize=(5, 5))
    # Each sample is weighted 1/N so the bars sum to 1.
    _, _, rects = ax.hist(diff, bins=12, weights=np.ones(len(diff)) / len(diff))
    ax.yaxis.set_major_formatter(PercentFormatter(1))
    ax.grid()
    ax.set_xlim(-1, 1)

    for rect in rects:
        height = rect.get_height()
        # Label each bar with its height as a percentage, one decimal place.
        # The former str(int(height//0.001)/10) relied on float floor division and
        # mislabelled exact values (a height of 0.3 was shown as 29.9%).
        ax.text(rect.get_x() + rect.get_width() / 2, height + 0.01,
                '{:.1f}%'.format(height * 100),
                ha='center', va='bottom')

    ax.set_xlabel('Prediction error', fontsize=18)
    ax.set_ylabel('Frequency', fontsize=18)

### Cross Validation

In [None]:
# Baseline: linear regression fitted on the transformed training split, then
# evaluated (metrics and error plots) on the training and test splits.
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)
get_predict(lin_model,X_train,y_train,X_test,y_test)

### random Search

**Elastic-Net Regression**

Elastic-net is a linear regression model that combines the penalties of Lasso and Ridge.

In [None]:
# Candidate hyperparameter values for ElasticNet to be tried by the search.
params = {
    'alpha':[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'l1_ratio':[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
}

en = ElasticNet()

# Fresh, unfitted preprocessor with the same definition as before; this rebinds the
# module-level `pipeline` that k_fold_score reads.
pipeline = ColumnTransformer([
        ("numerical", numerical_pipeline, numerical_columns),
        ("categorical", OneHotEncoder(handle_unknown = "ignore"), categorical_columns),
    ])
# Tune ElasticNet on the search split. Despite its name, get_model_random_search
# runs an exhaustive GridSearchCV over `params`.
en_model = get_model_random_search(en, params, data_gs, target_gs, pipeline)

### K Fold

In [None]:
# K-fold evaluation of the tuned ElasticNet on the cross-validation split
# (data_cv / target_cv, i.e. the 95% of the data not used for the search).
k_fold_score(en_model,data_cv, target_cv)

## Support Vector Machine Regressor

In [None]:
# RBF-kernel SVR with a fixed C=100, fitted on the transformed training split, then
# evaluated (metrics and error plots) on the training and test splits.
svr = SVR(kernel='rbf',C=100)
svr.fit(X_train, y_train)
get_predict(svr,X_train,y_train,X_test,y_test)

### Random Search

In [None]:
# Same procedure as for ElasticNet, now for an RBF-kernel SVR: candidate values for
# C, epsilon and gamma.
params = {  'C': [0.1, 1, 10, 100, 1000],
            'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
            'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5]
        }

svr = SVR(kernel='rbf')

# Fresh, unfitted preprocessor; rebinds the module-level `pipeline` that k_fold_score reads.
pipeline = ColumnTransformer([
        ("numerical", numerical_pipeline, numerical_columns),
        ("categorical", OneHotEncoder(handle_unknown = "ignore"), categorical_columns),
    ])

# Despite its name, get_model_random_search runs an exhaustive GridSearchCV over `params`.
svr_rs_model = get_model_random_search(svr, params, data_gs, target_gs, pipeline)

In [None]:
# K-fold evaluation of the tuned SVR on the cross-validation split (data_cv / target_cv).
k_fold_score(svr_rs_model, data_cv, target_cv)

# **Result:**

# ***linear reg:***


*   The model performance for training set
    --------------------------------------
    RMSE is 1.8406440524613408\
    R2 score is 0.8459335740915573\
    MAE score is 1.37154352825972




*    The model performance for testing set
    --------------------------------------
    RMSE is 1.8031524314000584\
    R2 score is 0.8442903378589373\
    MAE score is 1.333618957614806

#    ***Elastic net*** :

*    Best R2:  0.8424886379654106
    
*    Best MAE:  1.3502965110395042
    
*    Best RMSE:  1.841129702786808

*    Best Params:  alpha: 0.1, l1_ratio: 1

# ***SVR***

*   The model performance for training set
    --------------------------------------
    RMSE is 0.2488812643948367\
    R2 score is 0.9971832208788407\
    MAE score is 0.14058093534326982

*    The model performance for testing set
    --------------------------------------
    RMSE is 0.30315675650110846\
    R2 score is 0.9955986573274681\
    MAE score is 0.16715031899685914

#    random search:

*   Best R2:  0.9934061443981413
*   Best MAE:  0.2709478856261804
*   Best RMSE:  0.3758950035569143
*   Best Params:  C: 1000, epsilon: 0.05, gamma: 0.005

