In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE, Isomap
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.decomposition import PCA
import warnings
from sklearn.feature_selection import VarianceThreshold, SelectKBest, RFE
warnings.filterwarnings('ignore')

In [12]:
def testcf(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=False)
    grad = GradientBoostingClassifier().fit(X_train, y_train)
    predictions = grad.predict(X_test)
    print(classification_report(y_test, predictions))

def testrs(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=False)
    grad = GradientBoostingRegressor().fit(X_train, y_train)
    y_pred = grad.predict(X_test)
    print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
    print(f'MAPE: {(mean_absolute_percentage_error(y_test, y_pred))}')
    print(f'R^2: {r2_score(y_test, y_pred)}')

def VT(n, X):
    vt = VarianceThreshold(n)
    X_vt = vt.fit_transform(X)
    X_vt = pd.DataFrame(X_vt)
    print(X_vt.columns)
    print(X_vt)
    return X_vt

def scale(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
    return X_scaled

def SKB_r(X, y):
    skb = SelectKBest(k=6)
    X_skb = skb.fit_transform(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_skb, y, test_size=0.15, shuffle=False)
    parameters = {'alpha': np.arange(0, 1, 0.1)}
    ridge_optimal = GridSearchCV(Ridge(), parameters).fit(X_train, y_train)
    y_pred = ridge_optimal.predict(X_test)
    print(X_skb.shape)
    print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
    print(f'MAPE: {(mean_absolute_percentage_error(y_test, y_pred))}')
    print(f'R^2: {r2_score(y_test, y_pred)}')
    
def SKB_c(X, y):
    skb = SelectKBest(k=6)
    X_skb = skb.fit_transform(X, y)
    print(X_skb.shape)
    testcf(X_skb, y)
    
def _RFE(X, y, est):
    rfe = RFE(estimator=est, n_features_to_select=4, step=1).fit(X, y)
    X_rfe = pd.DataFrame(rfe.transform(X), columns=rfe.get_feature_names_out())
    print(X_rfe.columns)
    return X_rfe

In [3]:
df_classification = pd.read_csv('D:/ML/data/classification/airlines_task_preprocessed.csv', index_col=0)
df_regression = pd.read_csv('D:/ML/data/regression/moldova_cars_task_preprocessed.csv', index_col=0)
y_cf = df_classification[["Delay"]]
X_cf = df_classification.drop(["Delay"], axis=1)
y_rs = df_regression[["Price(euro)"]]
X_rs = df_regression.drop(["Price(euro)"], axis=1)
print(X_cf.shape, X_rs.shape)

(144471, 23) (36962, 14)


In [76]:
testcf(X_cf, y_cf)

              precision    recall  f1-score   support

           0       0.55      0.86      0.67     10923
           1       0.67      0.28      0.40     10748

    accuracy                           0.57     21671
   macro avg       0.61      0.57      0.53     21671
weighted avg       0.61      0.57      0.53     21671



In [72]:
testrs(X_rs, y_rs)

MAE: 4440.313204322216
MSE: 18043386702.082638
RMSE: 134325.6740243005
MAPE: 0.5719873460464336
R^2: 0.0029335142194614194


In [4]:
X_cf = scale(X_cf)
X_rs = scale(X_rs)

In [80]:
X_vt_cf = VT(1, X_cf)
testcf(X_vt_cf, y_cf)

(144471, 8)
              precision    recall  f1-score   support

           0       0.52      0.88      0.66     10923
           1       0.61      0.18      0.28     10748

    accuracy                           0.54     21671
   macro avg       0.57      0.53      0.47     21671
weighted avg       0.57      0.54      0.47     21671



In [9]:
X_vt_rs = VT(1, X_rs)
testrs(X_vt_rs, y_rs)

RangeIndex(start=0, stop=4, step=1)
              0         1        2         3
0     -1.118436 -1.084684 -0.72541 -0.087525
1      0.894106 -1.084684  1.37853 -0.087525
2      0.894106 -1.084684 -0.72541 -0.087525
3      0.894106 -1.084684  1.37853 -0.087525
4      0.894106 -1.084684  1.37853 -0.087525
...         ...       ...      ...       ...
36957  0.894106  0.921927  1.37853 -0.087525
36958  0.894106 -1.084684  1.37853 -0.087525
36959  0.894106 -1.084684 -0.72541 -0.087525
36960  0.894106 -1.084684  1.37853 -0.087525
36961  0.894106 -1.084684 -0.72541 -0.087525

[36962 rows x 4 columns]
MAE: 6751.224923887392
MSE: 18089076281.658974
RMSE: 134495.63666401588
MAPE: 1.3073483932428933
R^2: 0.00040873606682623187


In [111]:
X_skb_cf = SKB_c(X_cf, y_cf)

(144471, 6)
              precision    recall  f1-score   support

           0       0.55      0.84      0.66     10923
           1       0.65      0.29      0.40     10748

    accuracy                           0.57     21671
   macro avg       0.60      0.57      0.53     21671
weighted avg       0.60      0.57      0.53     21671



In [109]:
X_skb_rs = SKB_r(X_rs, y_rs)

(36962, 6)
MAE: 6192.661916260089
MSE: 18063381323.77912
RMSE: 134400.07932951202
MAPE: 1.1727711226112054
R^2: 0.0018286236842980363


In [116]:
est = DecisionTreeClassifier().fit(X_cf, y_cf)
X_rfe_c = _RFE(X_cf, y_cf, est)
testcf(X_rfe_c, y_cf)

              precision    recall  f1-score   support

           0       0.52      0.92      0.66     10923
           1       0.63      0.13      0.22     10748

    accuracy                           0.53     21671
   macro avg       0.57      0.53      0.44     21671
weighted avg       0.57      0.53      0.44     21671



In [118]:
X_rfe_c.shape

(144471, 4)

In [13]:
est = Ridge().fit(X_rs, y_rs)
X_rfe_r = _RFE(X_rs, y_rs, est)
X_rfe_r.shape
testrs(X_rfe_r, y_rs)

Index(['Year', 'Engine_capacity(cm3)', 'Transmission', 'Style_1'], dtype='object')
MAE: 4603.368033869234
MSE: 18044854561.12116
RMSE: 134331.13771989412
MAPE: 0.5982632435611465
R^2: 0.0028524012290164213


In [122]:
X_rfe_r.shape

(36962, 4)

In [123]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_rs, y_rs)
testrs(X_pca, y_rs)

MAE: 5409.590516652766
MSE: 18057751928.742737
RMSE: 134379.135020072
MAPE: 0.8724710544541506
R^2: 0.0021397006023220477


In [124]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_cf, y_cf)
testcf(X_pca, y_cf)

              precision    recall  f1-score   support

           0       0.52      0.86      0.65     10923
           1       0.59      0.20      0.30     10748

    accuracy                           0.53     21671
   macro avg       0.56      0.53      0.47     21671
weighted avg       0.56      0.53      0.48     21671



In [129]:
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X_rs[:10000])
testrs(X_tsne, y_rs[:10000])

MAE: 4143.953075278765
MSE: 54597754.567667834
RMSE: 7389.029338666063
MAPE: 0.7669672342855869
R^2: 0.4587348987157732


In [130]:
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X_cf[:50000])
testcf(X_tsne, y_cf[:50000])

              precision    recall  f1-score   support

           0       0.70      0.93      0.80      5075
           1       0.54      0.16      0.25      2425

    accuracy                           0.68      7500
   macro avg       0.62      0.55      0.52      7500
weighted avg       0.65      0.68      0.62      7500



In [132]:
isomap = Isomap(n_components=2)
X_isomap = isomap.fit_transform(X_rs[:3000])
testrs(X_isomap, y_rs[:3000])

MAE: 3839.5736226104655
MSE: 49190320.36058543
RMSE: 7013.581136665165
MAPE: 0.6019541013581632
R^2: 0.35598514574057716


In [133]:
isomap = Isomap(n_components=2)
X_isomap = isomap.fit_transform(X_cf[:10000])
testcf(X_isomap, y_cf[:10000])

              precision    recall  f1-score   support

           0       0.71      0.99      0.83      1056
           1       0.59      0.02      0.04       444

    accuracy                           0.71      1500
   macro avg       0.65      0.51      0.43      1500
weighted avg       0.67      0.71      0.59      1500

