# LEVEL 10 - Feature Eng.

In [15]:
# Essentials
import numpy as np
import pandas as pd
import datetime
import random

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Misc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings(action="ignore")

In [16]:
# Load CSV file into a pandas DataFrame
file_path = "./data/train_data.csv"
data = pd.read_csv(file_path)
# Removing wrong values
data.dropna(inplace=True)
data = data[data['AMPS'].between(0, 1)]
data['UNIT'][data['TEMP']>100] = 'K'
data['UNIT'][data['TEMP']<100] = 'C'
data['TEMP'][data['TEMP']>100] = data['TEMP'][data['TEMP']>100] - 273.25
data.reset_index(drop=True, inplace=True)
# OneHotEncoding
encoder = OneHotEncoder(sparse=False)
encoded_test_data = encoder.fit_transform(data[['POWER', 'UNIT']])
df_encoded_test  = pd.DataFrame(encoded_test_data, columns=encoder.get_feature_names(['POWER', 'UNIT']))
data = pd.concat([data, df_encoded_test], axis=1)
data = data.drop(['POWER', 'UNIT'], axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 971 entries, 0 to 970
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MODE        971 non-null    object 
 1   AMPS        971 non-null    float64
 2   VOLTS       971 non-null    float64
 3   TEMP        971 non-null    float64
 4   DELTA       971 non-null    float64
 5   GAMMA       971 non-null    float64
 6   OUTPUT      971 non-null    float64
 7   POWER_high  971 non-null    float64
 8   POWER_low   971 non-null    float64
 9   UNIT_C      971 non-null    float64
 10  UNIT_K      971 non-null    float64
dtypes: float64(10), object(1)
memory usage: 83.6+ KB


In [17]:
data[data['MODE'] != 'REDACTED']

Unnamed: 0,MODE,AMPS,VOLTS,TEMP,DELTA,GAMMA,OUTPUT,POWER_high,POWER_low,UNIT_C,UNIT_K
1,beam,0.687,2.412,-17.044,0.508,-0.371,-1.062,0.0,1.0,1.0,0.0
3,auto,0.114,0.754,-43.308,-0.760,-0.109,1.847,0.0,1.0,1.0,0.0
4,burst,0.318,3.955,37.752,-0.751,0.492,1.821,1.0,0.0,1.0,0.0
5,burst,0.671,6.303,-4.164,-0.742,-0.707,-0.027,0.0,1.0,1.0,0.0
6,burst,0.923,1.070,-23.668,-0.408,1.789,-0.209,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
965,auto,0.481,0.520,20.216,-0.180,0.904,1.591,0.0,1.0,1.0,0.0
966,auto,0.350,4.257,1.331,0.991,-0.335,2.466,0.0,1.0,0.0,1.0
967,beam,0.644,1.128,-27.707,-0.529,0.034,-1.834,0.0,1.0,1.0,0.0
968,burst,0.887,1.418,-71.690,0.729,-0.503,-0.188,0.0,1.0,0.0,1.0


# Predicting mode to substitute 'REDACTED'

I think that there is something wrong with the 'REDACTED' mode...

In [18]:
from sklearn.preprocessing import LabelEncoder

df = data.copy()
df = df[df['MODE'] != 'REDACTED']
df.reset_index(drop=True, inplace=True)
df = df.drop(['OUTPUT'], axis=1)
X = df
y = df['MODE']
le = LabelEncoder()
y = le.fit_transform(y)
X = X.drop(['MODE'], axis=1)

In [19]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost Regressor
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions on the test data
predictions = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.37


Acc just a little above random (33%)... Maybe not a lot of info behind 'REDACTED'...