In [1]:
import pandas as pd
import numpy as np

# Location of the fuel dataset CSV inside the Kaggle input directory.
FUEL = '/kaggle/input/fuel-dataset/fuel.csv'

# Read the whole file into a DataFrame and preview the first few rows.
df = pd.read_csv(FUEL)
df.head()

Unnamed: 0,EngDispl,NumCyl,Transmission,FE,AirAspirationMethod,NumGears,TransLockup,TransCreeperGear,DriveDesc,IntakeValvePerCyl,ExhaustValvesPerCyl,CarlineClassDesc,VarValveTiming,VarValveLift
0,4.7,8,AM6,28.0198,NaturallyAspirated,6,1,0,TwoWheelDriveRear,2,2,2Seaters,1,0
1,4.7,8,M6,25.6094,NaturallyAspirated,6,1,0,TwoWheelDriveRear,2,2,2Seaters,1,0
2,4.2,8,M6,26.8,NaturallyAspirated,6,1,0,AllWheelDrive,2,2,2Seaters,1,0
3,4.2,8,AM6,25.0451,NaturallyAspirated,6,1,0,AllWheelDrive,2,2,2Seaters,1,0
4,5.2,10,AM6,24.8,NaturallyAspirated,6,0,0,AllWheelDrive,2,2,2Seaters,1,0


In [2]:
from sklearn.manifold import TSNE
from plotly.express import scatter

# Numerical predictors used for the 2-D embedding (and reused by later cells).
tsne_columns = [
    'EngDispl',
    'NumCyl',
    'NumGears',
    'TransLockup',
    'TransCreeperGear',
    'IntakeValvePerCyl',
    'ExhaustValvesPerCyl',
    'VarValveTiming',
    'VarValveLift',
]

# Fixed random_state keeps the embedding reproducible across runs.
tsne = TSNE(verbose=1, random_state=2024)

# Embed the predictors, then attach the target (aligned on the row index) so it
# can drive the colour scale of the scatter plot.
tsne_df = pd.DataFrame(
    data=tsne.fit_transform(X=df[tsne_columns]),
    columns=['tx', 'ty'],
).assign(target=df['FE'])
scatter(data_frame=tsne_df, x='tx', y='ty', color='target')

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1107 samples in 0.001s...
[t-SNE] Computed neighbors for 1107 samples in 0.029s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1107
[t-SNE] Computed conditional probabilities for sample 1107 / 1107
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 49.923016
[t-SNE] KL divergence after 1000 iterations: -0.287160


This initial sniff suggests there's some signal in our data. Let's build a simple regression model to see how much.

In [3]:
from plotly.express import histogram
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
TARGET = 'FE'  # name of the response column that do_model predicts

def do_model(input_df: pd.DataFrame, columns: list, test_size: float = 0.2, random_state: int = 2024) -> tuple:
    """Fit a linear regression on a train split and score it on the held-out split.

    Parameters
    ----------
    input_df : pd.DataFrame
        Frame holding every predictor named in ``columns`` plus the ``TARGET`` column.
    columns : list
        Labels of the predictor columns. Any iterable of labels is accepted
        (list, tuple, pandas Index), so callers do not need to wrap it in ``list``.
    test_size : float, default 0.2
        Fraction of rows held out for scoring.
    random_state : int, default 2024
        Seed for the train/test shuffle, so repeated calls are comparable.

    Returns
    -------
    tuple
        ``(r2, coefficients)``: the R2 score on the held-out rows and the fitted
        ``coef_`` array, ordered like ``columns``.
    """
    # Normalise to a list so tuples (e.g. from itertools.combinations) index the
    # frame as a set of column labels rather than as a single key.
    columns = list(columns)
    X_train, X_test, y_train, y_test = train_test_split(
        input_df[columns],
        input_df[TARGET].values,
        test_size=test_size,
        random_state=random_state,
    )
    model = LinearRegression(fit_intercept=True).fit(X=X_train, y=y_train)
    return r2_score(y_true=y_test, y_pred=model.predict(X=X_test)), model.coef_

# Baseline: regress the target on all of the numerical columns at once.
r2, weights = do_model(df, tsne_columns)
print('R2: {:6.4f}'.format(r2))
# Plot the fitted coefficients (y) against their column names (x).
histogram(x=tsne_columns, y=weights)

R2: 0.6860


Can we improve our R2 by using a proper subset of the numerical columns? Let's find out.

In [4]:
# https://docs.python.org/2/library/itertools.html#recipes

from itertools import chain
from itertools import combinations
def powerset(iterable):
    """Return an iterator over every subset of ``iterable`` as a tuple.

    Subsets come out grouped by size, from the empty tuple up to the full set,
    e.g. powerset([1, 2, 3]) --> () (1,) (2,) (3,) (1, 2) (1, 3) (2, 3) (1, 2, 3)
    """
    items = tuple(iterable)  # materialise once so one-shot iterators can be reused per size
    sizes = range(len(items) + 1)
    return chain.from_iterable(map(lambda size: combinations(items, size), sizes))

# Step through every combination of two or more numerical columns and report any
# whose R2 matches or beats the score obtained with all of the columns.
best = r2
count = 0
for columns in powerset(tsne_columns):
    if len(columns) < 2:
        continue  # skip the empty set and the single-variable models
    count += 1
    # Only the score is needed here. Keeping just element 0 avoids rebinding the
    # notebook-level `weights`, which would otherwise end up holding the
    # coefficients of whichever subset happened to be evaluated last.
    r2_result = do_model(input_df=df, columns=list(columns))[0]
    if r2_result >= best:
        best = r2_result
        print('{:6.5} {}'.format(best, columns))
print('we tested {} combinations of numerical columns.'.format(count))

 0.686 ('EngDispl', 'NumCyl', 'NumGears', 'TransLockup', 'TransCreeperGear', 'IntakeValvePerCyl', 'ExhaustValvesPerCyl', 'VarValveTiming', 'VarValveLift')
we tested 502 combinations of numerical columns.


Sadly no; this is about the best we're going to do with linear regression on our numerical variables alone. Let's one-hot encode the categorical columns (dummy variables) so the model can use all of the data.

In [5]:
# One-hot encode the frame so the non-numeric columns can enter the regression.
input_df = pd.get_dummies(data=df)
# Every column except the target is a predictor.
all_columns = input_df.columns.drop('FE').tolist()
all_r2, weights = do_model(input_df, all_columns)
print('R2: {:6.4f}'.format(all_r2))
# Column names go on the y axis here; the taller figure leaves room for all of them.
histogram(y=all_columns, x=weights, height=900)

R2: 0.8281


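Once we introduce dummy variables we have far too many columns to run our powerset analysis over individual columns, but we can try something similar at the level of whole categorical variables: add every combination of one to three of them to the numerical columns and compare the resulting R2.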

In [6]:
# Candidate "blocks" are the columns that are neither numerical predictors nor the target.
blocks = [column for column in df.columns if column != 'FE' and column not in tsne_columns]
# Every combination of one, two or three blocks, each as a list of column names.
cases = [list(combo) for combo in powerset(blocks) if 1 <= len(combo) <= 3]

best_case = 0.0  # start from zero in case no cases are better
for case in cases:
    # Encode only the chosen blocks alongside the numerical columns and the target.
    case_df = pd.get_dummies(df[case + tsne_columns + ['FE']])
    result = do_model(
        input_df=case_df,
        columns=[column for column in case_df.columns if column != 'FE'],
    )
    if result[0] > best_case:
        # Report each strict improvement as it is found.
        best_case = result[0]
        print('{:5.4f} {}'.format(best_case, case))

# Compare the best restricted model with the model that used every dummy column.
print('new best combination found' if best_case > all_r2 else 'using all variables is best.')

0.6964 ['Transmission']
0.7536 ['DriveDesc']
0.7942 ['CarlineClassDesc']
0.7967 ['Transmission', 'CarlineClassDesc']
0.7999 ['AirAspirationMethod', 'CarlineClassDesc']
0.8225 ['DriveDesc', 'CarlineClassDesc']
0.8279 ['Transmission', 'DriveDesc', 'CarlineClassDesc']
using all variables is best.
