In [1]:
import pandas as pd

# Path of the airplane price CSV under the Kaggle input directory.
DATA = '/kaggle/input/airplane-price-dataset/airplane_price_dataset.csv'

# Read the whole file into a DataFrame and preview its first rows.
df = pd.read_csv(DATA)
df.head()

Unnamed: 0,Model,Üretim Yılı,Motor Sayısı,Motor Türü,Kapasite,Menzil (km),Yakıt Tüketimi (L/saat),Saatlik Bakım Maliyeti ($),Yaş,Satış Bölgesi,Fiyat ($)
0,Bombardier CRJ200,1987,2,Turbofan,50,3000,14.36,2185.43,36,Asya,12857080.0
1,Bombardier CRJ200,1997,2,Turbofan,50,3000,4.03,1202.08,26,Avrupa,13914060.0
2,Airbus A320,1988,2,Turbofan,180,6300,13.26,761.38,35,Avustralya,90735700.0
3,Boeing 737,2023,2,Turbofan,162,5700,14.61,592.63,0,Avustralya,136659700.0
4,Cessna 172,1985,1,Piston,4,1285,18.49,4245.99,38,Güney Amerika,203798.1


In [2]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12377 entries, 0 to 12376
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Model                       12377 non-null  object 
 1   Üretim Yılı                 12377 non-null  int64  
 2   Motor Sayısı                12377 non-null  int64  
 3   Motor Türü                  12377 non-null  object 
 4   Kapasite                    12377 non-null  int64  
 5   Menzil (km)                 12377 non-null  int64  
 6   Yakıt Tüketimi (L/saat)     12377 non-null  float64
 7   Saatlik Bakım Maliyeti ($)  12377 non-null  float64
 8   Yaş                         12377 non-null  int64  
 9   Satış Bölgesi               12377 non-null  object 
 10  Fiyat ($)                   12377 non-null  float64
dtypes: float64(3), int64(5), object(3)
memory usage: 1.0+ MB


We have a mix of numeric and non-numeric features; let's see what we can do with just the numeric values.

First, let's make an exploratory scatter plot using dimensionality reduction: we'll use t-SNE to reduce our numeric input variables to two dimensions, then color each point by the target variable.

In [3]:
from sklearn.manifold import TSNE

# Seed passed to every stochastic estimator/splitter that accepts one.
RANDOM_STATE = 2025

# Numeric feature columns used as inputs.
SOURCE = [
    'Üretim Yılı',
    'Motor Sayısı',
    'Kapasite',
    'Menzil (km)',
    'Yakıt Tüketimi (L/saat)',
    'Saatlik Bakım Maliyeti ($)',
    'Yaş',
]

# Column holding the price we want to explain.
TARGET = 'Fiyat ($)'

# Project the numeric features onto two dimensions with t-SNE.
reducer = TSNE(random_state=RANDOM_STATE)
embedding = reducer.fit_transform(df[SOURCE])

# Wrap the 2-D embedding in a frame and attach each row's price for coloring.
reduced_df = pd.DataFrame(embedding, columns=['x', 'y'])
reduced_df[TARGET] = df[TARGET].to_numpy()

In [4]:
from plotly import express
from plotly.offline import init_notebook_mode

# Initialise plotly's notebook mode before rendering any figure.
init_notebook_mode(connected=True)

# Scatter of the 2-D embedding, each point colored by its price.
embedding_fig = express.scatter(reduced_df, x='x', y='y', color=TARGET)
embedding_fig.show(renderer='iframe_connected')

What do we see? The higher prices and the lower prices form separate clusters, but it isn't entirely clear that the embedding can distinguish higher prices from the highest prices. Let's build a regression model and see.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Hold out 20% of the rows for evaluation; the split is shuffled and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    df[SOURCE],
    df[TARGET],
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Fit a decision tree on the training rows, then predict the held-out prices.
tree = DecisionTreeRegressor(random_state=RANDOM_STATE)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

We have our predictions. Let's make a DataFrame from our results and plot the predicted values against the true values, colored by the relative absolute error; the spread of the points around the diagonal lets us see whether the errors are symmetrically distributed or not.

In [6]:
import numpy as np

# Pair each held-out price ('expected') with the model's prediction ('actual').
result_df = pd.DataFrame({'expected': y_test, 'actual': y_pred})

# Relative absolute error: |expected - actual| / expected.
residual = result_df['expected'] - result_df['actual']
result_df['error'] = residual.abs() / result_df['expected']

# Predicted vs. true price, colored by the relative error.
error_fig = express.scatter(result_df, x='expected', y='actual', color='error')
error_fig.show(renderer='iframe_connected')

Let's take another look with a log plot.

In [7]:
# Same predicted-vs-true scatter, with both axes on a logarithmic scale.
log_error_fig = express.scatter(
    result_df,
    x='expected',
    y='actual',
    color='error',
    log_x=True,
    log_y=True,
)
log_error_fig.show(renderer='iframe_connected')

Interestingly, when we make a log plot we can clearly see that our prices fall into four apparent clusters, which is not something that was apparent in our data.

In [8]:
# Histogram of the target price over the full dataset.
price_hist = express.histogram(df, x=TARGET)
price_hist.show(renderer='iframe_connected')

Let's take a look at our metrics.

In [9]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Regression metrics for the held-out predictions.
mse = mean_squared_error(y_test, y_pred)
# RMSE is taken as the square root of the MSE rather than by passing
# squared=False: scikit-learn deprecated that argument in 1.4 and removed it
# in later releases, so the old call fails on current versions.
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# The error metrics are truncated to integers for display; R-squared keeps
# four decimal places.
print('MSE:', int(mse))
print('RMSE:', int(rmse))
print('MAE:', int(mae))
print(f'R-squared: {r2:5.4f}')

MSE: 1706154546202183
RMSE: 41305623
MAE: 22045595
R-squared: 0.9678
