In [1]:
import pandas as pd

# Location of the airplane price CSV inside the Kaggle input directory.
DATA = '/kaggle/input/airplane-price-dataset/airplane_price_dataset.csv'

# Read the full CSV into a DataFrame, then preview its first rows.
df = pd.read_csv(DATA)
df.head()

Unnamed: 0,Model,Üretim Yılı,Motor Sayısı,Motor Türü,Kapasite,Menzil (km),Yakıt Tüketimi (L/saat),Saatlik Bakım Maliyeti ($),Yaş,Satış Bölgesi,Fiyat ($)
0,Bombardier CRJ200,1987,2,Turbofan,50,3000,14.36,2185.43,36,Asya,12857080.0
1,Bombardier CRJ200,1997,2,Turbofan,50,3000,4.03,1202.08,26,Avrupa,13914060.0
2,Airbus A320,1988,2,Turbofan,180,6300,13.26,761.38,35,Avustralya,90735700.0
3,Boeing 737,2023,2,Turbofan,162,5700,14.61,592.63,0,Avustralya,136659700.0
4,Cessna 172,1985,1,Piston,4,1285,18.49,4245.99,38,Güney Amerika,203798.1


In [2]:
# Summarize the frame: row count, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12377 entries, 0 to 12376
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Model                       12377 non-null  object 
 1   Üretim Yılı                 12377 non-null  int64  
 2   Motor Sayısı                12377 non-null  int64  
 3   Motor Türü                  12377 non-null  object 
 4   Kapasite                    12377 non-null  int64  
 5   Menzil (km)                 12377 non-null  int64  
 6   Yakıt Tüketimi (L/saat)     12377 non-null  float64
 7   Saatlik Bakım Maliyeti ($)  12377 non-null  float64
 8   Yaş                         12377 non-null  int64  
 9   Satış Bölgesi               12377 non-null  object 
 10  Fiyat ($)                   12377 non-null  float64
dtypes: float64(3), int64(5), object(3)
memory usage: 1.0+ MB


In [3]:
# Display the exact column labels of the loaded frame.
df.columns

Index(['Model', 'Üretim Yılı', 'Motor Sayısı', 'Motor Türü', 'Kapasite',
       'Menzil (km)', 'Yakıt Tüketimi (L/saat)', 'Saatlik Bakım Maliyeti ($)',
       'Yaş', 'Satış Bölgesi', 'Fiyat ($)'],
      dtype='object')

In [4]:
from sklearn.manifold import TSNE

# Seed passed to every stochastic step (t-SNE, train/test split, tree) for repeatable runs.
RANDOM_STATE = 2025

# Numeric columns used as model inputs; the object-typed columns
# (Model, Motor Türü, Satış Bölgesi) are left out.
SOURCE = [
    'Üretim Yılı',
    'Motor Sayısı',
    'Kapasite',
    'Menzil (km)',
    'Yakıt Tüketimi (L/saat)',
    'Saatlik Bakım Maliyeti ($)',
    'Yaş',
]
# Column the models try to predict.
TARGET = 'Fiyat ($)'

# Project the feature columns onto two t-SNE dimensions for plotting.
# NOTE(review): the features are fed in unscaled, so large-magnitude columns may
# dominate the distances — consider standardizing first.
reducer = TSNE(random_state=RANDOM_STATE)
embedding = reducer.fit_transform(df[SOURCE])
reduced_df = pd.DataFrame(embedding, columns=['x', 'y'])
# Attach the price by position (not by index label) so it can color the scatter plot.
reduced_df[TARGET] = df[TARGET].to_numpy()

In [5]:
from plotly import express

# Scatter the 2-D embedding, colored by price; the figure is the cell's displayed output.
embedding_fig = express.scatter(reduced_df, x='x', y='y', color=TARGET)
embedding_fig

What do we see? Higher-priced and lower-priced aircraft form separate clusters in the t-SNE embedding, but it isn't entirely clear that the embedding can distinguish high prices from the very highest prices. Let's build a regression model and see.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Hold out 20% of the rows (shuffled, seeded) to evaluate the model on unseen data.
X_train, X_test, y_train, y_test = train_test_split(
    df[SOURCE],
    df[TARGET],
    test_size=0.2,
    random_state=RANDOM_STATE,
    shuffle=True,
)

# Fit a decision tree on the training split and predict prices for the held-out rows.
tree = DecisionTreeRegressor(random_state=RANDOM_STATE).fit(X_train, y_train)
y_pred = tree.predict(X_test)

In [7]:
import numpy as np

# 'expected' holds the true test prices (y_test); 'actual' holds the model's predictions (y_pred).
result_df = pd.DataFrame({'expected': y_test, 'actual': y_pred})

# Relative absolute error: |true - predicted| as a fraction of the true price.
residual = result_df['expected'] - result_df['actual']
result_df['error'] = np.abs(residual) / result_df['expected']

# True vs. predicted price, colored by relative error; the figure is the cell's displayed output.
error_fig = express.scatter(result_df, x='expected', y='actual', color='error')
error_fig

In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Score the decision tree's predictions on the held-out split.
mse = mean_squared_error(y_test, y_pred)
# Derive RMSE from the MSE directly: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it raises a TypeError on current releases.
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# The error metrics are truncated to whole numbers; R-squared keeps four decimals.
print("MSE:", int(mse))
print("RMSE:", int(rmse))
print("MAE:", int(mae))
print("R-squared: {:5.4f}".format(r2))

MSE: 1706154546202183
RMSE: 41305623
MAE: 22045595
R-squared: 0.9678
