In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv("plane.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Model Name               517 non-null    object 
 1   Engine Type              517 non-null    object 
 2   HP or lbs thr ea engine  517 non-null    object 
 3   Max speed Knots          497 non-null    object 
 4   Rcmnd cruise Knots       507 non-null    float64
 5   Stall Knots dirty        502 non-null    float64
 6   Fuel gal/lbs             517 non-null    int64  
 7   All eng rate of climb    513 non-null    object 
 8   Eng out rate of climb    491 non-null    float64
 9   Takeoff over 50ft        492 non-null    float64
 10  Landing over 50ft        517 non-null    object 
 11  Empty weight lbs         516 non-null    object 
 12  Length ft/in             517 non-null    object 
 13  Wing span ft/in          517 non-null    object 
 14  Range N.M.               4

Data Preprocessing

In [5]:
df["Engine Type"].unique()

array(['Piston', 'Propjet', 'Jet', 'piston'], dtype=object)

In [6]:
df = pd.get_dummies(df,columns=["Engine Type"])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Model Name               517 non-null    object 
 1   HP or lbs thr ea engine  517 non-null    object 
 2   Max speed Knots          497 non-null    object 
 3   Rcmnd cruise Knots       507 non-null    float64
 4   Stall Knots dirty        502 non-null    float64
 5   Fuel gal/lbs             517 non-null    int64  
 6   All eng rate of climb    513 non-null    object 
 7   Eng out rate of climb    491 non-null    float64
 8   Takeoff over 50ft        492 non-null    float64
 9   Landing over 50ft        517 non-null    object 
 10  Empty weight lbs         516 non-null    object 
 11  Length ft/in             517 non-null    object 
 12  Wing span ft/in          517 non-null    object 
 13  Range N.M.               499 non-null    object 
 14  Price                    5

In [8]:
def convert_to_numeric(column):
    # Virgülleri kaldırma
    df[column] = df[column].str.replace(',', '', regex=False)
    
    # Fonksiyon, sadece sayıları içeren bir string döndürür
    def extract_numbers(s):
        match = re.match(r'[-+]?\d*\.\d+|\d+', s)
        return float(match.group(0)) if match else None
    
    # Geçersiz değerleri tespit etme
    mask = df[column].notna() & df[column].str.contains('[^0-9.-]', regex=True)
    invalid_entries = df.loc[mask, column]
    if not invalid_entries.empty:
        print(f"Geçersiz değerler: {column}")
        print(invalid_entries)
    
    # Geçerli sayıları alma
    df[column] = df[column].apply(lambda x: extract_numbers(x) if pd.notna(x) else None)

df.iloc[453]

Model Name                 PA-18 150 Super Cub
HP or lbs thr ea engine                    325
Max speed Knots                            260
Rcmnd cruise Knots                       250.0
Stall Knots dirty                         77.0
Fuel gal/lbs                               165
All eng rate of climb                    1,955
Eng out rate of climb                   1980.0
Takeoff over 50ft                       2076.0
Landing over 50ft                         Orig
Empty weight lbs                           200
Length ft/in                              Orig
Wing span ft/in                           Orig
Range N.M.                               1,137
Price                                      NaN
Engine Type_Jet                          False
Engine Type_Piston                        True
Engine Type_Propjet                      False
Engine Type_piston                       False
Name: 453, dtype: object

In [9]:
# Virgülle ayrılmış sayıları dönüştürme
numeric_columns_with_comma = [
    'All eng rate of climb','Landing over 50ft',
    'Empty weight lbs','Range N.M.'
]
for column in numeric_columns_with_comma:
    convert_to_numeric(column)
df.head()

Geçersiz değerler: All eng rate of climb
106    1030 w/3bld
109    1167 w/3bld
124    1167 w/3bld
Name: All eng rate of climb, dtype: object
Geçersiz değerler: Landing over 50ft
453    Orig
Name: Landing over 50ft, dtype: object


Unnamed: 0,Model Name,HP or lbs thr ea engine,Max speed Knots,Rcmnd cruise Knots,Stall Knots dirty,Fuel gal/lbs,All eng rate of climb,Eng out rate of climb,Takeoff over 50ft,Landing over 50ft,Empty weight lbs,Length ft/in,Wing span ft/in,Range N.M.,Price,Engine Type_Jet,Engine Type_Piston,Engine Type_Propjet,Engine Type_piston
0,100 Darter (S.L. Industries),145,104,91.0,46.0,36,450.0,900.0,1300.0,2050.0,1180.0,25/3,37/5,370.0,1300000.0,False,True,False,False
1,7 CCM Champ,85,89,83.0,44.0,15,600.0,720.0,800.0,1350.0,820.0,20/7,36/1,190.0,1230000.0,False,True,False,False
2,100 Darter (S.L. Industries),90,90,78.0,37.0,19,650.0,475.0,850.0,1300.0,810.0,21/5,35/0,210.0,1600000.0,False,True,False,False
3,7 AC Champ,85,88,78.0,37.0,19,620.0,500.0,850.0,1300.0,800.0,21/5,35/0,210.0,1300000.0,False,True,False,False
4,100 Darter (S.L. Industries),65,83,74.0,33.0,14,370.0,632.0,885.0,1220.0,740.0,21/5,35/0,175.0,1250000.0,False,True,False,False


In [10]:
df.iloc[453]

Model Name                 PA-18 150 Super Cub
HP or lbs thr ea engine                    325
Max speed Knots                            260
Rcmnd cruise Knots                       250.0
Stall Knots dirty                         77.0
Fuel gal/lbs                               165
All eng rate of climb                   1955.0
Eng out rate of climb                   1980.0
Takeoff over 50ft                       2076.0
Landing over 50ft                          NaN
Empty weight lbs                         200.0
Length ft/in                              Orig
Wing span ft/in                           Orig
Range N.M.                              1137.0
Price                                      NaN
Engine Type_Jet                          False
Engine Type_Piston                        True
Engine Type_Propjet                      False
Engine Type_piston                       False
Name: 453, dtype: object

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Model Name               517 non-null    object 
 1   HP or lbs thr ea engine  517 non-null    object 
 2   Max speed Knots          497 non-null    object 
 3   Rcmnd cruise Knots       507 non-null    float64
 4   Stall Knots dirty        502 non-null    float64
 5   Fuel gal/lbs             517 non-null    int64  
 6   All eng rate of climb    513 non-null    float64
 7   Eng out rate of climb    491 non-null    float64
 8   Takeoff over 50ft        492 non-null    float64
 9   Landing over 50ft        516 non-null    float64
 10  Empty weight lbs         516 non-null    float64
 11  Length ft/in             517 non-null    object 
 12  Wing span ft/in          517 non-null    object 
 13  Range N.M.               499 non-null    float64
 14  Price                    5

In [12]:
numeric_columns = [
    'HP or lbs thr ea engine','Max speed Knots',
]
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Model Name               517 non-null    object 
 1   HP or lbs thr ea engine  439 non-null    float64
 2   Max speed Knots          488 non-null    float64
 3   Rcmnd cruise Knots       507 non-null    float64
 4   Stall Knots dirty        502 non-null    float64
 5   Fuel gal/lbs             517 non-null    int64  
 6   All eng rate of climb    513 non-null    float64
 7   Eng out rate of climb    491 non-null    float64
 8   Takeoff over 50ft        492 non-null    float64
 9   Landing over 50ft        516 non-null    float64
 10  Empty weight lbs         516 non-null    float64
 11  Length ft/in             517 non-null    object 
 12  Wing span ft/in          517 non-null    object 
 13  Range N.M.               499 non-null    float64
 14  Price                    5

In [14]:
data = [
    'Length ft/in',
    'Wing span ft/in'
]

In [15]:
def convert_ft_in_to_float(ft_in):
    if pd.isna(ft_in) or '/' not in ft_in:
        return float('nan')
    try:
        feet, inches = ft_in.split('/')
        feet = float(feet)
        inches = float(inches)
        total_feet = feet + (inches / 12)  # 1 foot = 12 inches
        return total_feet
    except ValueError:
        return float('nan')


In [16]:
for column in data:
    df[column] = df[column].apply(convert_ft_in_to_float)


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Model Name               517 non-null    object 
 1   HP or lbs thr ea engine  439 non-null    float64
 2   Max speed Knots          488 non-null    float64
 3   Rcmnd cruise Knots       507 non-null    float64
 4   Stall Knots dirty        502 non-null    float64
 5   Fuel gal/lbs             517 non-null    int64  
 6   All eng rate of climb    513 non-null    float64
 7   Eng out rate of climb    491 non-null    float64
 8   Takeoff over 50ft        492 non-null    float64
 9   Landing over 50ft        516 non-null    float64
 10  Empty weight lbs         516 non-null    float64
 11  Length ft/in             513 non-null    float64
 12  Wing span ft/in          509 non-null    float64
 13  Range N.M.               499 non-null    float64
 14  Price                    5

In [18]:
df[df['HP or lbs thr ea engine'].isna()]

Unnamed: 0,Model Name,HP or lbs thr ea engine,Max speed Knots,Rcmnd cruise Knots,Stall Knots dirty,Fuel gal/lbs,All eng rate of climb,Eng out rate of climb,Takeoff over 50ft,Landing over 50ft,Empty weight lbs,Length ft/in,Wing span ft/in,Range N.M.,Price,Engine Type_Jet,Engine Type_Piston,Engine Type_Propjet,Engine Type_piston
47,C 50 Twin Bonanza,,335.0,328.0,,560,3225.0,3876.0,2630.0,14400.0,9887.0,46.083333,54.416667,1722.0,3480000.0,False,False,True,False
48,50 Twin Bonanza,,314.0,303.0,,539,2979.0,,2508.0,15000.0,9051.0,46.666667,57.916667,1966.0,3500000.0,False,False,True,False
152,402C Business Liner II,,455.0,417.0,,6707,4059.0,5600.0,3300.0,21500.0,12135.0,55.083333,43.750000,1715.0,5100000.0,True,False,False,False
153,"402,-A turbocharged",,457.0,,,6707,4380.0,4540.0,3109.0,19500.0,12130.0,55.083333,43.750000,1715.0,5100000.0,True,False,False,False
154,100 Darter (S.L. Industries),,480.0,442.0,,16665,4200.0,5400.0,3300.0,43250.0,19950.0,68.416667,61.833333,3391.0,4100000.0,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,100 Darter (S.L. Industries),,471.0,426.0,82.0,6250,4400.0,5400.0,3900.0,17500.0,10500.0,50.416667,43.250000,950.0,4000000.0,True,False,False,False
449,"PA-25 235 C,D Pawnee (normal category)",,496.0,435.0,106.0,2660,4050.0,,3440.0,41900.0,20999.0,60.416667,54.416667,,4300000.0,True,False,False,False
468,100 Darter (S.L. Industries),,456.0,438.0,,4260,3180.0,3950.0,2930.0,15780.0,9925.0,48.416667,43.500000,1530.0,4100000.0,True,False,False,False
469,"200 D Meyers (Prop Jets, Inc.)",,430.0,400.0,77.0,4260,3050.0,3850.0,2800.0,14630.0,9410.0,48.416667,43.500000,1513.0,4100000.0,True,False,False,False


In [19]:
#Eksik değerleri KNN agoritması ile Engine Typelara göre doldurma işlemi

engine_type_columns = ['Engine Type_Jet', 'Engine Type_Piston', 'Engine Type_Propjet', 'Engine Type_piston']
df[engine_type_columns] = df[engine_type_columns].astype(int)

df_numeric = df.drop(['Model Name'], axis=1)
df_filled = pd.DataFrame()


In [20]:
for engine_type in engine_type_columns:
    print(f"Processing {engine_type} category...")
    
    # Motor tipi için veri kümesini filtreleme
    df_engine = df[df[engine_type] == 1].copy()
    
    # Kategorik verileri ve sayısal verileri ayırma
    df_engine_numeric = df_engine.select_dtypes(include=['float64', 'int64'])
    
    # Eksik verileri KNN ile doldurma
    knn_imputer = KNNImputer(n_neighbors=4)
    df_engine_numeric_imputed = knn_imputer.fit_transform(df_engine_numeric)
    
    # Doldurulmuş sayısal verileri DataFrame'e geri dönüştürme
    df_engine_numeric_imputed = pd.DataFrame(df_engine_numeric_imputed, columns=df_engine_numeric.columns, index=df_engine.index)
    
    # Kategorik verilerle birleştirme
    df_engine_imputed = pd.concat([df[df[engine_type] == 1].drop(columns=df_engine_numeric.columns), df_engine_numeric_imputed], axis=1)
    
    # Doldurulmuş verileri ana veri kümesine ekleme
    if 'df_filled' in locals():
        df_filled = pd.concat([df_filled, df_engine_imputed])
    else:
        df_filled = df_engine_imputed

Processing Engine Type_Jet category...
Processing Engine Type_Piston category...
Processing Engine Type_Propjet category...
Processing Engine Type_piston category...


In [21]:
df_filled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 517 entries, 152 to 332
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Model Name               517 non-null    object 
 1   Engine Type_Jet          517 non-null    int32  
 2   Engine Type_Piston       517 non-null    int32  
 3   Engine Type_Propjet      517 non-null    int32  
 4   Engine Type_piston       517 non-null    int32  
 5   HP or lbs thr ea engine  517 non-null    float64
 6   Max speed Knots          517 non-null    float64
 7   Rcmnd cruise Knots       517 non-null    float64
 8   Stall Knots dirty        517 non-null    float64
 9   Fuel gal/lbs             517 non-null    float64
 10  All eng rate of climb    517 non-null    float64
 11  Eng out rate of climb    517 non-null    float64
 12  Takeoff over 50ft        517 non-null    float64
 13  Landing over 50ft        517 non-null    float64
 14  Empty weight lbs         517 

In [22]:
df_filled[df_filled['HP or lbs thr ea engine'].isna()]

Unnamed: 0,Model Name,Engine Type_Jet,Engine Type_Piston,Engine Type_Propjet,Engine Type_piston,HP or lbs thr ea engine,Max speed Knots,Rcmnd cruise Knots,Stall Knots dirty,Fuel gal/lbs,All eng rate of climb,Eng out rate of climb,Takeoff over 50ft,Landing over 50ft,Empty weight lbs,Length ft/in,Wing span ft/in,Range N.M.,Price


In [23]:
df_filled.head()

Unnamed: 0,Model Name,Engine Type_Jet,Engine Type_Piston,Engine Type_Propjet,Engine Type_piston,HP or lbs thr ea engine,Max speed Knots,Rcmnd cruise Knots,Stall Knots dirty,Fuel gal/lbs,All eng rate of climb,Eng out rate of climb,Takeoff over 50ft,Landing over 50ft,Empty weight lbs,Length ft/in,Wing span ft/in,Range N.M.,Price
152,402C Business Liner II,1,0,0,0,1008.666667,455.0,417.0,95.0,6707.0,4059.0,5600.0,3300.0,21500.0,12135.0,55.083333,43.75,1715.0,5100000.0
153,"402,-A turbocharged",1,0,0,0,1008.666667,457.0,445.75,91.25,6707.0,4380.0,4540.0,3109.0,19500.0,12130.0,55.083333,43.75,1715.0,5100000.0
154,100 Darter (S.L. Industries),1,0,0,0,1008.666667,480.0,442.0,82.0,16665.0,4200.0,5400.0,3300.0,43250.0,19950.0,68.416667,61.833333,3391.0,4100000.0
155,100 Darter (S.L. Industries),1,0,0,0,1008.666667,480.0,442.0,82.0,14890.0,3400.0,5700.0,3300.0,41400.0,18660.0,68.416667,61.833333,2991.0,4100000.0
156,T 310 P turbocharged,1,0,0,0,1008.666667,491.0,460.0,99.0,7400.0,4340.0,4972.0,3075.0,18300.0,9838.0,48.666667,39.5,2289.0,4500000.0


In [24]:
#En iyi Öğrenme Methodunu seçme
df2 = df_filled.drop('Model Name', axis=1)
X = df2.drop('Price', axis=1) 
y = df2['Price'] 

In [25]:
model_lr = LinearRegression()
model_rf = RandomForestRegressor(n_estimators=100 ,random_state = 35)

In [26]:
#cross-validation
k = 20 

cv_scores_lr = cross_val_score(model_lr, X, y, cv=k, scoring='neg_mean_squared_error')

cv_scores_rf = cross_val_score(model_rf, X, y, cv=k, scoring='neg_mean_squared_error')

print(f"Linear Regression - Mean Squared Error: {-cv_scores_lr.mean()} (+/- {cv_scores_lr.std()})")
print(f"Random Forest - Mean Squared Error: {-cv_scores_rf.mean()} (+/- {cv_scores_rf.std()})")

Linear Regression - Mean Squared Error: 134017352223.40652 (+/- 133808929978.2595)
Random Forest - Mean Squared Error: 89122044147.078 (+/- 53758718312.5223)


In [51]:
#En iyi traizn size ı bulma
best_train_size = None
best_rf_score = -float('inf')

# Test boyutlarını belirleyin
for t_size in range(50, 100):
    train_size = t_size / 100
    
    # Veri setini bölün
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, train_size=train_size, random_state=35)

    # Normalizasyon
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    random_forest_model = model_rf.fit(X_train_scaled, Y_train)

    rf_score = random_forest_model.score(X_test_scaled, Y_test)
        
    if rf_score > best_rf_score:
        best_rf_score = rf_score
        best_train_size = train_size

print(f"Best Train Size for Random Forest: {best_train_size:.2f} with Score: {best_rf_score:.5f}")


Best Train Size for Random Forest: 0.75 with Score: 0.94368


In [52]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, train_size=best_train_size, random_state=35)

In [53]:
#Normalizasyon
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [54]:
random_forest_model = model_rf.fit(X_train_scaled, Y_train)

In [55]:
random_forest_model.score(X_test_scaled, Y_test)

0.9436838025955543