In [8]:
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
df = pd.read_csv("/Users/mineozfen/Desktop/superstore.csv", encoding='ISO-8859-1')
df.head() ##İlk beş satırı döndürür
print(df.columns)

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')


In [9]:
# 2. Gereksiz Sütunları Kaldır
df.drop(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Customer ID',
         'Customer Name', 'Product ID', 'Product Name', 'Postal Code'], axis=1, inplace=True)


In [18]:
# 3. Eksik Veri Kontrolü
missing_summary = df.isnull().sum()

# 4. Kategorik Değişkenleri Sayısala Çevir (One-Hot Encoding)
df = pd.get_dummies(df, drop_first=True)
print(df)

        Sales  Quantity  Discount   Profit  Ship Mode_Same Day  \
0     261.960         2       0.0  41.9136               False   
2      14.620         2       0.0   6.8714               False   
4      22.368         2       0.2   2.5164               False   
5      48.860         7       0.0  14.1694               False   
6       7.280         4       0.0   1.9656               False   
...       ...       ...       ...      ...                 ...   
9988  206.100         5       0.0  55.6470               False   
9989   25.248         3       0.2   4.1028               False   
9990   91.960         2       0.0  15.6332               False   
9991  258.576         2       0.2  19.3932               False   
9992   29.600         4       0.0  13.3200               False   

      Ship Mode_Second Class  Ship Mode_Standard Class  Segment_Corporate  \
0                       True                     False              False   
2                       True                     Fals

In [20]:
# 5. Aykırı Değerleri İnceleme ve Temizleme (Sales ve Profit için)
Q1 = df['Profit'].quantile(0.25)
Q3 = df['Profit'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df['Profit'] >= lower_bound) & (df['Profit'] <= upper_bound)]


In [12]:
# 6. Hedef ve Özelliklerin Ayrılması
X = df.drop(['Profit'], axis=1)
y = df['Profit']

In [21]:
# 7. Ölçekleme
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [22]:
# 8. Veri Bölme (Train-Validation-Test: %60-%20-%20)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 9. Modellerin Eğitilmesi ve Karşılaştırılması
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Random Forest": RandomForestRegressor(random_state=42)
}

results = []

In [25]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    results.append((name, rmse))

# Sonuçları DataFrame olarak göster
results_df = pd.DataFrame(results, columns=["Model", "Validation RMSE"])
print(results_df)


               Model  Validation RMSE
0  Linear Regression        15.713317
1   Ridge Regression        15.711462
2      Random Forest         9.738746
3  Linear Regression        15.713317
4   Ridge Regression        15.711462
5      Random Forest         9.738746
6  Linear Regression        15.713317
7   Ridge Regression        15.711462
8      Random Forest         9.738746
