Genel olarak yapacağınız adımlar şunlar olacak:

    1- Kullanacağınız veriyi indirip, okumak
    2- Verinizin içindeki eksik ve kategorik değişkenler ile ilgilenip modele besleyeceğimiz hale getirmek 
    3- İlgilendiğiniz probleme göre error metriğine karar vermek (derste gördüğümüz RMSE-RMSLE gibi)
    4- Verinizi train-validation-test diye bölmek (burada validation ve test'in gerçek hayatı yansıtması çok önemli)
    5- Olabildiğince fazla model denemek ve metriğimizde en iyi yapanı seçmek

# Gerekli Kütüphaneler

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

# 1 - Veriyi indirip, okumak

In [2]:
data = pd.read_csv("data_with_nans.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0,0,5.1,3.5,1.4,0.2,Iris-setosa
1,1,1,4.7,3.2,1.6,0.2,Iris-setosa
2,2,2,4.9,3.1,1.5,0.1,Iris-setosa
3,3,3,4.4,2.9,1.4,0.2,Iris-setosa
4,4,4,5.0,3.4,1.5,0.2,Iris-setosa


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170 entries, 0 to 169
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     170 non-null    int64  
 1   Id             170 non-null    int64  
 2   SepalLengthCm  167 non-null    float64
 3   SepalWidthCm   169 non-null    float64
 4   PetalLengthCm  166 non-null    float64
 5   PetalWidthCm   168 non-null    float64
 6   Species        170 non-null    object 
dtypes: float64(4), int64(2), object(1)
memory usage: 9.4+ KB


# 2 - Eksik ve gereksiz verilerin düşürülmesi

In [5]:
data.columns

Index(['Unnamed: 0', 'Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm',
       'PetalWidthCm', 'Species'],
      dtype='object')

In [7]:
data.drop(labels=[data.columns[0]], axis=1, inplace=True)

In [9]:
data.tail(20)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
150,5.8,2.7,5.1,1.9,Iris-virginica
151,6.8,3.2,5.9,2.3,Iris-virginica
152,6.7,3.3,5.7,2.5,Iris-virginica
153,6.3,2.5,5.0,1.9,Iris-virginica
154,6.3,2.7,4.9,1.8,Iris-virginica
155,6.5,3.0,5.2,2.0,Iris-virginica
156,6.2,3.4,5.4,2.3,Iris-virginica
157,5.9,3.0,5.1,1.8,Iris-virginica
158,6.4,2.8,5.6,2.2,Iris-virginica
159,7.7,3.8,6.7,2.2,Iris-virginica


In [16]:
data.describe()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,167.0,170.0,170.0,170.0
mean,5.826347,3.072781,3.707229,1.180357
std,0.827189,0.446252,1.768079,0.760509
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.5,0.3
50%,5.8,3.0,4.2,1.3
75%,6.4,3.4,5.1,1.8
max,7.9,4.4,6.9,2.5


In [17]:
data.groupby("Species").agg(["min", "max", "std", "mean"])

Unnamed: 0_level_0,SepalLengthCm,SepalLengthCm,SepalLengthCm,SepalLengthCm,SepalWidthCm,SepalWidthCm,SepalWidthCm,SepalWidthCm,PetalLengthCm,PetalLengthCm,PetalLengthCm,PetalLengthCm,PetalWidthCm,PetalWidthCm,PetalWidthCm,PetalWidthCm
Unnamed: 0_level_1,min,max,std,mean,min,max,std,mean,min,max,std,mean,min,max,std,mean
Species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Iris-setosa,4.3,6.9,0.429014,5.034483,2.3,4.4,0.38954,3.384546,1.0,5.4,0.737919,1.643574,0.1,2.1,0.314673,0.314673
Iris-versicolor,4.9,7.0,0.516189,5.932075,2.0,4.4,0.382355,2.805556,1.5,5.1,0.618195,4.17249,0.4,1.8,0.235049,1.314815
Iris-virginica,4.9,7.9,0.650025,6.546429,2.2,3.9,0.355203,2.996429,1.3,6.9,0.795195,5.469643,0.4,2.5,0.366439,1.978221


In [18]:
data.isna().sum()

SepalLengthCm    3
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [19]:
for column in data.columns[1:-1]:
    data[column].fillna(value=data[column].mean(), inplace=True)

In [20]:
data.isna().sum()

SepalLengthCm    3
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [21]:
data.dropna(axis=0, how="any", inplace=True)

In [22]:
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'sklearn'

In [23]:

X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], data.iloc[:, -1], test_size=0.2)

NameError: name 'train_test_split' is not defined

In [None]:
y_train