# Aykırı Değer Problemini Çözmek

In [1]:
import seaborn as sns
diamonds = sns.load_dataset("diamonds")
df = diamonds.copy()
df = df.select_dtypes(include = ["int64","float64"])
df = df.dropna()

In [2]:
df.head()

Unnamed: 0,carat,depth,table,price,x,y,z
0,0.23,61.5,55.0,326,3.95,3.98,2.43
1,0.21,59.8,61.0,326,3.89,3.84,2.31
2,0.23,56.9,65.0,327,4.05,4.07,2.31
3,0.29,62.4,58.0,334,4.2,4.23,2.63
4,0.31,63.3,58.0,335,4.34,4.35,2.75


In [69]:
df_table = df.table
df_table.head()

0    55.0
1    61.0
2    65.0
3    58.0
4    58.0
Name: table, dtype: float64

In [70]:
Q1 = df_table.quantile(0.25)
Q3 = df_table.quantile(0.75)
IQR = Q3 - Q1
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

In [71]:
outliers_tf = (df_table < lower_limit) | (df_table > upper_limit)
outliers_tf

0        False
1        False
2         True
3        False
4        False
         ...  
53935    False
53936    False
53937    False
53938    False
53939    False
Name: table, Length: 53940, dtype: bool

In [72]:
df_table[outliers_tf]

2        65.0
91       69.0
145      64.0
219      64.0
227      67.0
         ... 
53695    65.0
53697    65.0
53756    64.0
53757    64.0
53785    65.0
Name: table, Length: 605, dtype: float64

In [73]:
df_table[outliers_tf].index

Int64Index([    2,    91,   145,   219,   227,   239,   296,   314,   356,
              359,
            ...
            53226, 53503, 53542, 53577, 53660, 53695, 53697, 53756, 53757,
            53785],
           dtype='int64', length=605)

In [74]:
import pandas as pd

In [75]:
type(df_table)

pandas.core.series.Series

In [76]:
# pandas df'sine çevirmeliyiz

In [77]:
df_table = pd.DataFrame(df_table)
type(df_table)

pandas.core.frame.DataFrame

## Silme

In [78]:
df_table.shape

(53940, 1)

In [79]:
clean_df_table = df_table[ ~ ((df_table < lower_limit) | (df_table > upper_limit)).any(axis = 1)]  
# ~' u ! gibi düşünebiliriz bana aykırı olmayan değerler lazım yani false olan.
# any(axis = 1) ile de sütun bazında bir işlem yaptığımız belirtmek. onu koymayınca silinmesini istediğim değerler NaN geliyor gitmiyor.

In [80]:
clean_df_table

Unnamed: 0,table
0,55.0
1,61.0
3,58.0
4,58.0
5,57.0
...,...
53935,57.0
53936,55.0
53937,60.0
53938,58.0


## Ortalama ile Doldurma

In [81]:
df_table.head()

Unnamed: 0,table
0,55.0
1,61.0
2,65.0
3,58.0
4,58.0


In [82]:
outliers_tf.head()

0    False
1    False
2     True
3    False
4    False
Name: table, dtype: bool

In [83]:
df_table[outliers_tf]

Unnamed: 0,table
2,65.0
91,69.0
145,64.0
219,64.0
227,67.0
...,...
53695,65.0
53697,65.0
53756,64.0
53757,64.0


In [84]:
df_table.mean()

table    57.457184
dtype: float64

In [85]:
clean_df_table = df_table.copy()

In [86]:
clean_df_table[outliers_tf] = df_table.mean()

In [88]:
clean_df_table[outliers_tf]

Unnamed: 0,table
2,57.457184
91,57.457184
145,57.457184
219,57.457184
227,57.457184
...,...
53695,57.457184
53697,57.457184
53756,57.457184
53757,57.457184


## Baskılama Yöntemi : Üst taraflar üst sınıra, alt taraflar alt sınıra eşitlenir.

In [89]:
df_table[outliers_tf]

Unnamed: 0,table
2,65.0
91,69.0
145,64.0
219,64.0
227,67.0
...,...
53695,65.0
53697,65.0
53756,64.0
53757,64.0


In [91]:
lower_limit

51.5

In [92]:
clean_df_table[outliers_tf] = lower_limit

In [94]:
clean_df_table[outliers_tf]

Unnamed: 0,table
2,51.5
91,51.5
145,51.5
219,51.5
227,51.5
...,...
53695,51.5
53697,51.5
53756,51.5
53757,51.5
