# Tek Değişkenli Aykırı Gözlem Analizi

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
import seaborn as sns
diamonds = sns.load_dataset('diamonds') 
df = diamonds.copy()
df = df.select_dtypes(include = ['float64', 'int64']) 
df.head()

In [None]:
df_table = df["table"].copy()

In [None]:
sns.boxplot(x = df_table)

In [None]:
# Tukey fences for the "table" column: anything further than 1.5 * IQR
# below the first quartile or above the third quartile counts as an outlier.
Q1, Q3 = df_table.quantile(0.25), df_table.quantile(0.75)
IQR = Q3 - Q1

# alt_sinir is the lower fence, ust_sinir the upper fence.
alt_sinir, ust_sinir = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Echo the upper fence as the cell output.
ust_sinir

In [None]:
(df_table < (alt_sinir)) | (df_table > (ust_sinir))

In [None]:
df_table < (alt_sinir)

In [None]:
aykiri_tf = df_table < (alt_sinir)

In [None]:
aykiri_tf[0:10]

In [None]:
aykirilar = df_table[aykiri_tf]
aykirilar.index

# Aykırı Değer Probleminin Çözülmesi

In [None]:
import pandas as pd

In [None]:
df_table.head()
type(df_table)
df_table.shape

In [None]:
# Keep only the observations that lie inside the [alt_sinir, ust_sinir] fences.
# df_table is a one-dimensional Series, so the boolean mask is negated and used
# directly; a row-wise .any(axis = 1) reduction only exists for DataFrames and
# raises "No axis named 1" on a Series.
temiz_df_table = df_table[~((df_table < alt_sinir) | (df_table > ust_sinir))]


In [None]:
temiz_df_table.shape

In [None]:
df_table = df["table"].copy()

In [None]:
sns.boxplot(x = df_table)

In [None]:
df_table[aykiri_tf]

In [None]:
df_table.mean()

In [None]:
df_table[aykiri_tf] = df_table.mean()

In [None]:
df_table[aykiri_tf]

In [None]:
aykiri_tf = (df_table < (alt_sinir)) | (df_table > (ust_sinir))

In [None]:
df_table[aykiri_tf].head()

In [None]:
df_table.describe()

In [None]:
df_table[aykiri_tf] = df_table.mean()

In [None]:
df_table.describe()

In [None]:
df_table = df["table"].copy()

In [None]:
aykiri_tf = df_table < (alt_sinir)

In [None]:
df_table[aykiri_tf]

In [None]:
df_table[aykiri_tf] = alt_sinir 

In [None]:
df_table[aykiri_tf]

# Çok Değişkenli Aykırı Gözlem Analizi

## Local Outlier Factor

In [None]:
from IPython.display import Image
Image(filename =  "lof_intuition.png" , width=400, height=400)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor

# Fix the seed so the synthetic data set is reproducible.
np.random.seed(42)

# One Gaussian cloud (mean 70, std 3, 100 points in 2-D) ...
base_cloud = np.random.normal(70, 3, (100, 2))

# ... stacked twice, shifted by +10 and by -10, to form two inlier clusters.
X_inliers = np.vstack((base_cloud + 10, base_cloud - 10))

print(X_inliers.shape)
print(X_inliers[:3, :2])

In [None]:
X_outliers = np.random.uniform(low=15, high=130, size=(20, 2))

In [None]:
X_outliers

In [None]:
X = np.r_[X_inliers, X_outliers]

In [None]:
X[0:3,:]

In [None]:
LOF = LocalOutlierFactor(n_neighbors = 20, contamination = 0.1)

In [None]:
LOF.fit_predict(X)

In [None]:
X_score = LOF.negative_outlier_factor_

In [None]:
X_score[0:3]

In [None]:
X_score.mean()

In [None]:
X_score.std()

In [None]:
np.sort(X_score)[0:10]

In [None]:
# Density histogram of the negative LOF scores.
plt.hist(X_score, bins = "auto", density = True)
# plt.show must be called; the bare attribute reference did nothing.
plt.show()

In [None]:
plt.scatter(X[:,0], X[:,1], color = "k", s = 3, label = "Gözlem Birimleri");

In [None]:
# Min-max scale the LOF scores into [0, 1], inverted so that the most negative
# score maps to 1 and the largest score maps to 0.
radius = (X_score.max() - X_score) / (X_score.max() - X_score.min())

In [None]:
# Every observation as a small black dot ...
plt.scatter(X[:,0], X[:,1], color = "k", s = 3, label = "Gözlem Birimleri");

# ... plus a hollow red ring per observation whose marker size is 1000 * radius.
plt.scatter(X[:, 0], X[:, 1], s = 1000 * radius, edgecolors='r', 
            facecolors='none',label='LOF Skorları')

plt.xlim((10,100))
plt.ylim((10,100))

legend = plt.legend(loc = "upper left")

# Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and 3.9 removed
# the old spelling; look up the new name first and fall back for old releases.
handles = getattr(legend, "legend_handles", None)
if handles is None:
    handles = legend.legendHandles

# Use fixed marker sizes in the legend instead of the per-point sizes.
handles[0]._sizes = [10]
handles[1]._sizes = [30]

In [None]:
X[0:3]

In [None]:
np.sort(X_score)[0:9]

In [None]:
esik_deger = np.sort(X_score)[9]
esik_deger

In [None]:
(X_score > esik_deger)[200:220]

In [None]:
tf_vektor = (X_score > esik_deger)

In [None]:
X[X_score < esik_deger]

In [None]:
X[~tf_vektor]

In [None]:
X[X_score < esik_deger]

In [None]:
X[200:220]

# Aykırı Gözlem Problemini Çözmek

In [None]:
df = X[X_score > esik_deger]

In [None]:
df[0:10]

In [None]:
df_X = X.copy()

In [None]:
# df_X is a NumPy array, so df_X[0] would be the first observation (a row).
# Column means need an explicit column slice.
np.mean(df_X[:, 0])
np.mean(df_X[:, 1])

In [None]:
df_X[~tf_vektor]

In [None]:
aykirilar = df_X[~tf_vektor]

In [None]:
aykirilar[:,:1]

In [None]:
# Overwrite the first coordinate of the outliers with the mean of the first
# column; df_X[0] on a NumPy array is the first row, not the first column.
aykirilar[:,:1] = np.mean(df_X[:, 0])

In [None]:
# Overwrite the second coordinate of the outliers with the mean of the second
# column; df_X[1] on a NumPy array is the second row, not the second column.
aykirilar[:,1:2] = np.mean(df_X[:, 1])

In [None]:
aykirilar

In [None]:
df_X[~tf_vektor] = aykirilar

In [None]:
df_X[~tf_vektor]

In [None]:
df_X = X.copy()

In [None]:
df_X[~tf_vektor]

In [None]:
df_X[X_score == esik_deger]

In [None]:
df_X[~tf_vektor] = df_X[X_score == esik_deger]

In [None]:
df_X[~tf_vektor]

# Eksik Veri - Hızlı Çözüm

In [None]:
import numpy as np
import pandas as pd

# Toy data with deliberately missing entries. np.nan is the canonical
# spelling; the np.NaN alias was removed in NumPy 2.0.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])

df = pd.DataFrame(
        {"V1" : V1,
         "V2" : V2,
         "V3" : V3}
)

df

In [None]:
df.isnull().sum()

In [None]:
df.dropna()

In [None]:
df

In [None]:
dff = df.dropna()

In [None]:
dff.isnull().sum()

In [None]:
df["V1"].mean()

In [None]:
df["V1"].fillna(df["V1"].mean())

In [None]:
df["V1"].fillna(0)

In [None]:
df.apply(lambda x: x.fillna(x.mean()), axis = 0)

## Eksik veriyi saptamak

In [None]:
import numpy as np
import pandas as pd

# Rebuild the toy frame for the missing-value detection examples.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])

df = pd.DataFrame(
        {"V1" : V1,
         "V2" : V2,
         "V3" : V3}
)

df

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.dtypes

In [None]:
df.notnull().sum()

In [None]:
df.isnull().sum()

In [None]:
df.isnull().sum().sum()

In [None]:
df.isnull()

In [None]:
df[df.isnull().any(axis = 1)]

In [None]:
df[df.notnull().all(axis = 1)]

In [None]:
df[df["V1"].notnull() & df["V2"].notnull() & df["V3"].notnull()]

## Görselleştirme

In [None]:
!pip install missingno

In [None]:
import missingno as msno

In [None]:
df.head()

In [None]:
msno.bar(df);

In [None]:
df.isnull().sum()

In [None]:
df

In [None]:
import seaborn as sns
sns.heatmap(df.isnull(), cbar = False);

In [None]:
msno.matrix(df)

In [None]:
df = sns.load_dataset("planets").copy()
df.head()

In [None]:
import seaborn as sns
sns.heatmap(df.isnull(), cbar = False);

In [None]:
msno.matrix(df);

In [None]:
msno.heatmap(df);

In [None]:
# Draw 1000 uniform random numbers, arrange them as 50 rows x 20 columns and
# threshold at 0.5 to obtain a boolean matrix.
null_pattern = (np.random.random(1000).reshape((50, 20)) > 0.5).astype(bool)

# Map every False cell to None so it can act as a missing entry.
# NOTE(review): relies on DataFrame.replace accepting None as a replacement
# value; confirm on the installed pandas version.
null_pattern = pd.DataFrame(null_pattern).replace({False: None})

# Index the rows by monthly periods from 1/1/2011 to 2/1/2015 and draw the
# missingno matrix with freq='BQ'.
msno.matrix(null_pattern.set_index(pd.period_range('1/1/2011', '2/1/2015', freq='M')) , freq='BQ');

## Silme Yöntemleri

In [None]:
# Fresh toy frame for the deletion examples.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])

df = pd.DataFrame(
        {"V1" : V1,
         "V2" : V2,
         "V3" : V3}
)

df

In [None]:
df.dropna()

In [None]:
df.dropna(how = "all")

In [None]:
df.dropna(axis = 1)

In [None]:
# Write through a single .loc call. The chained form df["V1"][[3, 6]] = 99
# assigns into an intermediate object and is not guaranteed to reach df
# (it is a no-op under pandas Copy-on-Write).
df.loc[[3, 6], "V1"] = 99

In [None]:
df.dropna(axis = 1)

In [None]:
df.dropna(axis = 1, how = "all")

In [None]:
df["sil_beni"] = np.nan

In [None]:
df

In [None]:
df.dropna(axis = 1, how = "all", inplace = True)

In [None]:
df

## Basit Değer Atama Yöntemleri

In [None]:
# Fresh toy frame for the simple imputation examples.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])

df = pd.DataFrame(
        {"V1" : V1,
         "V2" : V2,
         "V3" : V3}
)

df

In [None]:
df["V1"].fillna(0)

In [None]:
df["V1"].fillna(df["V1"].mean())

In [None]:
df.apply(lambda x: x.fillna(x.mean()), axis = 0 )

In [None]:
df.fillna(df.mean()[:])

In [None]:
# Fill V1 and V2 with their own column means; V3 is left untouched because
# the label slice drops it from the fill values.
df.fillna(df.mean()["V1":"V2"])
# Fill only V3 with its median. Passing the bare scalar df.median()["V3"]
# would fill the gaps of every column with V3's median.
df.fillna({"V3": df["V3"].median()})

In [None]:
df.where(pd.notna(df), df.mean(), axis = "columns")

In [None]:
# Toy frame with a categorical "departman" column for group-wise imputation.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])
V4 = np.array(["IT","IT","IK","IK","IK","IK","IK","IT","IT"])

df = pd.DataFrame(
        {"maas" : V1,
         "V2" : V2,
         "V3" : V3,
        "departman" : V4}
)

df

In [None]:
df.groupby("departman")["maas"].mean()

In [None]:
df["maas"].fillna(df.groupby("departman")["maas"].transform("mean"))

## Kategorik Değişkenlerde Değer Atama

In [None]:
# Toy frame for categorical imputation.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])
# Mixing strings with np.nan yields a string array, so the missing entry is
# stored as the literal text "nan" rather than as a real missing value.
V4 = np.array(["IT",np.nan,"IK","IK","IK","IK","IK","IT","IT"])

df = pd.DataFrame(
        {"maas" : V1,
         "V2" : V2,
         "V3" : V3,
        "departman" : V4}
)

df

In [None]:
df.isnull()

In [None]:
df.groupby("departman")["departman"].count()

In [None]:
# Replace the textual "nan" placeholder with "IK" in a single .loc call on df;
# assigning through df.departman.loc[...] is chained assignment and may not
# write back to df.
df.loc[df["departman"] == "nan", "departman"] = "IK"

In [None]:
df

In [None]:
# Copy the value of V3 in row 0 into departman in row 0, addressing df
# directly with .loc instead of chained attribute-then-index assignment.
df.loc[0, "departman"] = df.loc[0, "V3"]

In [None]:
df

In [None]:
df.groupby("departman")["departman"].count()

In [None]:
# mode() returns a Series (ties are possible). Passing that Series to fillna
# aligns on index labels and would only fill the rows whose label appears in
# it, so take the first mode as a scalar fill value instead.
df.departman.fillna(df["departman"].mode()[0])

## Zaman Serilerinde Atama İşlemleri

In [None]:
# Toy frame for the interpolation / back-fill examples.
# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])
V4 = np.array(["IT","IT","IK","IK","IK","IK","IK","IT","IT"])

df = pd.DataFrame(
        {"maas" : V1,
         "V2" : V2,
         "V3" : V3,
        "departman" : V4}
)

df

In [None]:
df["maas"].interpolate()

In [None]:
# Fill each gap with the next valid observation. Series.bfill() is the
# supported spelling; fillna(method = "bfill") is deprecated in pandas 2.x.
df["maas"].bfill()

## Tahmine Dayalı Değer Atama Yöntemleri

In [None]:
import seaborn as sns
df = sns.load_dataset('planets').copy()
df = df.select_dtypes(include = ['float64', 'int64'])
print(df.isnull().sum())
msno.matrix(df);

In [None]:
#!pip install fancyimpute

In [None]:
from fancyimpute import KNN

In [None]:
import pandas as pd

In [None]:
var_names = list(df)

## KNN

In [None]:
knn_imp = KNN(k = 5).fit_transform(df);

In [None]:
knn_imp[0:1]

In [None]:
dff = pd.DataFrame(knn_imp)

In [None]:
dff.head()

In [None]:
dff.columns = var_names

In [None]:
dff.head()

In [None]:
dff.isnull().sum()

In [None]:
!pip install ycimpute

In [None]:
from ycimpute.imputer import knnimput

In [None]:
var_names = list(df)

In [None]:
n_df = np.array(df)

In [None]:
n_df.shape

In [None]:
dff = knnimput.KNN(k=4).complete(n_df)

In [None]:
dff = pd.DataFrame(dff, columns = var_names)

In [None]:
dff.head()

In [None]:
dff.isnull().sum()

## Random Forests ile Atama

In [None]:
import seaborn as sns
df = sns.load_dataset('planets').copy()
df = df.select_dtypes(include = ['float64', 'int64'])
print(df.isnull().sum())
msno.matrix(df);

In [None]:
from ycimpute.imputer import iterforest

In [None]:
var_names = list(df)

In [None]:
n_df = np.array(df)

In [None]:
dff = iterforest.IterImput().complete(n_df)

In [None]:
dff = pd.DataFrame(dff, columns = var_names)

In [None]:
dff.isnull().sum()

## EM ile Atama

In [None]:
df.head()

In [None]:
from ycimpute.imputer import EM

In [None]:
var_names = list(df)

In [None]:
n_df = np.array(df)

In [None]:
dff = EM().complete(n_df)

In [None]:
dff = pd.DataFrame(dff, columns = var_names)

In [None]:
dff.isnull().sum()

# Veri Standardizasyonu & Değişken Dönüşümü

## Standartlaştırma

In [None]:
import numpy as np
import pandas as pd

# Three small integer vectors used by the scaling / transformation examples.
V1, V2, V3 = (
    np.array(values)
    for values in ([1, 3, 6, 5, 7], [7, 7, 5, 8, 12], [6, 12, 5, 6, 14])
)

# Assemble the frame and cast every column to float in one step.
df = pd.DataFrame({"V1": V1, "V2": V2, "V3": V3}).astype(float)
df

In [None]:
from sklearn import preprocessing

In [None]:
preprocessing.scale(df)

## Normalizasyon

In [None]:
preprocessing.normalize(df)

## Min-Max Dönüşümü

In [None]:
scaler = preprocessing.MinMaxScaler(feature_range = (10,20))

In [None]:
scaler.fit_transform(df)

## Binarize Dönüşüm

In [None]:
binarizer = preprocessing.Binarizer(threshold = 5).fit(df)

In [None]:
binarizer.transform(df)

## 0-1 Dönüşümü

In [None]:
import seaborn as sns
tips = sns.load_dataset('tips')
df = tips.copy()
df_l = df.copy()

In [None]:
df_l.head()

In [None]:
df_l["yeni_sex"] = df_l["sex"].cat.codes

In [None]:
df_l.head()

In [None]:
lbe = preprocessing.LabelEncoder()

In [None]:
df_l["daha_yeni_sex"] = lbe.fit_transform(df_l["sex"])

In [None]:
df_l.head()

## "1 ve Diğerleri (0)" Dönüşümü

In [None]:
df.head()

In [None]:
df_l.head()

In [None]:
df_l["yen_gun"] = np.where(df_l["day"].str.contains("Sun"),1,0)

In [None]:
df_l.head(20)

## Çok Sınıflı Dönüşüm

In [None]:
lbe = preprocessing.LabelEncoder()

In [None]:
df_l["daha_yeni_gun"] = lbe.fit_transform(df_l["day"])

In [None]:
df_l

## One-Hot Dönüşümü ve Dummy Değişken Tuzağı

In [None]:
df_one_hot = df.copy()

In [None]:
pd.get_dummies(df_one_hot, columns = ["sex"], prefix = ["sex"]).head()

In [None]:
pd.get_dummies(df_one_hot, columns = ["day"], prefix = ["day"]).head()

## Sürekli Değişkeni Kategorik Değişkene Çevirme

In [None]:
df.head()

In [None]:
dff = df.select_dtypes(include = ["float64", "int64"])

In [None]:
est = preprocessing.KBinsDiscretizer(n_bins = [3,2,2], encode = "ordinal", strategy = "quantile").fit(dff)

In [None]:
est.transform(dff)[0:10]

## Değişkeni İndexe, İndexi Değişkene Çevirmek

In [None]:
df.head()

In [None]:
df["yeni_degisken"]  = df.index

In [None]:
df["yeni_degisken"] = df["yeni_degisken"] + 10

In [None]:
df.head()

In [None]:
df.index = df["yeni_degisken"]

In [None]:
df.index