# Pandas Serisi Olusturmak

In [1]:
import pandas as pd

In [2]:
pd.Series([10,88,3,4,5])

0    10
1    88
2     3
3     4
4     5
dtype: int64

**serini ifadelerei ve index degerleri bir arada verilmistir numpy dan ayiran en buyuk ozelliklerden biri**

In [5]:
seri = pd.Series([10,88,3,4,5])

In [6]:
type(seri)

pandas.core.series.Series

In [7]:
seri.axes

[RangeIndex(start=0, stop=5, step=1)]

In [8]:
seri.dtype

dtype('int64')

In [9]:
seri.size

5

In [10]:
seri.ndim

1

In [11]:
seri.values

array([10, 88,  3,  4,  5], dtype=int64)

In [12]:
seri.head()

0    10
1    88
2     3
3     4
4     5
dtype: int64

In [13]:
seri.head(3)

0    10
1    88
2     3
dtype: int64

In [14]:
seri.tail(3)

2    3
3    4
4    5
dtype: int64

In [15]:
#index isimlendirilmesi

In [16]:
pd.Series([99,22,345,94,5])

0     99
1     22
2    345
3     94
4      5
dtype: int64

In [17]:
pd.Series([99,22,345,94,5], index =[1,3,5,7,9])

1     99
3     22
5    345
7     94
9      5
dtype: int64

In [18]:
seri = pd.Series([99,22,345,94,5], index = ['a','b','c','d','e'])

In [19]:

seri['a']

99

In [20]:
seri['a':'c']

a     99
b     22
c    345
dtype: int64

**a dan c ye kadar git demis oluyoruz**

In [21]:
#sozluk uzerinden liste olusturmak

In [22]:
sozluk = pd.Series({'reg':10, 'log':11, 'cart': 12})

In [23]:
sozluk

reg     10
log     11
cart    12
dtype: int64

**ayni islem soylede yapilabilir**

In [24]:
sozluk = {'reg':10, 'log':11, 'cart': 12}

In [25]:
seri = pd.Series(sozluk)

In [26]:
seri

reg     10
log     11
cart    12
dtype: int64

In [27]:
#iki seriyibirlestirerek seri olusturma

In [28]:
pd.concat([seri,seri])

reg     10
log     11
cart    12
reg     10
log     11
cart    12
dtype: int64

**Eleman Islemleri**

In [29]:
import numpy as np
a = np.array([1,2,33,444,75])
seri = pd.Series(a)
seri

0      1
1      2
2     33
3    444
4     75
dtype: int32

In [30]:
seri[0]

1

In [31]:
seri[0:3]

0     1
1     2
2    33
dtype: int32

In [32]:
seri = pd.Series([121,200,150,99],
                 index = ['reg','loj','cart','rf'])

In [33]:
seri

reg     121
loj     200
cart    150
rf       99
dtype: int64

In [34]:
seri.index

Index(['reg', 'loj', 'cart', 'rf'], dtype='object')

In [35]:
seri.keys

<bound method Series.keys of reg     121
loj     200
cart    150
rf       99
dtype: int64>

In [36]:
list(seri.items())

[('reg', 121), ('loj', 200), ('cart', 150), ('rf', 99)]

In [37]:
seri.values

array([121, 200, 150,  99], dtype=int64)

In [38]:
 #eleman sorgulama

In [39]:
"reg" in seri

True

In [40]:
'a' in seri

False

In [41]:
#fancy eleman

In [42]:
seri[['rf','reg']]

rf      99
reg    121
dtype: int64

In [43]:
seri['reg'] = 130

In [44]:
seri['reg']

130

In [45]:
seri['reg':'loj']

reg    130
loj    200
dtype: int64

**Pandas DataFrame Olusturma**

In [46]:
import pandas as pd

In [47]:
l = [1,2,39,67,90]

In [48]:
 pd.DataFrame(l, columns = ['degisken_ismi'])

Unnamed: 0,degisken_ismi
0,1
1,2
2,39
3,67
4,90


In [49]:
import numpy as np
m = np.arange(1,10).reshape((3,3))
m

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [50]:
  pd.DataFrame(m, columns =['var1', 'var2', 'var3'])

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [51]:
#df isimlendirme

In [52]:
df = pd.DataFrame(m, columns =['var1', 'var2', 'var3'])
df.head()

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [53]:
df

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [54]:
type(df)

pandas.core.frame.DataFrame

In [55]:
df.axes

[RangeIndex(start=0, stop=3, step=1),
 Index(['var1', 'var2', 'var3'], dtype='object')]

**axes bize satir ve sutun bilgilerini vermis oluyor**

In [56]:
df.shape

(3, 3)

**bize boyut bilgisini verdi**

In [57]:
df.ndim

2

**pandas iki boyutlu oildugundan dolayi nezaman sorgulasak bize iki  boyut bilgisini verecektir**

In [58]:
df.size

9

**bunu yazincada bize eleman sayinsi vermis oldu**

In [59]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [60]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

**sadece bunu degerlerin degiskenlerine erismek isersek bunu aldi bi numpy array formatina cevirdi**

In [61]:
type(df.values)

numpy.ndarray

In [62]:
df.head()

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


In [63]:
df.tail()

Unnamed: 0,var1,var2,var3
0,1,2,3
1,4,5,6
2,7,8,9


**bastan ve sondan gozlemlere erismek istedigimizde bu ifaderleri kullaniyoruz**

In [64]:
a = np.array([1,2,3,4,5]) 

In [65]:
pd.DataFrame(a,columns = ['deg1'])

Unnamed: 0,deg1
0,1
1,2
2,3
3,4
4,5


**Eleman Islemleri**

In [66]:
import numpy as np
s1 = np.random.randint(10, size = 5)
s2 = np.random.randint(10, size = 5)
s3 = np.random.randint(10, size = 5)

In [67]:
sozluk = {'var1': s1, 'var2': s2, 'var3': s3}

In [68]:
sozluk

{'var1': array([2, 2, 6, 9, 9]),
 'var2': array([9, 6, 2, 3, 3]),
 'var3': array([7, 4, 9, 7, 0])}

**degerlendirme olustu sol taraftakiler "var" key degerleri, sag tarafata ise genel degertleri goruyorduk ama bunlar numpy array oldugu icin boylr oldu**

In [69]:
df = pd.DataFrame(sozluk)

In [70]:
df

Unnamed: 0,var1,var2,var3
0,2,9,7
1,2,6,4
2,6,2,9
3,9,3,7
4,9,3,0


In [71]:
df[0:1]

Unnamed: 0,var1,var2,var3
0,2,9,7


In [72]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [73]:
df.index = ["a","b","c","d","e"]

In [74]:
df

Unnamed: 0,var1,var2,var3
a,2,9,7
b,2,6,4
c,6,2,9
d,9,3,7
e,9,3,0


In [75]:
df['c':'e']

Unnamed: 0,var1,var2,var3
c,6,2,9
d,9,3,7
e,9,3,0


In [76]:
#silme


In [77]:
df.drop('a', axis = 0)

Unnamed: 0,var1,var2,var3
b,2,6,4
c,6,2,9
d,9,3,7
e,9,3,0


**silme islemi bu sekilde gerceklesiyor**

In [78]:
df

Unnamed: 0,var1,var2,var3
a,2,9,7
b,2,6,4
c,6,2,9
d,9,3,7
e,9,3,0


**gordumuz gibi veri setinin ana yapisi uzerinde bi degisiklik olmadi**

In [79]:
df.drop("a", axis = 0, inplace = True)

In [80]:
df

Unnamed: 0,var1,var2,var3
b,2,6,4
c,6,2,9
d,9,3,7
e,9,3,0


**ancak inplace formatinda yazarsak kalici olarak kaldirmis oluyoruz**

In [81]:
#fancy

In [82]:
l = ["c","e"]

In [83]:
df.drop(l, axis = 0)

Unnamed: 0,var1,var2,var3
b,2,6,4
d,9,3,7


**fancy ile birlikte ikisini ayni anda silmis olduk**

In [84]:
#degiskenler icin

In [85]:
df

Unnamed: 0,var1,var2,var3
b,2,6,4
c,6,2,9
d,9,3,7
e,9,3,0


In [86]:
"var1" in df

True

In [87]:
"var0" in df

False

In [88]:
"var0234234234234234" in df

False

In [89]:
l = ['var1',' var2','var3']

In [90]:
for i in l:
    print(i in df)

True
False
True


**index bazinda bi sorgulama yapmistik, simdi ise sutun bazinda bi sorgulama yaptik ve 'var4' isimli birsey olmadigindan false dedi**

In [92]:
df

Unnamed: 0,var1,var2,var3
b,2,6,4
c,6,2,9
d,9,3,7
e,9,3,0


In [93]:
df['var1']

b    2
c    6
d    9
e    9
Name: var1, dtype: int32

In [94]:
df['var4'] = df ['var1'] / df['var2']

In [95]:
df

Unnamed: 0,var1,var2,var3,var4
b,2,6,4,0.333333
c,6,2,9,3.0
d,9,3,7,3.0
e,9,3,0,3.0


In [96]:
#Degisken Silmek

In [97]:
df.drop('var4', axis = 1)

Unnamed: 0,var1,var2,var3
b,2,6,4
c,6,2,9
d,9,3,7
e,9,3,0


In [98]:
df

Unnamed: 0,var1,var2,var3,var4
b,2,6,4,0.333333
c,6,2,9,3.0
d,9,3,7,3.0
e,9,3,0,3.0


In [99]:
df.drop('var4' , axis = 1, inplace = True)

In [100]:
df

Unnamed: 0,var1,var2,var3
b,2,6,4
c,6,2,9
d,9,3,7
e,9,3,0


In [101]:
l = ['var1','var2']

In [102]:
df.drop(l, axis =1)

Unnamed: 0,var3
b,4
c,9
d,7
e,0


In [103]:
df

Unnamed: 0,var1,var2,var3
b,2,6,4
c,6,2,9
d,9,3,7
e,9,3,0


In [104]:
l =['var1']

In [105]:
df

Unnamed: 0,var1,var2,var3
b,2,6,4
c,6,2,9
d,9,3,7
e,9,3,0


In [106]:
df.drop(l, axis = 1)

Unnamed: 0,var2,var3
b,6,4
c,2,9
d,3,7
e,3,0


**ornekte goruldugu gibi df.drop(l, axis = 1) yazilarak siliniyor**

# Gozlem ve Degisken Secimi: loc&iloc

In [108]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30, size =(10,3))
df = pd.DataFrame(m, columns = ['var1','var2','var3'])
df

Unnamed: 0,var1,var2,var3
0,16,9,16
1,5,6,8
2,26,2,21
3,12,16,24
4,19,20,23
5,11,3,8
6,9,19,4
7,24,9,24
8,24,21,6
9,6,23,6


**pandas da yapilan en cok hatalarin basinda gozlem ve degiskenler gelir bunu 
sebebi bunun indexler anlamindaki liste ve numpy da alisik olmus seklin biraz daha disinda
biraz daha farkli kullaniyor olmasi**

In [111]:
#loc: tanimlandigi sekli ile secim yapmak icin kullanilir

In [112]:
df.loc[0:3]

Unnamed: 0,var1,var2,var3
0,16,9,16
1,5,6,8
2,26,2,21
3,12,16,24


**indexlemeye gore bize secim imkani veriyor**

In [114]:
#iloc: alisik oldugumuuz indeksleme mantigi ile secim yapar.

In [115]:
df.iloc[0:3]

Unnamed: 0,var1,var2,var3
0,16,9,16
1,5,6,8
2,26,2,21


**isle burada loc ile iloc un arasindaki farki goruyoruz***

In [117]:
df.iloc[0:3]

Unnamed: 0,var1,var2,var3
0,16,9,16
1,5,6,8
2,26,2,21


In [118]:
df.iloc[0,0]

16

In [123]:
df.iloc[:3, :2]

Unnamed: 0,var1,var2
0,16,9
1,5,6
2,26,2


In [125]:
df

Unnamed: 0,var1,var2,var3
0,16,9,16
1,5,6,8
2,26,2,21
3,12,16,24
4,19,20,23
5,11,3,8
6,9,19,4
7,24,9,24
8,24,21,6
9,6,23,6


In [124]:
df.loc[0:3,'var3']

0    16
1     8
2    21
3    24
Name: var3, dtype: int32

**loc ile boyle yaparak 0 dahil 3 kadar geldi ve sadece 'var3' u cagirdigimiz icin sadece var 3 gorduk**

**bura ya direk "df.iloc[0:3,'var3']" yazarsak hata aliriz
iloc ile loc u ayirt etmek icin, eger degisken ya da gozlem birimleri satirlarla ilgili
birsey yapicasak loc'u kullanmaiz gerek. eger ben bulara takilmiyorum indexli bi secim 
yapmak istiyorum derseniz iloc u kullana bilirsiniz**


In [None]:
**ama illaki ben index i iloc a gore gorucem derseniz df.loc[0:3,'var3'], iloc da boyle yazilir
df.iloc[0:3] ['var3'] boyle yazilir**

In [128]:
df.iloc[0:3] ['var3']

0    16
1     8
2    21
Name: var3, dtype: int32

# Kosullu Eleman Islemleri

In [130]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30, size =(10,3))
df = pd.DataFrame(m, columns = ['var1','var2','var3'])
df

Unnamed: 0,var1,var2,var3
0,27,28,2
1,15,2,17
2,27,2,6
3,22,16,29
4,4,9,15
5,19,17,25
6,5,8,8
7,15,14,10
8,13,10,5
9,3,15,15


In [134]:
df['var1']

0    27
1    15
2    27
3    22
4     4
5    19
6     5
7    15
8    13
9     3
Name: var1, dtype: int32

In [135]:
df[0:2]

Unnamed: 0,var1,var2,var3
0,27,28,2
1,15,2,17


In [136]:
df ['var1'] [0:2]

0    27
1    15
Name: var1, dtype: int32

**bu sekilde degisken secip cagirabiliyoruz**

In [138]:
df.var1

0    27
1    15
2    27
3    22
4     4
5    19
6     5
7    15
8    13
9     3
Name: var1, dtype: int32

In [141]:
df[df.var1 > 15]             #DataFrame icerisinde 15 den buyuk olan degerleri filtreleme islemi yaptik

Unnamed: 0,var1,var2,var3
0,27,28,2
2,27,2,6
3,22,16,29
5,19,17,25


In [142]:
df[df.var1 > 15] ['var2']  #Burada sadece var2 nin icinde 15 den buyukleri filtreledik

0    28
2     2
3    16
5    17
Name: var2, dtype: int32

In [144]:
df[(df.var1 > 15) & (df.var3 < 5)]      #Burada gozuktugu uzere birden fazla filtreleme islemi yaptik

Unnamed: 0,var1,var2,var3
0,27,28,2


In [150]:
df.loc[(df.var1 > 15), ['var1','var2']] 

Unnamed: 0,var1,var2
0,27,28
2,27,2
3,22,16
5,19,17


# Birlestirme (join) Islemleri

In [234]:
import numpy as np
import pandas as pd
m = np.random.randint(1,30, size =(5,3))
df1 = pd.DataFrame(m, columns = ['var1','var2','var3'])
df1

Unnamed: 0,var1,var2,var3
0,10,5,3
1,17,11,19
2,25,11,12
3,1,12,15
4,28,16,26


**yukaridaki data frame1 assaghiya yenisin create ediyoruz**

In [235]:
df2 = df1 + 99

In [236]:
df2

Unnamed: 0,var1,var2,var3
0,109,104,102
1,116,110,118
2,124,110,111
3,100,111,114
4,127,115,125


In [237]:
pd.concat([df1,df2])     #    Birlestirmek icin pd.concat yaziyoruz ama bi sorun oluyor
                         #   0 dan 4 kadar gidiyor sonra yine 0 dan 4 gidiyor

Unnamed: 0,var1,var2,var3
0,10,5,3
1,17,11,19
2,25,11,12
3,1,12,15
4,28,16,26
0,109,104,102
1,116,110,118
2,124,110,111
3,100,111,114
4,127,115,125


In [238]:
pd.concat([df1,df2])

Unnamed: 0,var1,var2,var3
0,10,5,3
1,17,11,19
2,25,11,12
3,1,12,15
4,28,16,26
0,109,104,102
1,116,110,118
2,124,110,111
3,100,111,114
4,127,115,125


In [239]:
pd.concat([df1,df2], ignore_index=False)  # Yapilincada problem devam ediyor ama 'false' yerine 'true' yazarsak sorun olmuyor


Unnamed: 0,var1,var2,var3
0,10,5,3
1,17,11,19
2,25,11,12
3,1,12,15
4,28,16,26
0,109,104,102
1,116,110,118
2,124,110,111
3,100,111,114
4,127,115,125


In [240]:
pd.concat([df1,df2], ignore_index=True)

Unnamed: 0,var1,var2,var3
0,10,5,3
1,17,11,19
2,25,11,12
3,1,12,15
4,28,16,26
5,109,104,102
6,116,110,118
7,124,110,111
8,100,111,114
9,127,115,125


In [241]:
df1.columns                  # DataFrame lere boyle ulasiyoruz

Index(['var1', 'var2', 'var3'], dtype='object')

In [242]:
df2.columns = ['var1','var2','deg3']

In [243]:
df2

Unnamed: 0,var1,var2,deg3
0,109,104,102
1,116,110,118
2,124,110,111
3,100,111,114
4,127,115,125


In [244]:
df1

Unnamed: 0,var1,var2,var3
0,10,5,3
1,17,11,19
2,25,11,12
3,1,12,15
4,28,16,26


In [245]:
pd.concat([df1, df2])

Unnamed: 0,var1,var2,var3,deg3
0,10,5,3.0,
1,17,11,19.0,
2,25,11,12.0,
3,1,12,15.0,
4,28,16,26.0,
0,109,104,,102.0
1,116,110,,118.0
2,124,110,,111.0
3,100,111,,114.0
4,127,115,,125.0


In [246]:
pd.concat([df1, df2], join ='inner')  # Burada islemleri biirlestirmis oluyoruz, neye gore birlestirmis oluyoruz kesimleri birlesti

Unnamed: 0,var1,var2
0,10,5
1,17,11
2,25,11
3,1,12
4,28,16
0,109,104
1,116,110
2,124,110
3,100,111
4,127,115


In [258]:
pd.concat([df1, df2] , names = [df1.columns] )

Unnamed: 0,var1,var2,var3,deg3
0,10,5,3.0,
1,17,11,19.0,
2,25,11,12.0,
3,1,12,15.0,
4,28,16,26.0,
0,109,104,,102.0
1,116,110,,118.0
2,124,110,,111.0
3,100,111,,114.0
4,127,115,,125.0


In [224]:
# Ileri Birlestirme Islemleri

In [269]:
import pandas as pd

In [270]:
#birebir birlestirme

In [271]:
df1 = pd.DataFrame({'calisanlar': ['Ali','Veli', 'Ayse','Fatma'],
                    'grup':['Muhasebe', 'Muhendislik', 'Muhendislik', 'iK']})
df1

Unnamed: 0,calisanlar,grup
0,Ali,Muhasebe
1,Veli,Muhendislik
2,Ayse,Muhendislik
3,Fatma,iK


In [272]:
df2 = pd.DataFrame({'calisanlar': ['Ali','Veli', 'Ayse','Fatma'],
                    'ilk_giris':['2010', '2009','2014', '2019']})
df2

Unnamed: 0,calisanlar,ilk_giris
0,Ali,2010
1,Veli,2009
2,Ayse,2014
3,Fatma,2019


In [275]:
pd.merge(df1, df2)

Unnamed: 0,calisanlar,grup,ilk_giris
0,Ali,Muhasebe,2010
1,Veli,Muhendislik,2009
2,Ayse,Muhendislik,2014
3,Fatma,iK,2019


In [276]:
pd.merge(df1, df2, on = 'calisanlar')

Unnamed: 0,calisanlar,grup,ilk_giris
0,Ali,Muhasebe,2010
1,Veli,Muhendislik,2009
2,Ayse,Muhendislik,2014
3,Fatma,iK,2019


In [277]:
#coktan teke

In [282]:
df3 = pd.merge(df1,df2)

In [283]:
df3

Unnamed: 0,calisanlar,grup,ilk_giris
0,Ali,Muhasebe,2010
1,Veli,Muhendislik,2009
2,Ayse,Muhendislik,2014
3,Fatma,iK,2019


In [286]:
df4 = pd.DataFrame({'grup':['Muhasebe', 'Muhendislik', 'iK'],
                    'mudur':['caner', 'Musta', 'berkecan',]})
df4

Unnamed: 0,grup,mudur
0,Muhasebe,caner
1,Muhendislik,Musta
2,iK,berkecan


In [287]:
pd.merge(df3, df4)

Unnamed: 0,calisanlar,grup,ilk_giris,mudur
0,Ali,Muhasebe,2010,caner
1,Veli,Muhendislik,2009,Musta
2,Ayse,Muhendislik,2014,Musta
3,Fatma,iK,2019,berkecan


In [288]:
# coktan coka

In [290]:
df5 = pd.DataFrame({'grup' : ['Muhasebe','Muhasebe',
                              'Muhendislik','Muhendislik','iK','iK'],
                    'yetenekler':['matematik', 'excel','kodlama','linux','excel','yonetim']})
df5

Unnamed: 0,grup,yetenekler
0,Muhasebe,matematik
1,Muhasebe,excel
2,Muhendislik,kodlama
3,Muhendislik,linux
4,iK,excel
5,iK,yonetim


In [291]:
df1

Unnamed: 0,calisanlar,grup
0,Ali,Muhasebe
1,Veli,Muhendislik
2,Ayse,Muhendislik
3,Fatma,iK


In [292]:
pd.merge(df1, df5)                # gordugunuz gibi df1 ile df5 cokladi ve buradaki coklama bir sorun degil

Unnamed: 0,calisanlar,grup,yetenekler
0,Ali,Muhasebe,matematik
1,Ali,Muhasebe,excel
2,Veli,Muhendislik,kodlama
3,Veli,Muhendislik,linux
4,Ayse,Muhendislik,kodlama
5,Ayse,Muhendislik,linux
6,Fatma,iK,excel
7,Fatma,iK,yonetim


In [None]:
# Toplulastirma ve Gruplastirma (Aggregation & Grouping)
Basit toplulastirma fonksiyonlar
- count()
- first()
- last ()
- mean ()
- median()
- min()
- max()
- std()
- var()
- sum()



In [296]:
import seaborn as sns

In [299]:
df = sns.load_dataset('planets')
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [300]:
df.shape

(1035, 6)

In [302]:
df['mass'].mean()

2.6381605847953233

In [303]:
df['mass'].max()

25.0

In [304]:
df['mass'].std()

3.8186166509616046

In [305]:
df['mass'].var()

14.58183312700122

In [307]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number,1035.0,1.785507,1.240976,1.0,1.0,1.0,2.0,7.0
orbital_period,992.0,2002.917596,26014.728304,0.090706,5.44254,39.9795,526.005,730000.0
mass,513.0,2.638161,3.818617,0.0036,0.229,1.26,3.04,25.0
distance,808.0,264.069282,733.116493,1.35,32.56,55.25,178.5,8500.0
year,1035.0,2009.070531,3.972567,1989.0,2007.0,2010.0,2012.0,2014.0


In [308]:
df.dropna().describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
number,498.0,1.73494,1.17572,1.0,1.0,1.0,2.0,6.0
orbital_period,498.0,835.778671,1469.128259,1.3283,38.27225,357.0,999.6,17337.5
mass,498.0,2.50932,3.636274,0.0036,0.2125,1.245,2.8675,25.0
distance,498.0,52.068213,46.596041,1.35,24.4975,39.94,59.3325,354.0
year,498.0,2007.37751,4.167284,1989.0,2005.0,2009.0,2011.0,2014.0


# Gruplama Islemleri

In [316]:
df = pd.DataFrame ({'gruplar' : ['A','B','C','A','B','C'],
                    'veri': [10,11,52,23,43,55]}, columns=['gruplar','veri'])
df  

# Mesela a,b.c sinif ve veri de puanlari boyle siralamis oluyoruz

Unnamed: 0,gruplar,veri
0,A,10
1,B,11
2,C,52
3,A,23
4,B,43
5,C,55


In [317]:
df.groupby('gruplar')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E93E77C6A0>

In [318]:
df.groupby('gruplar').mean()      # Bu sekilde siniflarin ortalmasini aliyoruz

Unnamed: 0_level_0,veri
gruplar,Unnamed: 1_level_1
A,16.5
B,27.0
C,53.5


In [319]:
df.groupby('gruplar').sum()    # bu sekilde farkli aggregate fonksiyonlarini kullanabiliyoruz

Unnamed: 0_level_0,veri
gruplar,Unnamed: 1_level_1
A,33
B,54
C,107


In [320]:
df = sns.load_dataset('planets')
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [321]:
df.groupby('method')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E93E77CF40>

In [331]:
df.groupby('method')['orbital_period'].mean()

method
Astrometry                          631.180000
Eclipse Timing Variations          4751.644444
Imaging                          118247.737500
Microlensing                       3153.571429
Orbital Brightness Modulation         0.709307
Pulsar Timing                      7343.021201
Pulsation Timing Variations        1170.000000
Radial Velocity                     823.354680
Transit                              21.102073
Transit Timing Variations            79.783500
Name: orbital_period, dtype: float64

In [None]:
**Simdi once "df = sns.load_dataset('planets') df.head()" yazdik ve data setini olusturduk
sonra sunu  yazarak "df.groupby('method')" metot a gore grup yapacagiz bu yuzden
methodu tut dedim sonra bir suru degisken var ve degiskeni yazip onuda tut diyoruz
"['orbital_period']" tamam degisken cepte, neye gore gruplanacagi cepte "mean" yazip ortamasini alabilirz
yani ilk basta yazdigim tum aggrregate yazabiliriz

# Ileri toplulastirma islemleri (Aggregate, filter, transform , apply)

In [359]:
import pandas as pd
df = pd.DataFrame({'gruplar' : ['A', 'B','C','A','B','C'],
                    'degisken1': [10,23,33,22,11,99],
                    'degisken2': [100,253,333,262,111,969]},
                   columns = ['gruplar', 'degisken1','degisken2'])
df                   

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


In [360]:
df.groupby('gruplar').mean()

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,16,181
B,17,182
C,66,651


In [361]:
#aggregate

In [363]:
df.groupby('gruplar').mean()

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,16,181
B,17,182
C,66,651


In [367]:
df.groupby('gruplar').aggregate(['min', np.median, max ])

Unnamed: 0_level_0,degisken1,degisken1,degisken1,degisken2,degisken2,degisken2
Unnamed: 0_level_1,min,median,max,min,median,max
gruplar,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,10,16,22,100,181,262
B,11,17,23,111,182,253
C,33,66,99,333,651,969


In [372]:
df.groupby('gruplar').aggregate({'degisken1': 'min','degisken2': 'max'})


Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,10,262
B,11,253
C,33,969


# Filter 

In [1]:
import pandas as pd
df = pd.DataFrame({'gruplar' : ['A', 'B','C','A','B','C'],
                    'degisken1': [10,23,33,22,11,99],
                    'degisken2': [100,253,333,262,111,969]},
                   columns = ['gruplar', 'degisken1','degisken2'])
df   

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


In [2]:
def filter_func(x):
    return x['degisken1'].std()>9

In [3]:
df.groupby('gruplar').std ()   # standart sapma hesapladik, 

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,8.485281,114.551299
B,8.485281,100.409163
C,46.669048,449.719913


In [4]:
df.groupby('gruplar').filter(filter_func) 

Unnamed: 0,gruplar,degisken1,degisken2
2,C,33,333
5,C,99,969


In [5]:
#transform  #cok iyi anladigim soylenemez

In [7]:
import pandas as pd
df = pd.DataFrame({'gruplar' : ['A', 'B','C','A','B','C'],
                    'degisken1': [10,23,33,22,11,99],
                    'degisken2': [100,253,333,262,111,969]},
                   columns = ['gruplar', 'degisken1','degisken2'])
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


In [8]:
df['degisken1']*9

0     90
1    207
2    297
3    198
4     99
5    891
Name: degisken1, dtype: int64

In [21]:
df_a = df.iloc[:,1:3]

In [24]:
df_a.transform(lambda x: x-x.mean())       # transform fonksiyonunu belirledigimiz bir donusturme fonksiyonu olan
                                           # degiskenlerin uzerinde kullanacagimi ifade ettik ama bun dataframe e gonderdigimizde
                                           # yine hata aldik ve iloc yazarak duzelttikk

Unnamed: 0,degisken1,degisken2
0,-23.0,-238.0
1,-10.0,-85.0
2,0.0,-5.0
3,-11.0,-76.0
4,-22.0,-227.0
5,66.0,631.0


In [29]:
df_a.transform(lambda x: (x-x.mean()) / x.std())  

Unnamed: 0,degisken1,degisken2
0,-0.687871,-0.738461
1,-0.299074,-0.263736
2,0.0,-0.015514
3,-0.328982,-0.235811
4,-0.657963,-0.704331
5,1.97389,1.957853


In [30]:
#Apply

In [45]:
import pandas as pd
df = pd.DataFrame({
                    'degisken1': [10,23,33,22,11,99],
                    'degisken2': [100,253,333,262,111,969]},
                   columns = [ 'degisken1','degisken2'])
df

Unnamed: 0,degisken1,degisken2
0,10,100
1,23,253
2,33,333
3,22,262
4,11,111
5,99,969


In [47]:
import numpy as np

In [46]:
df.apply(np.sum)

degisken1     198
degisken2    2028
dtype: int64

In [43]:
df.apply(np.sum)

degisken1     198
degisken2    2028
dtype: int64

In [48]:
df.apply(np.mean)

degisken1     33.0
degisken2    338.0
dtype: float64

In [49]:
#Pivot Tablolar

In [51]:
import pandas as pd
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [52]:
titanic.groupby('sex')['survived'].mean()

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [54]:
titanic.groupby('sex')[['survived']].mean()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [55]:
titanic.groupby('sex')[['survived']].aggregate('mean').unstack()

          sex   
survived  female    0.742038
          male      0.188908
dtype: float64

In [57]:
titanic.groupby(['sex','survived']).aggregate('mean').unstack() #ustteki ile farkini gor diye bun  yazdim

Unnamed: 0_level_0,pclass,pclass,age,age,sibsp,sibsp,parch,parch,fare,fare,adult_male,adult_male,alone,alone
survived,0,1,0,1,0,1,0,1,0,1,0,1,0,1
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
female,2.851852,1.918455,25.046875,28.847716,1.209877,0.515021,1.037037,0.515021,23.024385,51.938573,0.0,0.0,0.333333,0.424893
male,2.476496,2.018349,31.618056,27.276022,0.440171,0.385321,0.207265,0.357798,21.960993,40.821484,0.959402,0.807339,0.741453,0.587156


In [58]:
titanic.groupby(['sex','class'])[['survived']].aggregate('mean').unstack()

Unnamed: 0_level_0,survived,survived,survived
class,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [None]:
# ustteki islemde biz group by araciligi ile bir pivot islemi yaptik yan veri setini x ekseninde cinsiyet katogarik degisimi
# ve y ekseninde class katagorik degiminin siniflaria gore bolup kesisimlerine birseyler ifade ettik
# mesela first class da female olanalrin %96 kurtulmus, second classda %92 kurtulmus gibi
# yani pivot ile biz bunlarin kesisimlerini tablolamis olduk, titanicden kurtulanlarin

# titanic.groupby(['sex','class'])[['survived']]   .aggregate('mean')   .unstack() code aslinda boyle cok zor gibi gozukmesin  
 

In [59]:
#pivot ile table

In [60]:
titanic.pivot_table('survived', index ='sex', columns ='class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [61]:
titanic.age.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

In [62]:
age = pd.cut(titanic ['age'], [0.18,90])
age.head(10)

0    (0.18, 90.0]
1    (0.18, 90.0]
2    (0.18, 90.0]
3    (0.18, 90.0]
4    (0.18, 90.0]
5             NaN
6    (0.18, 90.0]
7    (0.18, 90.0]
8    (0.18, 90.0]
9    (0.18, 90.0]
Name: age, dtype: category
Categories (1, interval[float64]): [(0.18, 90.0]]

In [63]:
titanic.pivot_table('survived', ['sex',age], 'class')

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0.18, 90.0]",0.964706,0.918919,0.460784
male,"(0.18, 90.0]",0.39604,0.151515,0.150198


In [None]:
# Fark ettiysen age i tirnak icine almadik, bu sekilde yaprsak mevcut yapisini koruyarak cikarmis oluyor
# sex de tirnak isaretini kaldirisak bu veri setinin icerisinde yer almadigindan dolayi hata aliyoruz farki bu!
