# Pandas introduction

### Series

In [None]:
import numpy as np
import pandas as pd
# Display the installed pandas version, so readers know which release the recorded outputs come from.
pd.__version__

'1.3.5'

In [None]:
# Six sample values: the squares of 0..5, each divided by 5.
samples = [(k * k) / 5 for k in range(6)]
samples

[0.0, 0.2, 0.8, 1.8, 3.2, 5.0]

In [None]:
# Build a Series from a plain list; with no index given, pandas labels the values 0..5.
s1 = pd.Series(samples)
s1

0    0.0
1    0.2
2    0.8
3    1.8
4    3.2
5    5.0
dtype: float64

In [None]:
# Check the data type (it is inferred automatically).
s1.dtypes

dtype('float64')

In [None]:
# Confirm the Python type of the object that pd.Series(...) returned.
type(s1)

pandas.core.series.Series

In [None]:
# Six single-letter labels, used later as custom index labels.
alphabet = list("abcdef")
alphabet

['a', 'b', 'c', 'd', 'e', 'f']

In [None]:
# Same values as before, but with explicit index labels and a name for the Series.
s2 = pd.Series(data=samples, index=alphabet, name="Series_with_custom_indexes")
s2

a    0.0
b    0.2
c    0.8
d    1.8
e    3.2
f    5.0
Name: Series_with_custom_indexes, dtype: float64

In [None]:
# Undefined value (NaN) - it is not taken into account when computing the mean, etc.
np.nan

nan

In [None]:
# Overwrite two entries of the `samples` list with NaN. This mutates the list
# in place, so every Series built from `samples` after this cell contains the gaps.
samples[2] = np.nan
samples[4] = np.nan
s3 = pd.Series(data=samples, index=alphabet, name="Series_with_NaN")
s3

a    0.0
b    0.2
c    NaN
d    1.8
e    NaN
f    5.0
Name: Series_with_NaN, dtype: float64

In [None]:
# Generate a set of dates: six consecutive days starting on 2022-02-21.
bunch_of_dates = pd.date_range(
    start='20220221',
    periods=6,
    freq="D",  # daily frequency, spelled out explicitly
)
bunch_of_dates

DatetimeIndex(['2022-02-21', '2022-02-22', '2022-02-23', '2022-02-24',
               '2022-02-25', '2022-02-26'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# Use the generated dates as the index; `samples` already contains the NaN entries set earlier.
s4 = pd.Series(data=samples, index=bunch_of_dates, name="Series_with_date_as_index")
s4

2022-02-21    0.0
2022-02-22    0.2
2022-02-23    NaN
2022-02-24    1.8
2022-02-25    NaN
2022-02-26    5.0
Freq: D, Name: Series_with_date_as_index, dtype: float64

In [None]:
# Index labels of the Series (here a DatetimeIndex).
s4.index

DatetimeIndex(['2022-02-21', '2022-02-22', '2022-02-23', '2022-02-24',
               '2022-02-25', '2022-02-26'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# The data of the Series as a NumPy array, without the index.
s4.values

array([0. , 0.2, nan, 1.8, nan, 5. ])

In [None]:
# A Series is one-dimensional, so the shape is a 1-tuple holding its length.
s4.shape

(6,)

In [None]:
# Build a Series from a dictionary: the keys become the index labels,
# the values become the data.
s5 = pd.Series(
    data={"ala": 16, "ola": 22, "ula": 19, "kula": 314},
    name="Series_from_dictionary",
)
s5

ala      16
ola      22
ula      19
kula    314
Name: Series_from_dictionary, dtype: int64

In [None]:
# Look up a single value by its index label.
s5["ola"]

22

In [None]:
# Look up a single value by integer position. `.iloc` makes the positional
# intent explicit: plain `s5[1]` on a string-labelled Series relies on a
# positional fallback that newer pandas releases deprecate.
s5.iloc[1]

22

In [None]:
# Print a few aggregate statistics of s5, one per line.
for label, value in (
    ("s5 sum : ", s5.sum()),
    ("s5 max : ", s5.max()),
    ("s5 min : ", s5.min()),
    ("s5 std : ", s5.std()),
):
    print(label, value)

s5 sum :  371
s5 max :  314
s5 min :  16
s5 std :  147.52033758095865


In [None]:
# Summary statistics of s5; the rows labelled XX% are percentiles.
s5.describe()

count      4.000000
mean      92.750000
std      147.520338
min       16.000000
25%       18.250000
50%       20.500000
75%       95.000000
max      314.000000
Name: Series_from_dictionary, dtype: float64

In [None]:
# Add one more entry by concatenating s5 with a one-element Series.
# `Series.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the supported replacement and gives the same result here.
s6 = pd.concat([s5, pd.Series({"wiola": 21})])
s6

ala       16
ola       22
ula       19
kula     314
wiola     21
dtype: int64

In [None]:
# Sort by value in ascending order (the default); s6 itself is not modified.
s6.sort_values()

ala       16
ula       19
wiola     21
ola       22
kula     314
dtype: int64

In [None]:
# Rank of each value (1 = smallest), reported as floats in the original index order.
s6.rank()

ala      1.0
ola      4.0
ula      2.0
kula     5.0
wiola    3.0
dtype: float64

In [None]:
# Sort by value in descending order.
s6.sort_values(ascending=False)

kula     314
ola       22
wiola     21
ula       19
ala       16
dtype: int64

In [None]:
# The three largest values, largest first.
s6.nlargest(3)

kula     314
ola       22
wiola     21
dtype: int64

In [None]:
# The two smallest values, smallest first.
s6.nsmallest(2)

ala    16
ula    19
dtype: int64

In [None]:
def year_to_days(years):
    """Convert a number of years into an approximate number of days.

    Every year counts as 365 days, plus one additional day for each
    complete group of four years (``years // 4``).
    """
    return years * 365 + years // 4

# Call year_to_days on every element of s6 and collect the results in a new Series.
s7 = s6.apply(year_to_days)
s7

ala        5844
ola        8035
ula        6939
kula     114688
wiola      7670
dtype: int64

In [None]:
# Multiply every element of s6 by 12 using an anonymous function.
s8 = s6.apply(lambda value: value * 12)
s8

ala       192
ola       264
ula       228
kula     3768
wiola     252
dtype: int64

### DataFrame

In [None]:
# Smallest possible DataFrame: one unnamed column (labelled 0) built from a flat list.
df1 = pd.DataFrame(data=list(range(1, 5)))
df1

Unnamed: 0,0
0,1
1,2
2,3
3,4


In [None]:
# Same data, now with explicit row labels and a named column.
df2 = pd.DataFrame(
    data=[1, 2, 3, 4],
    index=['a', 'b', 'c', 'd'],
    columns=['col_1'],
)
df2

Unnamed: 0,col_1
a,1
b,2
c,3
d,4


In [None]:
# Build a DataFrame from a dictionary: each key becomes a column name and
# each list becomes that column's values. Note that every column mixes an
# integer and a boolean.
df3 = pd.DataFrame(
    data={
        "Kot": [4, True],
        "Pingwin": [2, False],
    }
)
df3

Unnamed: 0,Kot,Pingwin
0,4,2
1,True,False


In [None]:
# One column per car, one row per attribute. Each column therefore holds
# values of mixed kinds (two integers and a boolean).
cars = pd.DataFrame(
    data=[
        [1999, 2003, 2012],
        [256999, 421329, 120000],
        [True, True, False],
    ],
    index=["year", "milage", "no_accident"],
    columns=["seat_leon", "opel_vectra", "vw_golf"],
)
cars

Unnamed: 0,seat_leon,opel_vectra,vw_golf
year,1999,2003,2012
milage,256999,421329,120000
no_accident,True,True,False


In [None]:
# Attribute-style column access; the column comes back as a Series.
cars.vw_golf

year             2012
milage         120000
no_accident     False
Name: vw_golf, dtype: object

In [None]:
# Select a column as a pd.Series (single label in brackets).
cars["vw_golf"]

year             2012
milage         120000
no_accident     False
Name: vw_golf, dtype: object

In [None]:
# Select a column as a pd.DataFrame (list of labels in brackets).
cars[["vw_golf"]]

Unnamed: 0,vw_golf
year,2012
milage,120000
no_accident,False


In [None]:
# Row labels of the DataFrame.
cars.index

Index(['year', 'milage', 'no_accident'], dtype='object')

In [None]:
# The data as a 2-D NumPy array, without row or column labels.
cars.values

array([[1999, 2003, 2012],
       [256999, 421329, 120000],
       [True, True, False]], dtype=object)

In [None]:
# Column labels of the DataFrame.
cars.columns

Index(['seat_leon', 'opel_vectra', 'vw_golf'], dtype='object')

In [None]:
# Replace all column labels at once (this renames seat_leon to seat_ibiza).
# The assignment modifies `cars` in place and must list every column.
cars.columns = ['seat_ibiza', 'opel_vectra', 'vw_golf']
cars

Unnamed: 0,seat_ibiza,opel_vectra,vw_golf
year,1999,2003,2012
milage,256999,421329,120000
no_accident,True,True,False


In [None]:
# Add a new column; the values are given in row order (year, milage, no_accident).
cars["ford_mustang"] = [2023, 22309, True]
cars

Unnamed: 0,seat_ibiza,opel_vectra,vw_golf,ford_mustang
year,1999,2003,2012,2023
milage,256999,421329,120000,22309
no_accident,True,True,False,True


In [None]:
# Select a whole row by its label; the result is a Series indexed by column name.
cars.loc["year"]

seat_ibiza      1999
opel_vectra     2003
vw_golf         2012
ford_mustang    2023
Name: year, dtype: object

In [None]:
# Select a single cell by row label and column label.
cars.loc["year", "seat_ibiza"]

1999

In [None]:
# Select a whole column by label: `:` means "all rows".
cars.loc[:,"seat_ibiza"]

year             1999
milage         256999
no_accident      True
Name: seat_ibiza, dtype: object

In [None]:
# Like df.loc, but using integer positions instead of labels (row 1 is "milage").
cars.iloc[1]

seat_ibiza      256999
opel_vectra     421329
vw_golf         120000
ford_mustang     22309
Name: milage, dtype: object

In [None]:
# All rows of the first column, selected by position.
cars.iloc[:, 0]

year             1999
milage         256999
no_accident      True
Name: seat_ibiza, dtype: object

In [None]:
# Print a structural summary: index range, columns, non-null counts, dtypes and memory usage.
cars.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, year to no_accident
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   seat_ibiza    3 non-null      object
 1   opel_vectra   3 non-null      object
 2   vw_golf       3 non-null      object
 3   ford_mustang  3 non-null      object
dtypes: object(4)
memory usage: 228.0+ bytes


In [None]:
# Every column of `cars` has object dtype, so describe() reports
# count/unique/top/freq rather than numeric statistics such as mean or std.
cars.describe()

Unnamed: 0,seat_ibiza,opel_vectra,vw_golf,ford_mustang
count,3,3,3,3
unique,3,3,3,3
top,1999,2003,2012,2023
freq,1,1,1,1


In [None]:
# Transpose the summary so that each car becomes a row and each statistic a column.
cars.describe().T

Unnamed: 0,count,unique,top,freq
seat_ibiza,3,3,1999,1
opel_vectra,3,3,2003,1
vw_golf,3,3,2012,1
ford_mustang,3,3,2023,1
