In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.__version__

'1.0.5'

## pd.Series

In [3]:
# Create a Series with an explicit index.
# The index contains a duplicated label ('b') and an integer label (7) among
# the strings, so label-based lookups on 'b' match more than one entry.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.

my_series = pd.Series(
    [1, 1, 2, 3, np.nan, 8, 13],
    index=['a', 'b', 'b', 'd', 'e', 'f', 7]
)
my_series

a     1.0
b     1.0
b     2.0
d     3.0
e     NaN
f     8.0
7    13.0
dtype: float64

In [7]:
type(my_series)

pandas.core.series.Series

In [None]:
# Create a second Series with unique string labels 'a'..'g'.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.

my_second_series = pd.Series(
    [13, 8, np.nan, 3, 2, 1, 1],
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g']
)
my_second_series

In [None]:
my_series.index

In [None]:
my_series.values

In [None]:
my_series.shape

In [None]:
my_series['b']

In [None]:
# Get slices by position

my_series.iloc[0:6:2]

In [None]:
# Get value by index label.
# Note: 'b' appears twice in the index of my_series, so this lookup matches
# more than one entry and does not yield a single scalar.
# NOTE(review): the exact return type for duplicated labels depends on the
# pandas version - confirm.

my_series.at['b']

In [None]:
# Get value by position

my_series.iat[6]

In [None]:
# Check whether values are contained in Series

my_series.isin([3, 8])

In [None]:
# Boolean indexing

my_series[my_series > my_series.mean()]

### Operations

In [None]:
%%timeit

# Check summation of 2 series

my_series + my_second_series

In [None]:
%%timeit

# Check summation of 2 series

my_series.add(my_second_series)

In [None]:
# The `+` operator aligns the two Series on their index labels; a label that
# is present in only one of them produces NaN in the result.

my_series + my_second_series

In [None]:
# The method form accepts extra options: with fill_value=0 a value that is
# missing on one side is treated as 0, so a label that exists in only one
# Series keeps its value instead of becoming NaN.

my_series.add(my_second_series, fill_value=0)

In [None]:
# Check that statistical methods automatically exclude missing data.
# .sum(), .mean() and .count() skip NaN, whereas len() counts every entry,
# so dividing the sum by len() gives a smaller number than .mean() whenever
# the Series contains NaN.

sum_elements = my_series.sum()     # NaN entries are skipped
count_elements = len(my_series)    # includes the NaN entry
print(f"Mean: {my_series.mean()}")
print(f"Sum of elements divided by count of elements: {sum_elements/count_elements}")
# Dividing by the number of non-missing entries reproduces .mean().
print(f"Sum of elements divided by count of non-missing elements: {sum_elements/my_series.count()}")

In [None]:
# Map function

def func_10(x):
    """Return ``x`` increased by 10 (works element-wise on pandas objects)."""
    return x + 10

my_series.map(func_10)

## pd.DataFrame

### Intro

In [None]:
df1 = pd.DataFrame({'country': ['Kazakhstan', 'Russia', 'Belarus', 'Ukraine'],
                    'population': [17.04, 143.5, 9.5, 45.5],
                    'square': [2724902, 17125191, 207600, 603628]},
                   index=['KZ', 'RU', 'BY', 'UA']
)
df1

In [None]:
# Build a 5x5 frame of uniform random numbers in [0, 1).
# A locally seeded generator makes the values identical on every
# Restart & Run All without touching NumPy's global random state.
df2 = pd.DataFrame(
    data=np.random.RandomState(42).rand(5, 5),
    columns=['A', 'B', 'C', 'D', 'F']
)
df2

In [None]:
# Select one column

df1['country']

In [None]:
# Multi-columns selection

df1[['country', 'square']]

In [None]:
# Slices the rows

df1[0:2]

In [None]:
# Select all rows for some columns

df1.loc[:, ['country', 'square']]

In [None]:
%%timeit

# Select scalar

df1.loc['KZ', 'country']

In [None]:
%%timeit

# Select scalar

df1.at['KZ', 'country']

In [None]:
%%timeit

# Select scalar

df1.iloc[0, 0]

In [None]:
%%timeit

# Select scalar

df1.iat[0, 0]

In [None]:
# Select one row

df1.iloc[3]

In [None]:
# Select some rows and some columns

df1.iloc[0:3, 0:2]

### Operations

In [None]:
# Column arithmetic on a DataFrame behaves like Series arithmetic: `+`
# propagates NaN, while .add(..., fill_value=0) treats a value missing on one
# side as 0.
# NOTE: the assignment below modifies df2 in place, so every later cell that
# uses df2 sees the NaN in row 0 of column 'A'.

df2.loc[0, 'A'] = np.nan  # np.nan is the canonical spelling (np.NaN was removed in NumPy 2.0)
print(df2['A'] + df2['B'])
print(' ')
print(df2['A'].add(df2['B'], fill_value=0))

In [None]:
# Apply-function is the same with map for Series. Results are not inplaced.

df2.apply(func_10)

In [None]:
%%timeit

df2.apply(lambda x:round(x))

In [None]:
%%timeit

df2.round()

### Analysis

In [None]:
# Read the semicolon-separated CSV file into a DataFrame.
# The path is relative, so 'student-mat.csv' is looked up in the current
# working directory.

df = pd.read_csv('student-mat.csv', sep=';')

In [None]:
# Size of df

df.shape

In [None]:
df.info()

In [None]:
df.describe(include='all')

We can change the number of visible columns and rows:

In [None]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 20)

In [None]:
df.describe(include='all')

In [None]:
# Select rows where sex is 'F'

df[(df['sex'] == 'F')]

In [None]:
# Select rows where sex is F and Mjob is equal to "at_home"

df[(df['sex'] == 'F') & (df['Mjob'] == 'at_home')]

In [None]:
# Check all variants for Mjob

df['Mjob'].value_counts()

In [None]:
# Select rows where Mjob is "at_home" or "teacher"

interesting_jobs = ['at_home', 'teacher']

df[df['Mjob'].isin(interesting_jobs)]

### Concat

In [None]:
# Build three example tables with the same columns (A-D) and consecutive,
# non-overlapping integer indexes: df1 -> 0..3, df2 -> 4..7, df3 -> 8..11.
# Every cell holds its column letter followed by its row label, e.g. 'C10'.

df1, df2, df3 = (
    pd.DataFrame(
        {col: [f'{col}{row}' for row in rows] for col in 'ABCD'},
        index=rows,
    )
    for rows in ([0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11])
)

In [None]:
result = pd.concat([df1, df2, df3])

In [None]:
result

In [None]:
# Passing keys adds an outer index level ('x', 'y', 'z') that labels which
# input frame each row came from, so all rows of one source frame can be
# selected by its key.

result = pd.concat([df1, df2, df3], keys=['x', 'y', 'z'])

In [None]:
result

In [None]:
result.loc['y']

In [None]:
# Another example frame, given row by row: columns B, D and F on the row
# labels 2, 3, 6 and 7; each cell is the column letter plus the row label.

df4 = pd.DataFrame(
    [[f'{col}{row}' for col in 'BDF'] for row in (2, 3, 6, 7)],
    columns=['B', 'D', 'F'],
    index=[2, 3, 6, 7],
)

In [None]:
# Outer join along the columns (axis=1): rows are aligned on the index and
# every label found in either frame is kept; cells without a source value
# are filled with NaN.

result = pd.concat([df1, df4], axis=1, join='outer', sort=False)
result

In [None]:
# Inner join along the columns (axis=1): only the index labels that are
# present in both frames are kept.

result = pd.concat([df1, df4], axis=1, join='inner', sort=False)
result

In [None]:
# Row-wise concatenation (the default axis): ignore_index=True discards the
# original row labels and numbers the result from 0. Columns that exist in
# only one of the frames are filled with NaN for the other frame's rows.

result = pd.concat([df1, df4], ignore_index=True, sort=False)
result

### Merge

In [None]:
# Build two frames that share the key columns 'key1' and 'key2'.
# The data is written record by record (one tuple per row).

left = pd.DataFrame(
    [
        ('K0', 'K0', 'A0', 'B0'),
        ('K0', 'K1', 'A1', 'B1'),
        ('K1', 'K0', 'A2', 'B2'),
        ('K2', 'K1', 'A3', 'B3'),
    ],
    columns=['key1', 'key2', 'A', 'B'],
)

right = pd.DataFrame(
    [
        ('K0', 'K0', 'C0', 'D0'),
        ('K1', 'K0', 'C1', 'D1'),
        ('K1', 'K0', 'C2', 'D2'),
        ('K2', 'K0', 'C3', 'D3'),
    ],
    columns=['key1', 'key2', 'C', 'D'],
)

In [None]:
# Merge on 2 keys with default parameters.
# Without a `how` argument pd.merge performs an inner join: only the
# (key1, key2) combinations present in both frames appear in the result.

result = pd.merge(left, right, on=['key1', 'key2'])
result

In [None]:
# LEFT merge

result = pd.merge(left, right, how='left', on=['key1', 'key2'])
print('Left')
result

In [None]:
# RIGHT merge

result = pd.merge(left, right, how='right', on=['key1', 'key2'])
print('Right')
result

### Join

In [None]:
# Build two frames indexed by key labels and combine them on that index.
# DataFrame.join keeps the rows of the calling (left) frame; columns coming
# from `right` are NaN for labels that `right` does not contain.

left = pd.DataFrame(
    [['A0', 'B0'], ['A1', 'B1'], ['A2', 'B2']],
    index=['K0', 'K1', 'K2'],
    columns=['A', 'B'],
)

right = pd.DataFrame(
    [['C0', 'D0'], ['C2', 'D2'], ['C3', 'D3']],
    index=['K0', 'K2', 'K3'],
    columns=['C', 'D'],
)

result = left.join(right, how='left')
result

### Groupby

In [None]:
# Back to our dataset

df.head(10)

In [None]:
# Calculate how many girls and boys have a romantic relationship

df.groupby(by=['sex', 'romantic'])['romantic'].count()

In [None]:
# Calculate what proportion of girls and boys have a romantic relationship.
# value_counts(normalize=True) gives the share of each answer within a group;
# .get('yes', 0.0) returns 0.0 instead of raising KeyError when a group
# contains no 'yes' answer at all.

df.groupby(by=['sex']).agg({'romantic': lambda x: x.value_counts(normalize=True).get('yes', 0.0)})

### pivot_table

In [None]:
# Average final grade (G3) by romantic relationship and Internet
# availability (rows), split by sex (columns).

p_table = pd.pivot_table(
    df,
    index=['romantic', 'internet'],
    columns='sex',
    values='G3',
    aggfunc='mean',
)

p_table

### MultiIndex

In [None]:
# Example pivot table with a two-level row index (Mjob, paid): mean G3 per
# group, one column per sex. This replaces the previous p_table.

p_table = pd.pivot_table(
    df,
    index=['Mjob', 'paid'],
    columns='sex',
    values='G3',
    aggfunc='mean',
)

p_table

In [None]:
# In general, we have 3-dimensional dataset. Look at indexes of pivot table

p_table.index

In [None]:
# Look at level names

p_table.index.names

In [None]:
# Getting values by column

p_table['F']

In [None]:
# You can play with grouping by changing the order of these functions

p_table.stack().swaplevel()

In [None]:
# Restack the table

p_table1 = p_table.unstack()

In [None]:
# Getting values by columns

p_table1['F']['no']

In [None]:
# Getting values by 1-level index

p_table.xs('at_home')

In [None]:
# Getting values by 1-level index

p_table.loc['at_home', :]

In [None]:
# Getting values by two levels index

p_table.loc[('at_home', 'yes'), :]

In [None]:
# Getting values by 2-level index

p_table.xs('yes', level='paid')

In [None]:
# Getting values by slicing both index levels with explicit slice objects:
#   first level (Mjob)  - labels from 'b' through 'other' (label slices
#                         include both end points),
#   second level (paid) - 'yes' only.
# NOTE(review): 'b' is presumably not an actual Mjob label; slicing with such
# a bound relies on the index being sorted - confirm.

p_table.loc[(slice('b','other'), slice('yes', 'yes')), :]

In [None]:
# Getting values by slicing with pd.IndexSlice, which allows the usual
# start:stop slice syntax inside .loc for a MultiIndex:
#   first level (Mjob)  - labels from 'at_home' through 'other',
#   second level (paid) - the list ['yes'].

idx = pd.IndexSlice

p_table.loc[idx['at_home':'other',['yes']], :]