In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
s1 = pd.Series({'b':3, 'd':2, 'a':1, 'c':4})

In [4]:
s1

b    3
d    2
a    1
c    4
dtype: int64

In [5]:
s1.array

<PandasArray>
[3, 2, 1, 4]
Length: 4, dtype: int64

In [6]:
s1.sort_index()

a    1
b    3
c    4
d    2
dtype: int64

In [7]:
s1

b    3
d    2
a    1
c    4
dtype: int64

In [8]:
s1[['a', 'c']]

a    1
c    4
dtype: int64

In [9]:
dict1 = {
            'ticker':['Apple', 'Google', 'Amazon', 'Tesla', 'Shell'],
            'value':[11,22,33,44,55]
        }

In [10]:
df = pd.DataFrame(dict1)

In [11]:
df

Unnamed: 0,ticker,value
0,Apple,11
1,Google,22
2,Amazon,33
3,Tesla,44
4,Shell,55


In [12]:
df = df.set_index('ticker')

In [13]:
df

Unnamed: 0_level_0,value
ticker,Unnamed: 1_level_1
Apple,11
Google,22
Amazon,33
Tesla,44
Shell,55


In [16]:
df.loc['Apple', :] # this returns results with datatype not clear

value    11
Name: Apple, dtype: int64

In [17]:
df.loc[['Apple'], :] # this returns results with datatype as DataFrame

Unnamed: 0_level_0,value
ticker,Unnamed: 1_level_1
Apple,11


In [None]:
# Hence, always preferable to use lists ['Apple'] when referencing rows in .loc as in the above example

In [18]:
df

Unnamed: 0_level_0,value
ticker,Unnamed: 1_level_1
Apple,11
Google,22
Amazon,33
Tesla,44
Shell,55


In [32]:
df1 = pd.DataFrame(np.random.randint(1,100,(3,4)), index=list('abc'), columns=list('pqrs'))

print(df1)

df1.loc['a'] # compare this with below

    p   q   r   s
a  63  17  68  23
b  10  45  21  91
c  10  32  82  60


p    63
q    17
r    68
s    23
Name: a, dtype: int32

In [29]:
df1.loc[['a']]

Unnamed: 0,p,q,r,s
a,18,5,39,39


In [30]:
df1.loc['a', 'p'] # compare this with below

18

In [31]:
df1.loc[['a'], ['q']]

Unnamed: 0,q
a,5


In [19]:
df.reset_index(level=0, inplace=True)

df

Unnamed: 0,ticker,value
0,Apple,11
1,Google,22
2,Amazon,33
3,Tesla,44
4,Shell,55


In [20]:
hq_series = pd.Series(['USA','Ireland','Seattle','New York','LA'], index=list('abcde'))

hq_series

a         USA
b     Ireland
c     Seattle
d    New York
e          LA
dtype: object

In [21]:
df['hq'] = hq_series

df # in this case none of the indices for values in hq_series matched with indices of df, hence 'NaN' was used in df

Unnamed: 0,ticker,value,hq
0,Apple,11,
1,Google,22,
2,Amazon,33,
3,Tesla,44,
4,Shell,55,


In [22]:
# In case you want to brute-force the series to be attached to the df, irrespective of mismatched indices, see below:

In [23]:
df['hq'] = hq_series.array # brute-forcing series to fit into df

df

Unnamed: 0,ticker,value,hq
0,Apple,11,USA
1,Google,22,Ireland
2,Amazon,33,Seattle
3,Tesla,44,New York
4,Shell,55,LA


#### Split, Apply, Combine

In [33]:
df = pd.read_csv('titanic.csv')

In [34]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [35]:
grouped = df.groupby('pclass')

grouped.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [36]:
grouped.mean()

Unnamed: 0_level_0,survived,age,sibsp,parch,fare,adult_male,alone
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0.62963,38.233441,0.416667,0.356481,84.154687,0.550926,0.50463
2,0.472826,29.87763,0.402174,0.380435,20.662183,0.538043,0.565217
3,0.242363,25.14062,0.615071,0.393075,13.67555,0.649695,0.659878


In [37]:
grouped['survived'].mean()

pclass
1    0.629630
2    0.472826
3    0.242363
Name: survived, dtype: float64

In [38]:
grouped[['survived']].mean()

Unnamed: 0_level_0,survived
pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


In [39]:
grouped[['age', 'fare']].mean()

Unnamed: 0_level_0,age,fare
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.233441,84.154687
2,29.87763,20.662183
3,25.14062,13.67555


In [40]:
grouped[['age']].agg([np.mean, np.median])

Unnamed: 0_level_0,age,age
Unnamed: 0_level_1,mean,median
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2
1,38.233441,37.0
2,29.87763,29.0
3,25.14062,24.0


In [41]:
grouped[['fare']].agg([np.mean, np.median])

Unnamed: 0_level_0,fare,fare
Unnamed: 0_level_1,mean,median
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2
1,84.154687,60.2875
2,20.662183,14.25
3,13.67555,8.05


In [42]:
grouped[['fare', 'age']].agg([np.mean, np.median])

Unnamed: 0_level_0,fare,fare,age,age
Unnamed: 0_level_1,mean,median,mean,median
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,84.154687,60.2875,38.233441,37.0
2,20.662183,14.25,29.87763,29.0
3,13.67555,8.05,25.14062,24.0


In [43]:
grouped.agg(
            mean_age = ('age', lambda x: x.mean()),
            std_fare = ('fare', np.std)
            )

Unnamed: 0_level_0,mean_age,std_fare
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.233441,78.380373
2,29.87763,13.417399
3,25.14062,11.778142


In [44]:
grouped2 = df.groupby(['pclass', 'sex'])

In [45]:
grouped2.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,survived,age,sibsp,parch,fare,adult_male,alone
pclass,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,female,0.968085,34.611765,0.553191,0.457447,106.125798,0.0,0.361702
1,male,0.368852,41.281386,0.311475,0.278689,67.226127,0.97541,0.614754
2,female,0.921053,28.722973,0.486842,0.605263,21.970121,0.0,0.421053
2,male,0.157407,30.740707,0.342593,0.222222,19.741782,0.916667,0.666667
3,female,0.5,21.75,0.895833,0.798611,16.11881,0.0,0.416667
3,male,0.135447,26.507589,0.498559,0.224784,12.661633,0.919308,0.760807


In [47]:
grouped2[['survived', 'age']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,survived,age
pclass,sex,Unnamed: 2_level_1,Unnamed: 3_level_1
1,female,0.968085,34.611765
1,male,0.368852,41.281386
2,female,0.921053,28.722973
2,male,0.157407,30.740707
3,female,0.5,21.75
3,male,0.135447,26.507589


#### Time Series

In [48]:
import datetime as dt

In [49]:
ts = pd.Timestamp('2020-05-05 18:00')

In [50]:
ts

Timestamp('2020-05-05 18:00:00')

In [51]:
type(ts)

pandas._libs.tslibs.timestamps.Timestamp

In [52]:
s1 = pd.Series(109, index=[ts])

In [53]:
s1

2020-05-05 18:00:00    109
dtype: int64

In [54]:
s1.index

DatetimeIndex(['2020-05-05 18:00:00'], dtype='datetime64[ns]', freq=None)

In [55]:
# Create a year's dates

dates = pd.date_range('2020-01-01', '2020-12-31', freq='B') # 'B' indicates Business Calendar

type(dates)

pandas.core.indexes.datetimes.DatetimeIndex

In [56]:
dates[0:5]

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-06',
               '2020-01-07'],
              dtype='datetime64[ns]', freq='B')

In [57]:
ts = pd.Series(np.random.randn(len(dates)), index=[dates])

In [58]:
type(ts)

pandas.core.series.Series

In [59]:
ts.head()

2020-01-01   -3.975908
2020-01-02    1.592756
2020-01-03   -0.481557
2020-01-06    0.817405
2020-01-07   -0.502653
dtype: float64

In [60]:
ts['2020-01-01']

2020-01-01   -3.975908
dtype: float64

In [61]:
ts.loc['2020-01-01']

2020-01-01   -3.975908
dtype: float64

In [62]:
ts.loc[['2020-01-01']]

2020-01-01   -3.975908
dtype: float64

In [65]:
ts.loc['2020-01-01':'2020-01-07']

2020-01-01   -3.975908
2020-01-02    1.592756
2020-01-03   -0.481557
2020-01-06    0.817405
2020-01-07   -0.502653
dtype: float64

In [66]:
ts.loc['2020-01'].head() # selecting by month

2020-01-01   -3.975908
2020-01-02    1.592756
2020-01-03   -0.481557
2020-01-06    0.817405
2020-01-07   -0.502653
dtype: float64

In [67]:
ts.loc['2020'].head() # selecting by year

2020-01-01   -3.975908
2020-01-02    1.592756
2020-01-03   -0.481557
2020-01-06    0.817405
2020-01-07   -0.502653
dtype: float64

In [68]:
# Performing lag / lead operations on time-series

In [None]:
# shifting by values

In [76]:
ts2 = ts.loc['2020-01'].copy()

In [77]:
type(ts)

pandas.core.series.Series

In [78]:
type(ts2)

pandas.core.series.Series

In [79]:
ts2[:5]

2020-01-01   -3.975908
2020-01-02    1.592756
2020-01-03   -0.481557
2020-01-06    0.817405
2020-01-07   -0.502653
dtype: float64

In [82]:
ts2.iloc[[0, 1, 2, -2, -1]]

2020-01-01   -3.975908
2020-01-02    1.592756
2020-01-03   -0.481557
2020-01-30   -1.405472
2020-01-31   -1.085213
dtype: float64

In [85]:
ts2.shift(1).iloc[[0, 1, 2, -2, -1]]

2020-01-01         NaN
2020-01-02   -3.975908
2020-01-03    1.592756
2020-01-30   -0.223787
2020-01-31   -1.405472
dtype: float64

In [86]:
ts2.shift(-1).iloc[[0, 1, 2, -2, -1]]

2020-01-01    1.592756
2020-01-02   -0.481557
2020-01-03    0.817405
2020-01-30   -1.085213
2020-01-31         NaN
dtype: float64

In [None]:
# shifting by index

In [None]:
ts2.tshift(2).iloc[[0, 1, 2, -2, -1]] # this is giving error

In [100]:
# re-sampling is similar to grouping

In [105]:
ts.head()

2020-01-01   -3.975908
2020-01-02    1.592756
2020-01-03   -0.481557
2020-01-06    0.817405
2020-01-07   -0.502653
dtype: float64

In [None]:
group = ts.resample('M') # giving error

In [107]:
# rolling calculations

In [108]:
roll = ts.rolling(2) # consider every 2 rolling observations

In [109]:
roll.mean() # calculate mean

2020-01-01         NaN
2020-01-02   -1.191576
2020-01-03    0.555599
2020-01-06    0.167924
2020-01-07    0.157376
                ...   
2020-12-25    0.626169
2020-12-28   -0.577985
2020-12-29   -0.264689
2020-12-30    0.731337
2020-12-31    0.965245
Length: 262, dtype: float64

In [112]:
roll.agg(['sum', 'mean'])

Unnamed: 0,sum,mean
2020-01-01,,
2020-01-02,-2.383152,-1.191576
2020-01-03,1.111199,0.555599
2020-01-06,0.335848,0.167924
2020-01-07,0.314752,0.157376
...,...,...
2020-12-25,1.252338,0.626169
2020-12-28,-1.155971,-0.577985
2020-12-29,-0.529378,-0.264689
2020-12-30,1.462674,0.731337


#### Pivot Tables

In [113]:
df = pd.read_csv('titanic.csv')

In [114]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [115]:
pd.pivot_table(df, index=['pclass','sex'], values='fare')

Unnamed: 0_level_0,Unnamed: 1_level_0,fare
pclass,sex,Unnamed: 2_level_1
1,female,106.125798
1,male,67.226127
2,female,21.970121
2,male,19.741782
3,female,16.11881
3,male,12.661633


In [116]:
# aggregate functions on pivot

pd.pivot_table(
                df, values='fare', index=['pclass'], columns=['sex'],
                aggfunc = lambda x: x.max()-x.mean()
                )

sex,female,male
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,406.203402,445.103073
2,43.029879,53.758218
3,53.43119,56.888367


In [117]:
pd.pivot_table(
                df, values='fare', index=['pclass'], columns=['sex'],
                aggfunc = [np.mean, np.std]
                )

Unnamed: 0_level_0,mean,mean,std,std
sex,female,male,female,male
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,106.125798,67.226127,74.259988,77.548021
2,21.970121,19.741782,10.891796,14.922235
3,16.11881,12.661633,11.690314,11.681696


In [122]:
# crosstab for categorical data

In [123]:
pd.crosstab(df['pclass'], df['sex']) # performs count

sex,female,male
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,94,122
2,76,108
3,144,347


In [133]:
pd.crosstab(df['pclass'], df['sex'], normalize=True, margins=True)

sex,female,male,All
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.105499,0.136925,0.242424
2,0.085297,0.121212,0.20651
3,0.161616,0.38945,0.551066
All,0.352413,0.647587,1.0


In [131]:
pd.crosstab(df['pclass'], [df['sex'],df['survived']])

sex,female,female,male,male
survived,0,1,0,1
pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,3,91,77,45
2,6,70,91,17
3,72,72,300,47
