In [2]:
import pandas as pd
import numpy as np

### Creating DataFrames 

In [22]:
# Each inner list is one row; the column labels are supplied separately.
data = [
    [100, 200],
    [300, 400],
]
df = pd.DataFrame(data, columns=["A", "B"])
df

Unnamed: 0,A,B
0,100,200
1,300,400


In [23]:
df.dtypes

A    int64
B    int64
dtype: object

In [26]:
# Same rows as before, but every column is forced to int32 at construction time.
df = pd.DataFrame(
    data,
    columns=["A", "B"],
    dtype=np.int32,
)
df

Unnamed: 0,A,B
0,100,200
1,300,400


In [28]:
df.dtypes

A    int32
B    int32
dtype: object

In [21]:
# Dict input: each key becomes a column label, each list that column's values.
df = pd.DataFrame({"A": [100, 200], "B": [300, 400]})
df

Unnamed: 0,A,B
0,100,300
1,200,400


In [33]:
pd.DataFrame(data=np.array([[2,3], [2,3]]), columns=['A', 'B'])

Unnamed: 0,A,B
0,2,3
1,2,3


### DataFrame.index

In [35]:
# Small mixed-type frame (two text columns, one integer column) used to explore index/dtype attributes.
df = pd.DataFrame(
    dict(
        Name=["Alice", "Bob", "Aritra"],
        Age=[25, 30, 35],
        Location=["Seattle", "New York", "Kona"],
    )
)
df

Unnamed: 0,Name,Age,Location
0,Alice,25,Seattle
1,Bob,30,New York
2,Aritra,35,Kona


In [36]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [39]:
# Assigning to .index relabels the rows of the existing frame in place:
# the labels 10, 20, 30 replace the default 0, 1, 2.
df.index = [10,20,30]
df

Unnamed: 0,Name,Age,Location
10,Alice,25,Seattle
20,Bob,30,New York
30,Aritra,35,Kona


In [40]:
df.columns

Index(['Name', 'Age', 'Location'], dtype='object')

In [41]:
df.dtypes

Name        object
Age          int64
Location    object
dtype: object

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 10 to 30
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      3 non-null      object
 1   Age       3 non-null      int64 
 2   Location  3 non-null      object
dtypes: int64(1), object(2)
memory usage: 96.0+ bytes


In [54]:
df.select_dtypes('object')

Unnamed: 0,Name,Location
10,Alice,Seattle
20,Bob,New York
30,Aritra,Kona


In [55]:
df.select_dtypes(include='int64')

Unnamed: 0,Age
10,25
20,30
30,35


In [56]:
# Three integer columns plus one text column, so .values has to fall back to an object array.
df = pd.DataFrame(
    dict(
        age=[3, 29],
        height=[94, 170],
        weight=[31, 115],
        name=["John", "Paul"],
    )
)
df

Unnamed: 0,age,height,weight,name
0,3,94,31,John
1,29,170,115,Paul


In [57]:
df.values

array([[3, 94, 31, 'John'],
       [29, 170, 115, 'Paul']], dtype=object)

In [58]:
df.values[0]

array([3, 94, 31, 'John'], dtype=object)

In [60]:
df.axes

[RangeIndex(start=0, stop=2, step=1),
 Index(['age', 'height', 'weight', 'name'], dtype='object')]

In [70]:
# ndim is 1 for a Series and 2 for a DataFrame; any other value prints nothing.
if df.ndim in (1, 2):
    print('Series' if df.ndim == 1 else 'DataFrame')

DataFrame


In [71]:
df.size

8

In [72]:
df.shape

(2, 4)

In [73]:
df

Unnamed: 0,age,height,weight,name
0,3,94,31,John
1,29,170,115,Paul


In [74]:
df.empty

False

### Changing types: DataFrame.astype vs pd.to_numeric

In [76]:
# One non-numeric entry ('twenty-seven') among the integers makes the whole column object dtype.
df = pd.DataFrame({"age": [1, 2, 40, 50, 27, "twenty-seven", 23]})
df

Unnamed: 0,age
0,1
1,2
2,40
3,50
4,27
5,twenty-seven
6,23


In [77]:
df.dtypes

age    object
dtype: object

In [83]:
# With errors='ignore' a failed cast is swallowed: 'twenty-seven' cannot become an int32,
# so the column comes back unconverted (still object, as the dtypes check below shows).
df2 = df.astype("int32", errors="ignore")
df2

Unnamed: 0,age
0,1
1,2
2,40
3,50
4,27
5,twenty-seven
6,23


In [85]:
df2.dtypes

age    object
dtype: object

In [86]:
# errors='coerce' turns the unparseable 'twenty-seven' into NaN, which is why the result is float64.
# The converted Series is only displayed here; nothing is assigned back, so df2['age'] stays object.
pd.to_numeric(df2['age'], errors='coerce')

0     1.0
1     2.0
2    40.0
3    50.0
4    27.0
5     NaN
6    23.0
Name: age, dtype: float64

In [88]:
df2 = df2.convert_dtypes()
df2.dtypes

age    object
dtype: object

### DataFrame.pipe

In [3]:
# Three rows of (Salary, Others); the middle row has no 'Others' value.
data = [
    [8000, 1000],
    [95, np.nan],
    [5000, 2000],
]
df = pd.DataFrame(data, columns=["Salary", "Others"])

In [4]:
df.head()

Unnamed: 0,Salary,Others
0,8000,1000.0
1,95,
2,5000,2000.0


In [16]:
def substract_tax(df):
    """Return ``df`` scaled to 77% of its values (i.e. with 23% taken off).

    ``df`` can be anything that supports multiplication by a float
    (the notebook passes a DataFrame through ``DataFrame.pipe``).
    """
    remaining_share = 0.77
    return df * remaining_share

def substract_customs(df):
    """Return ``df`` scaled to 95% of its values (i.e. with 5% taken off).

    ``df`` can be anything that supports multiplication by a float.
    """
    remaining_share = 0.95
    return df * remaining_share

def substract_custom_percentage(df, percentage):
    """Return ``df`` reduced by ``percentage`` of itself.

    ``percentage`` is used as a plain multiplier, so 0.01 removes 1% and
    0.25 removes a quarter.
    """
    deduction = df * percentage
    return df - deduction

In [17]:
# Each .pipe() feeds the previous result into the next function; extra keyword
# arguments (percentage=...) are forwarded to the piped function.
(
    df
    .pipe(substract_tax)
    .pipe(substract_customs)
    .pipe(substract_custom_percentage, percentage=0.01)
)

Unnamed: 0,Salary,Others
0,5793.48,724.185
1,68.797575,
2,3620.925,1448.37


### copy

In [132]:
# Base frame for the deep-vs-shallow copy comparison that follows.
df = pd.DataFrame(
    {
        "A": [1, 2, 3, 4],
        "B": [5, 6, 7, 8],
    }
)
df

Unnamed: 0,A,B
0,1,5
1,2,6
2,3,7
3,4,8


In [133]:
df2 = df.copy(deep=True)

In [134]:
df3 = df.copy(deep=False)

In [135]:
# Write through the original frame. In the recorded outputs below, the deep copy (df2)
# still shows 1 while the shallow copy (df3) shows 11.
# NOTE(review): with pandas Copy-on-Write enabled (the default from pandas 3.0) a shallow
# copy would no longer see this write; confirm which pandas version this notebook targets.
df.loc[0, 'A'] = 11

In [136]:
df

Unnamed: 0,A,B
0,11,5
1,2,6
2,3,7
3,4,8


In [137]:
df2

Unnamed: 0,A,B
0,1,5
1,2,6
2,3,7
3,4,8


In [138]:
df3

Unnamed: 0,A,B
0,11,5
1,2,6
2,3,7
3,4,8


In [139]:
df.head(1)

Unnamed: 0,A,B
0,11,5


In [140]:
df.tail(1)

Unnamed: 0,A,B
3,4,8


In [141]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [144]:
df2

Unnamed: 0,A,B
0,1,5
1,2,6
2,3,7
3,4,8


In [145]:
df2['A']

0    1
1    2
2    3
3    4
Name: A, dtype: int64

In [146]:
df2 = df2.rename(columns={'A':'AA'})

In [147]:
df2

Unnamed: 0,AA,B
0,1,5
1,2,6
2,3,7
3,4,8


In [148]:
df2

Unnamed: 0,AA,B
0,1,5
1,2,6
2,3,7
3,4,8


In [149]:
# Label 4 is not in the index yet, so this .loc assignment appends a new row [7, 7].
df2.loc[4] = [7,7]

In [150]:
df2

Unnamed: 0,AA,B
0,1,5
1,2,6
2,3,7
3,4,8
4,7,7


In [151]:
df2 = df2.drop('B', axis=1)

In [152]:
df2

Unnamed: 0,AA
0,1
1,2
2,3
3,4
4,7


In [153]:
# insert() modifies df2 in place, putting the new column 'B' at position 1.
# NOTE(review): this cell is not re-runnable as written. A second run should fail because
# 'B' would already exist (insert refuses duplicate labels unless allow_duplicates=True).
df2.insert(1, 'B', np.array([1,2,3,4,7]))

In [154]:
type(df2)

pandas.core.frame.DataFrame

In [155]:
df2

Unnamed: 0,AA,B
0,1,1
1,2,2
2,3,3
3,4,4
4,7,7


In [158]:
df2 = df2.drop(1, axis=0)

In [159]:
df2

Unnamed: 0,AA,B
0,1,1
2,3,3
3,4,4
4,7,7


In [164]:
# 'AA' holds integers at this point. Writing a string into it would rely on pandas
# silently upcasting the column, which pandas >= 2.1 deprecates (FutureWarning) and
# later versions reject. Cast to object explicitly first, then do the conditional write.
df2 = df2.astype({'AA': object})
# Rows where AA * 2 == 8 (i.e. AA == 4) get the marker string.
df2.loc[df2['AA'] * 2 == 8, 'AA'] = 'test'

In [165]:
df2

Unnamed: 0,AA,B
0,1,1
2,3,3
3,test,4
4,7,7


In [166]:
df2.to_string()

'     AA  B\n0     1  1\n2     3  3\n3  test  4\n4     7  7'

In [168]:
# apply() calls the lambda once per column, so every column is multiplied by 2
# (for the string 'test' in 'AA', * 2 means repetition).
# The result is bound back to df2, so re-running this cell doubles the values again.
df2 = df2.apply(lambda x: x*2)

In [169]:
# Element-wise version: the lambda is called once per cell. The result is only displayed, not stored.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# switch to .map once the minimum supported pandas version allows it.
df2.applymap(lambda x: x*2)

Unnamed: 0,AA,B
0,4,4
2,12,12
3,testtesttesttest,16
4,28,28


In [170]:
df2 = df2.drop(3, axis=0)

In [171]:
df2

Unnamed: 0,AA,B
0,2,2
2,6,6
4,14,14


In [172]:
df2.agg({'AA':['min', 'max', 'mean']})

Unnamed: 0,AA
min,2.0
max,14.0
mean,7.333333


In [173]:
df2

Unnamed: 0,AA,B
0,2,2
2,6,6
4,14,14


In [174]:
df2.iloc[2:]

Unnamed: 0,AA,B
4,14,14


### concat

In [175]:
# 4x4 frame of labelled strings: the cell in column "C", row 2 is "C2", and so on.
df1 = pd.DataFrame(
    {col: [f"{col}{row}" for row in range(4)] for col in "ABCD"},
    index=list(range(4)),
)

In [176]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [177]:
# 4x4 frame of labelled strings with row labels 4..7: column "C", row 6 is "C6", and so on.
df2 = pd.DataFrame(
    {col: [f"{col}{row}" for row in range(4, 8)] for col in "ABCD"},
    index=list(range(4, 8)),
)

In [178]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [180]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [189]:
pd.concat([df1,df2], axis=0)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [190]:
# Frame with row labels 0..3 and columns A-D, used for the column-wise (axis=1) concat.
df3 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index=[0, 1, 2, 3],
)

In [191]:
df3

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [192]:
pd.concat([df1,df3], axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,A0,B0,C0,D0
1,A1,B1,C1,D1,A1,B1,C1,D1
2,A2,B2,C2,D2,A2,B2,C2,D2
3,A3,B3,C3,D3,A3,B3,C3,D3


### merge

In [193]:
# Left-hand table of the merge: join column "key" plus payload columns A and B.
left = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
    }
)

In [194]:
# Right-hand table of the merge: join column "key" plus payload columns C and D.
right = pd.DataFrame(
    {
        "key": ["K0", "K1", "K2", "K3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    }
)

In [195]:
left

Unnamed: 0,key,A,B
0,K0,A0,B0
1,K1,A1,B1
2,K2,A2,B2
3,K3,A3,B3


In [196]:
right

Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1
2,K2,C2,D2
3,K3,C3,D3


In [197]:
# Left join on "key": keep every row of `left` and attach the matching C/D values from `right`.
left.merge(right, on="key", how="left")

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


### pivot, melt

In [208]:
# Long-format sample: variables A-D, each observed on three consecutive dates, with values 0..11.
data = {
    "value": range(12),
    "variable": [name for name in "ABCD" for _ in range(3)],
    "date": pd.to_datetime(["2020-01-03", "2020-01-04", "2020-01-05"] * 4),
}
df = pd.DataFrame(data)

In [209]:
df

Unnamed: 0,value,variable,date
0,0,A,2020-01-03
1,1,A,2020-01-04
2,2,A,2020-01-05
3,3,B,2020-01-03
4,4,B,2020-01-04
5,5,B,2020-01-05
6,6,C,2020-01-03
7,7,C,2020-01-04
8,8,C,2020-01-05
9,9,D,2020-01-03


In [210]:
# Reshape long -> wide: one row per 'variable', one column per 'date', cells taken from 'value'.
# pivot_table aggregates duplicate (variable, date) pairs (mean by default); every pair is
# unique in this data, so the displayed values equal the inputs.
df2 = df.pivot_table(index='variable', values='value', columns='date')

In [211]:
df2

date,2020-01-03,2020-01-04,2020-01-05
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0,1,2
B,3,4,5
C,6,7,8
D,9,10,11


In [215]:
# Wide-format sample for melt: two identifier columns and two measurement columns.
df = pd.DataFrame(
    dict(
        first=["John", "Mary"],
        last=["Doe", "Bo"],
        height=[5.5, 6.0],
        weight=[130, 150],
    )
)

In [216]:
df

Unnamed: 0,first,last,height,weight
0,John,Doe,5.5,130
1,Mary,Bo,6.0,150


In [219]:
# Wide -> long: 'first'/'last' stay as identifiers; the other columns are stacked
# into 'variable' / 'value' pairs.
pd.melt(df, id_vars=["first", "last"])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


### cumsum, crosstab

In [221]:
# Single integer column 1..4; the running total is added in the next cell.
df = pd.DataFrame({"A": list(range(1, 5))})

In [223]:
df['B'] = df['A'].cumsum()

In [224]:
df

Unnamed: 0,A,B
0,1,1
1,2,3
2,3,6
3,4,10


In [225]:
pd.crosstab(df['A'], df['B'])

B,1,3,6,10
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,0,0
2,0,1,0,0
3,0,0,1,0
4,0,0,0,1
