In [1]:
import pandas as pd
import numpy as np

# COMBINING w/ combine_first() on Series

In [3]:
# First demo Series: the values 6..10 stored under the integer labels 0..4.
ser1 = pd.Series(
    data=np.arange(6, 11),
    index=np.arange(0, 5),
)
ser1

0     6
1     7
2     8
3     9
4    10
dtype: int64

In [4]:
# Second demo Series: the values 9..12 stored under the labels 2, 4, 5 and 6.
ser2 = pd.Series(
    data=np.arange(9, 13),
    index=[2, 4, 5, 6],
)
ser2

2     9
4    10
5    11
6    12
dtype: int64

In [5]:
# Labels 2 and 4 are present in both Series.
# The caller (ser1) takes priority: its own values are kept (ser1[2] = 8)
# and ser2 only supplies the labels that ser1 lacks (5 and 6).
ser1.combine_first(other=ser2)

0     6.0
1     7.0
2     8.0
3     9.0
4    10.0
5    11.0
6    12.0
dtype: float64

In [6]:
# Same operation with the roles swapped: ser2 is the caller, so ser2[2] = 9
# is kept and ser1 only supplies the labels that ser2 lacks (0, 1 and 3).
ser2.combine_first(other=ser1)

0     6.0
1     7.0
2     9.0
3     9.0
4    10.0
5    11.0
6    12.0
dtype: float64

In [7]:
# Combine the first two rows of ser2 (labels 2 and 4) with the first three
# rows of ser1 (labels 0, 1 and 2). Only label 2 is shared, and the caller's
# value wins there (ser2[2] = 9); labels 0 and 1 are filled in from ser1.
# .iloc makes the positional slicing explicit: both Series carry integer
# labels, so a bare [:2] is easy to misread as a label-based slice.
ser2.iloc[:2].combine_first(ser1.iloc[:3])

0     6.0
1     7.0
2     9.0
4    10.0
dtype: float64

# REMOVING w/ del & drop()

In [15]:
# Small demo frame: one row per colour, one column per object.
f1 = pd.DataFrame(
    data={
        'ball': [0, 3, 6],
        'pen': [1, 4, 7],
        'pencil': [2, 5, 8],
    },
    index=['white', 'black', 'red'],
)
f1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [18]:
# To delete COLUMNS, use the del statement.
# The change is applied to f1 itself (in place), so the 'ball' column is gone
# from here on. Gotcha: running this cell a second time fails, because
# 'ball' no longer exists in f1.
del f1['ball']
f1

Unnamed: 0,pen,pencil
white,1,2
black,4,5
red,7,8


In [21]:
# To delete ROWS, use drop() with the row label.
# drop() hands back a new frame; the result is only displayed here and not
# assigned, so the change is NOT applied to f1.
f1.drop(labels='white', axis=0)

Unnamed: 0,pen,pencil
black,4,5
red,7,8


# DATA TRANSFORMATION: REMOVING DUPLICATES

In [5]:
# Demo frame containing repeated rows: rows 3 and 4 repeat rows 1 and 0.
df = pd.DataFrame(
    data={
        'color': ['white', 'red', 'white', 'red', 'white'],
        'value': [2, 1, 3, 1, 2],
    }
)
df

Unnamed: 0,color,value
0,white,2
1,red,1
2,white,3
3,red,1
4,white,2


In [6]:
# duplicated() returns True for every row that repeats an earlier row;
# the first occurrence of each row stays False.
df.duplicated()

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [7]:
# Use the boolean mask from duplicated() to show only the repeated rows.
df.loc[df.duplicated()]

Unnamed: 0,color,value
3,red,1
4,white,2


In [8]:
# Show the frame with repeated rows removed (the first occurrence of each row
# is kept). The result is only displayed here, not assigned back to df.
df.drop_duplicates()

Unnamed: 0,color,value
0,white,2
1,red,1
2,white,3


# MAPPING via replace(), map(), rename()

In [9]:
# Demo frame for the mapping examples; two colours ('rosso', 'verde') are
# spelled differently from the rest and get replaced in a later cell.
frame = pd.DataFrame(
    data={
        'item': ['ball', 'mug', 'pen', 'pencil', 'ashtray'],
        'color': ['white', 'rosso', 'verde', 'black', 'yellow'],
        'price': [5.56, 4.20, 1.30, 0.56, 2.75],
    }
)
frame

Unnamed: 0,color,item,price
0,white,ball,5.56
1,rosso,mug,4.2
2,verde,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [10]:
# replace() with a dictionary: each key is a value to look for,
# each value is what it gets replaced with.
newcolors = dict(rosso='red', verde='green')
# The result is only displayed here, not assigned back to frame.
frame.replace(to_replace=newcolors)

Unnamed: 0,color,item,price
0,white,ball,5.56
1,red,mug,4.2
2,green,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [11]:
# Series with two missing entries (NaN) at positions 2 and 4.
ser = pd.Series(data=[1, 3, np.nan, 4.6, np.nan, 3])
ser

0    1.0
1    3.0
2    NaN
3    4.6
4    NaN
5    3.0
dtype: float64

In [12]:
# replace() also works on missing values: show ser with every NaN swapped for 0.
ser.replace(to_replace=np.nan, value=0)

0    1.0
1    3.0
2    0.0
3    4.6
4    0.0
5    3.0
dtype: float64

In [13]:
# Adding values via map().
# Show the frame first: the colours are still 'rosso'/'verde', because the
# earlier replace() result was only displayed, never assigned back to frame.
frame

Unnamed: 0,color,item,price
0,white,ball,5.56
1,rosso,mug,4.2
2,verde,pen,1.3
3,black,pencil,0.56
4,yellow,ashtray,2.75


In [20]:
# Per-item VAT values, keyed by the names found in the 'item' column.
VAT = {
    'ball': 0.34,
    'mug': 0.42,
    'pen': 0.13,
    'pencil': 0.05,
    'ashtray': 0.3,
}
# map() looks up every 'item' in VAT; the result is stored as a new 'VAT' column.
frame['VAT'] = frame['item'].map(VAT)
frame

Unnamed: 0,color,item,price,VAT
0,white,ball,5.56,0.34
1,rosso,mug,4.2,0.42
2,verde,pen,1.3,0.13
3,black,pencil,0.56,0.05
4,yellow,ashtray,2.75,0.3


# INDEXING

In [3]:
# Fresh demo frame for the indexing examples (rebinds the name `frame`).
frame = pd.DataFrame(
    data={
        'color': ['blue', 'green', 'yellow', 'red', 'white'],
        'object': ['ball', 'pen', 'pencil', 'paper', 'mug'],
        'price': [1.2, 1, 0.6, 0.9, 1.7],
    }
)
frame

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [6]:
# Use the 'color' values as row labels. drop=False keeps the column as well,
# so 'color' shows up both as the index and as a regular column.
frame = frame.set_index('color', drop=False)
frame

Unnamed: 0_level_0,color,object,price
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
blue,blue,ball,1.2
green,green,pen,1.0
yellow,yellow,pencil,0.6
red,red,paper,0.9
white,white,mug,1.7
