In [2]:
# References:
# online docs (free):          https://pandas.pydata.org/pandas-docs/stable/
# book, older edition (free):  https://www.safaribooksonline.com/library/view/python-data-science/9781491912126/
# book, newer edition (paid):  https://smile.amazon.com/Python-Data-Science-Handbook-Essential/dp/1491912057/

In [3]:
# pandas is a python library used for data manipulation and analysis
# two key data structures - series objects and dataframes

In [1]:
import numpy as np
import pandas as pd

In [2]:
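##### extending dataframes - appending rows #####
# note: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
#       pd.concat is the general-purpose way to stack rows and works on all versions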

In [3]:
# appending dataframe to dataframe without ignoring index
# DataFrame.append was removed in pandas 2.0, so the rows are stacked with
# pd.concat. Each frame keeps its own row labels, so the resulting index
# repeats (0, 1, 0, 1).
np.random.seed(0)  # reseed so the example values are reproducible
df1 = pd.DataFrame(data=np.random.normal(size=(2, 3)))
df2 = pd.DataFrame(data=np.random.normal(size=(2, 3)))
df = pd.concat([df1, df2])
print(df1)
print(df2)
print(df)

          0         1         2
0  1.764052  0.400157  0.978738
1  2.240893  1.867558 -0.977278
          0         1         2
0  0.950088 -0.151357 -0.103219
1  0.410599  0.144044  1.454274
          0         1         2
0  1.764052  0.400157  0.978738
1  2.240893  1.867558 -0.977278
0  0.950088 -0.151357 -0.103219
1  0.410599  0.144044  1.454274


In [4]:
# appending dataframe to dataframe with ignoring index
# DataFrame.append was removed in pandas 2.0, so the rows are stacked with
# pd.concat. ignore_index=True discards the original row labels and
# renumbers the result 0..n-1.
np.random.seed(0)  # reseed so the example values are reproducible
df1 = pd.DataFrame(data=np.random.normal(size=(2, 3)))
df2 = pd.DataFrame(data=np.random.normal(size=(2, 3)))
df = pd.concat([df1, df2], ignore_index=True)
print(df1)
print(df2)
print(df)

          0         1         2
0  1.764052  0.400157  0.978738
1  2.240893  1.867558 -0.977278
          0         1         2
0  0.950088 -0.151357 -0.103219
1  0.410599  0.144044  1.454274
          0         1         2
0  1.764052  0.400157  0.978738
1  2.240893  1.867558 -0.977278
2  0.950088 -0.151357 -0.103219
3  0.410599  0.144044  1.454274


In [5]:
# appending dataframe to dataframe without ignoring index
# Same as the unlabelled case, but with string row labels and named columns.
# DataFrame.append was removed in pandas 2.0, so pd.concat does the stacking;
# the row labels of both frames (X, Y, Z, W) are preserved.
np.random.seed(0)  # reseed so the example values are reproducible
df1 = pd.DataFrame(data=np.random.normal(size=(2, 3)), index=['X', 'Y'], columns=['aa', 'bb', 'cc'])
df2 = pd.DataFrame(data=np.random.normal(size=(2, 3)), index=['Z', 'W'], columns=['aa', 'bb', 'cc'])
df = pd.concat([df1, df2])
print(df1)
print(df2)
print(df)

         aa        bb        cc
X  1.764052  0.400157  0.978738
Y  2.240893  1.867558 -0.977278
         aa        bb        cc
Z  0.950088 -0.151357 -0.103219
W  0.410599  0.144044  1.454274
         aa        bb        cc
X  1.764052  0.400157  0.978738
Y  2.240893  1.867558 -0.977278
Z  0.950088 -0.151357 -0.103219
W  0.410599  0.144044  1.454274


In [6]:
# appending dataframe to dataframe with ignoring index
# DataFrame.append was removed in pandas 2.0, so pd.concat does the stacking.
# ignore_index=True replaces the string row labels (X, Y, Z, W) with 0..3;
# the column names are kept.
np.random.seed(0)  # reseed so the example values are reproducible
df1 = pd.DataFrame(data=np.random.normal(size=(2, 3)), index=['X', 'Y'], columns=['aa', 'bb', 'cc'])
df2 = pd.DataFrame(data=np.random.normal(size=(2, 3)), index=['Z', 'W'], columns=['aa', 'bb', 'cc'])
df = pd.concat([df1, df2], ignore_index=True)
print(df1)
print(df2)
print(df)

         aa        bb        cc
X  1.764052  0.400157  0.978738
Y  2.240893  1.867558 -0.977278
         aa        bb        cc
Z  0.950088 -0.151357 -0.103219
W  0.410599  0.144044  1.454274
         aa        bb        cc
0  1.764052  0.400157  0.978738
1  2.240893  1.867558 -0.977278
2  0.950088 -0.151357 -0.103219
3  0.410599  0.144044  1.454274


In [7]:
# appending series to dataframe without ignoring index - must include series name
# DataFrame.append was removed in pandas 2.0. To add a Series as a row with
# pd.concat, first turn it into a one-row frame: to_frame() makes a single
# column named after the Series ('Z'), and .T transposes it so that 'Z'
# becomes the row label and the Series index (aa, bb, cc) becomes the columns.
np.random.seed(0)  # reseed so the example values are reproducible
df1 = pd.DataFrame(data=np.random.normal(size=(2, 3)), index=['X', 'Y'], columns=['aa', 'bb', 'cc'])
ser = pd.Series(data=np.random.normal(size=3), index=['aa', 'bb', 'cc'], name='Z')
df = pd.concat([df1, ser.to_frame().T])
print(df1)
print(ser)
print(df)

         aa        bb        cc
X  1.764052  0.400157  0.978738
Y  2.240893  1.867558 -0.977278
aa    0.950088
bb   -0.151357
cc   -0.103219
Name: Z, dtype: float64
         aa        bb        cc
X  1.764052  0.400157  0.978738
Y  2.240893  1.867558 -0.977278
Z  0.950088 -0.151357 -0.103219


In [8]:
# appending series to dataframe with ignoring index - needn't include series name
# DataFrame.append was removed in pandas 2.0. The Series is turned into a
# one-row frame (to_frame().T) and stacked with pd.concat; because
# ignore_index=True renumbers every row, the row label of the Series is
# irrelevant and no name is needed.
np.random.seed(0)  # reseed so the example values are reproducible
df1 = pd.DataFrame(data=np.random.normal(size=(2, 3)), index=['X', 'Y'], columns=['aa', 'bb', 'cc'])
ser = pd.Series(data=np.random.normal(size=3), index=['aa', 'bb', 'cc'])
df = pd.concat([df1, ser.to_frame().T], ignore_index=True)
print(df1)
print(ser)
print(df)

         aa        bb        cc
X  1.764052  0.400157  0.978738
Y  2.240893  1.867558 -0.977278
aa    0.950088
bb   -0.151357
cc   -0.103219
dtype: float64
         aa        bb        cc
0  1.764052  0.400157  0.978738
1  2.240893  1.867558 -0.977278
2  0.950088 -0.151357 -0.103219


In [10]:
# note: when stacking rows, columns not in the dataframe are added as new columns and missing values are set to NaN
# DataFrame.append was removed in pandas 2.0, so pd.concat does the stacking.
# df1 has column 'cc' and df2 has column 'dd'; the result holds the union
# (aa, bb, cc, dd) with NaN wherever a frame had no value for a column.
# sort=False keeps the columns in order of first appearance and avoids the
# column-sorting FutureWarning that older pandas versions emitted here.
np.random.seed(0)  # reseed so the example values are reproducible
df1 = pd.DataFrame(data=np.random.normal(size=(2, 3)), index=['X', 'Y'], columns=['aa', 'bb', 'cc'])
df2 = pd.DataFrame(data=np.random.normal(size=(2, 3)), index=['Z', 'W'], columns=['aa', 'bb', 'dd'])
df = pd.concat([df1, df2], sort=False)
print(df1)
print(df2)
print(df)

         aa        bb        cc
X  1.764052  0.400157  0.978738
Y  2.240893  1.867558 -0.977278
         aa        bb        dd
Z  0.950088 -0.151357 -0.103219
W  0.410599  0.144044  1.454274
         aa        bb        cc        dd
X  1.764052  0.400157  0.978738       NaN
Y  2.240893  1.867558 -0.977278       NaN
Z  0.950088 -0.151357       NaN -0.103219
W  0.410599  0.144044       NaN  1.454274


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [11]:
##### extending dataframes - including columns #####
# note: can also do this more generically with pd.concat(..., axis=1)

In [12]:
# Add a new column from a Series: the assignment aligns the Series values
# to the frame's rows by label (X, Y, Z), not by position.
np.random.seed(0)
df = pd.DataFrame(
    np.random.normal(size=(3, 3)),
    index=list('XYZ'),
    columns=['aa', 'bb', 'cc'],
)
ser = pd.Series(data=[1000, 2000, 3000], index=list('XYZ'))
df['gg'] = ser
df

Unnamed: 0,aa,bb,cc,gg
X,1.764052,0.400157,0.978738,1000
Y,2.240893,1.867558,-0.977278,2000
Z,0.950088,-0.151357,-0.103219,3000


In [13]:
# Derived column: element-wise sum of the existing 'aa' and 'bb' columns.
df['aabb'] = df['aa'].add(df['bb'])
df

Unnamed: 0,aa,bb,cc,gg,aabb
X,1.764052,0.400157,0.978738,1000,2.16421
Y,2.240893,1.867558,-0.977278,2000,4.108451
Z,0.950088,-0.151357,-0.103219,3000,0.798731


In [14]:
##### dropping rows and columns #####

In [15]:
# 3x5 frame of normal draws that the drop examples in the following cells
# operate on. There is no reseed in this cell, so the values depend on the
# current state of NumPy's global random generator.
df = pd.DataFrame(
    np.random.normal(size=(3, 5)),
    index=list('XYZ'),
    columns=['aa', 'bb', 'cc', 'dd', 'ee'],
)
df

Unnamed: 0,aa,bb,cc,dd,ee
X,0.410599,0.144044,1.454274,0.761038,0.121675
Y,0.443863,0.333674,1.494079,-0.205158,0.313068
Z,-0.854096,-2.55299,0.653619,0.864436,-0.742165


In [16]:
# dropping rows
# drop() returns a new frame without the row labelled 'X'; the result is
# assigned back to df rather than using inplace=True, and index= states
# explicitly that a row label (not a column) is being removed.
df = df.drop(index='X')
df

Unnamed: 0,aa,bb,cc,dd,ee
Y,0.443863,0.333674,1.494079,-0.205158,0.313068
Z,-0.854096,-2.55299,0.653619,0.864436,-0.742165


In [17]:
# dropping columns
# drop() returns a new frame without columns 'aa' and 'ee'; the result is
# assigned back to df rather than using inplace=True, and columns= replaces
# the less readable axis=1.
df = df.drop(columns=['aa', 'ee'])
df

Unnamed: 0,bb,cc,dd
Y,0.333674,1.494079,-0.205158
Z,-2.55299,0.653619,0.864436


In [18]:
# dropping duplicates (index doesn't have to be same - just looking at values)
# Rows X and Y hold identical values under different labels; row Z is distinct.
df = pd.DataFrame(
    data=[
        [1, 2, 3, 4, 5],
        [1, 2, 3, 4, 5],
        [10, 20, 30, 40, 50],
    ],
    index=['X', 'Y', 'Z'],
    columns=['A', 'B', 'C', 'D', 'E'],
)
df

Unnamed: 0,A,B,C,D,E
X,1,2,3,4,5
Y,1,2,3,4,5
Z,10,20,30,40,50


In [19]:
# Returns a new frame with repeated rows removed; keep='first' (the default)
# retains the first occurrence of each set of identical rows. df itself is unchanged.
df.drop_duplicates(keep='first')

Unnamed: 0,A,B,C,D,E
X,1,2,3,4,5
Z,10,20,30,40,50
