In [1]:
import numpy as np
import pandas as pd
print('numpy  version:', np.__version__)
print('pandas version:', pd.__version__)

#rand int
#5 rows, 3 cols, each value is rand integer between [1, 10)
#np.random.randint(1, 10, (5, 3))

data = [[3, 5, 1],
       [9, 1, 7],
       [5, 3, 7],
       [6, 8, 6],
       [9, 4, 3]]
df = pd.DataFrame(data, columns=['col1', 'col2', 'col3'])

print('#dataframe example\n')
print(df)

numpy  version: 1.18.1
pandas version: 1.0.3
#dataframe example

   col1  col2  col3
0     3     5     1
1     9     1     7
2     5     3     7
3     6     8     6
4     9     4     3


In [2]:
##dataframe

#Function application

def adder(dataframe_self, arg2):
    return dataframe_self + arg2

#pipe(): table wise function application
#        dataframe itself will be passed to function as first argument
print('\n#dataframe.pipe()\n')
print(df.pipe(adder, 2))

#apply(): row or column wise function application
#         takes an optional axis argument, default is 0, eg. column wise
print('\n#dataframe.apply(axis=0)\n')
print(df.apply(np.mean, axis=0))

#applaymap(): element wise function application
#             accept a fuction taking a single value and returning a single value
print('\n#dataframe.applymap()\n')
print(df.applymap(lambda x:x*100))




#dataframe.pipe()

   col1  col2  col3
0     5     7     3
1    11     3     9
2     7     5     9
3     8    10     8
4    11     6     5

#dataframe.apply(axis=0)

col1    6.4
col2    4.2
col3    4.8
dtype: float64

#dataframe.applymap()

   col1  col2  col3
0   300   500   100
1   900   100   700
2   500   300   700
3   600   800   600
4   900   400   300


In [24]:
#Sort
data = np.random.randint(1, 10, (10,2))
unsorted_df = pd.DataFrame(data
                           ,index=[1,4,6,2,3,5,9,8,0,7]
                           ,columns = ['col2','col1'])


#sort by row label, ascending order
unsorted_df.sort_index()

#sort by descending order
unsorted_df.sort_index(ascending=False)

#sort by column label, axis parameter default is 0
unsorted_df.sort_index(axis=1)

#sort by values

#parameter by take a single column name
unsorted_df.sort_values(by='col1')

#parameter by take a list of column names
unsorted_df.sort_values(by=['col1', 'col2'])





In [49]:
#concatenate

one = pd.DataFrame({
   'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
   'subject_id':['sub1','sub2','sub4','sub6','sub5'],
   'Marks_scored':[98,90,87,69,78]},
   index=[1,2,3,4,5])

two = pd.DataFrame({
   'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
   'subject_id':['sub2','sub4','sub3','sub6','sub5'],
   'Marks_scored':[89,80,79,97,88]},
   index=[1,2,3,4,5])

one.append(two)

#如果index重复，则抛异常
try:
    one.append(two, verify_integrity=True)
except Exception as e:
    print(e)
    

#忽略原有的index，重新创建新的顺序index
one.append(two, ignore_index=True)


Indexes have overlapping values: Int64Index([1, 2, 3, 4, 5], dtype='int64')



In [26]:

# set index

# df has a sequence integer index after created
df = pd.DataFrame({'month': [1, 4, 7, 10],
                   'year': [2012, 2014, 2013, 2014],
                   'sale': [55, 40, 84, 31]})

# set one column as index
df.set_index('year')

# use the column as index, and keep the column
df.set_index('year', drop=False)

# set pandas Index as new index directly,
# the length of new index must match number of rows
df.set_index(pd.Index([4, 3, 2, 1]))


# reset index

one = pd.DataFrame({
   'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
   'subject_id':['sub1','sub2','sub4','sub6','sub5'],
   'Marks_scored':[98,90,87,69,78]},
   index=[1,2,3,4,5])

two = pd.DataFrame({
   'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
   'subject_id':['sub2','sub4','sub3','sub6','sub5'],
   'Marks_scored':[89,80,79,97,88]},
   index=[1,2,3,4,5])


df = one.append(two) #df has duplicate index

# 创建新的顺序index, 原来的index被作为新的列插入
df.reset_index()

# 创建新的顺序index, 原来的index被彻底丢弃
df.reset_index(drop=True)
print()






In [27]:
#select rows/columns

#select by rows/columns names
#DataFrame.loc[<ROWS RANGE>, <COLUMNS RANGE>]

#select by rows/columns by index positions
#DataFrame.iloc[<ROWS INDEX RANGE>, <COLUMNS INDEX RANGE>]

students = [ ('jack', 34, 'Sydeny') ,
             ('Riti', 30, 'Delhi' ) ,
             ('Aadi', 16, 'New York') ]


df = pd.DataFrame(students, columns=['Name', 'Age', 'City'], index=['a', 'b', 'c'])

# use a single ':' to select all rows/ columns
# select a single column Age, return a Series object with same indexes as df
df.loc[:, 'Age']

#select multiple columns
df.loc[:, ['Age', 'Name']]

#select a single row
df.loc['b', :]

#select mutiple rows
df.loc[['c', 'b'], :]

#select by rows and colums
df.loc[['c', 'b'], ['Age', 'Name']]

#select by row/colums range
#返回结果包含'c'行与'City'列
df.loc['a':'c', 'Age':'City']

#select by row/colums index range
#返回结果不包含第2行(0,1,2)，不包含第2列(0,1,2)，这与loc方法的行为有差别
df.iloc[0:2, 1:2]

#如果只根据列名进行选择，则可以使用DataFrame的下标操作符[]
df['Age']
df[['Name', 'Age']]


Unnamed: 0,Name,Age
a,jack,34
b,Riti,30
c,Aadi,16
