In [150]:
import datetime as dt

import numpy as np
import pandas as pd

In [151]:
# Read the 'SalesOrders' sheet of SampleData.xlsx into a DataFrame.
df = pd.read_excel("SampleData.xlsx", sheet_name= 'SalesOrders')
df

Unnamed: 0,OrderDate,Region,Rep,Item,Units,Unit Cost,Total
0,2019-01-06,East,Jones,Pencil,95,1.99,189.05
1,2019-01-23,Central,Kivell,Binder,50,19.99,999.5
2,2019-02-09,Central,Jardine,Pencil,36,4.99,179.64
3,2019-02-26,Central,Gill,Pen,27,19.99,539.73
4,2019-03-15,West,Sorvino,Pencil,56,2.99,167.44
5,2019-04-01,East,Jones,Binder,60,4.99,299.4
6,2019-04-18,Central,Andrews,Pencil,75,1.99,149.25
7,2019-05-05,Central,Jardine,Pencil,90,4.99,449.1
8,2019-05-22,West,Thompson,Pencil,32,1.99,63.68
9,2019-06-08,East,Jones,Binder,60,8.99,539.4


# showing all column names

In [152]:
df.columns    # the Index object holding the column labels

Index(['OrderDate', 'Region', 'Rep', 'Item', 'Units', 'Unit Cost', 'Total'], dtype='object')

# changing column names

In [153]:
df.columns = df.columns.str.upper()
df

# replacing every space in a column label with an underscore _

df.columns = [label.replace(" ", "_") for label in df.columns]
df.loc[:0]

Unnamed: 0,ORDERDATE,REGION,REP,ITEM,UNITS,UNIT_COST,TOTAL
0,2019-01-06,East,Jones,Pencil,95,1.99,189.05


# renaming a few columns using df.rename

In [154]:
# Rename ORDERDATE -> ORDER_DATE and REP -> REPRESENTATIVE; all other labels are kept.
# rename() returns a new DataFrame, so the result is bound back to `df` instead of
# mutating the existing frame with inplace=True.

df = df.rename(columns={'ORDERDATE': "ORDER_DATE", "REP": "REPRESENTATIVE"})
df.loc[:0]

Unnamed: 0,ORDER_DATE,REGION,REPRESENTATIVE,ITEM,UNITS,UNIT_COST,TOTAL
0,2019-01-06,East,Jones,Pencil,95,1.99,189.05


# updating rows

In [155]:
# updating 2nd row (index label 1): the list supplies one value per column,
# in column order. `dt` is the datetime module imported in the first cell.
df.loc[1] = [dt.datetime.today(),'West','Shailesh','Rubber',45,10,450.0]

df.loc[:2]    # label slice is end-inclusive: rows 0, 1 and 2, with row 1 updated.

Unnamed: 0,ORDER_DATE,REGION,REPRESENTATIVE,ITEM,UNITS,UNIT_COST,TOTAL
0,2019-01-06 00:00:00.000000000,East,Jones,Pencil,95,1.99,189.05
1,2020-04-27 15:01:24.574708992,West,Shailesh,Rubber,45,10.0,450.0
2,2019-02-09 00:00:00.000000000,Central,Jardine,Pencil,36,4.99,179.64


In [156]:
# Update only some fields (date, region and item) of the 5th row, i.e. index label 4.
# .loc[row_label, [column_labels]] addresses exactly those cells; the list on the
# right-hand side supplies one value per listed column, in the same order.

df.loc[4 , ['ORDER_DATE','REGION','ITEM']] = [dt.datetime.today(),'NE','Sharpner']

df.loc[4:4]    # the slice 4:4 shows just that row


Unnamed: 0,ORDER_DATE,REGION,REPRESENTATIVE,ITEM,UNITS,UNIT_COST,TOTAL
4,2020-04-27 15:01:24.742722048,NE,Sorvino,Sharpner,56,2.99,167.44


# for updating we have some methods
- 1. apply
- 2. applymap
- 3. map
- 4. replace

## 1.apply
- df.apply(arg)  here we can pass a function.
- if called by a DataFrame apply function will work on each series.
- if called by a series then apply function works on each value.

In [157]:
df.loc[:0]    # a quick look at the first row (label slice up to and including 0).

Unnamed: 0,ORDER_DATE,REGION,REPRESENTATIVE,ITEM,UNITS,UNIT_COST,TOTAL
0,2019-01-06,East,Jones,Pencil,95,1.99,189.05


### example: finding length of each value in data frame.

In [158]:
df.apply(len)    # apply(len) is called on a DataFrame here, so the len function
                 # is applied to each Series of the frame, i.e. to each column.

# The result is therefore the number of entries in each column.

ORDER_DATE        43
REGION            43
REPRESENTATIVE    43
ITEM              43
UNITS             43
UNIT_COST         43
TOTAL             43
dtype: int64

In [159]:
df['REGION'].apply(len)    # apply(len) is called on a single Series here, so the
                           # len function is applied to each value of that Series,
                           # i.e. to every entry of the REGION column.

# The result is therefore the length of each value in that column.

0     4
1     4
2     7
3     7
4     2
5     4
6     7
7     7
8     4
9     4
10    7
11    4
12    4
13    4
14    7
15    4
16    7
17    4
18    4
19    7
20    7
21    4
22    7
23    7
24    4
25    4
26    7
27    7
28    4
29    7
30    7
31    7
32    4
33    7
34    7
35    4
36    7
37    4
38    4
39    7
40    7
41    7
42    7
Name: REGION, dtype: int64

## 2.applymap
- df.applymap(func)  here we can pass a function.
- can be called by a DataFrame only, not by a Series.
- when called by a DataFrame, the applymap function works on each individual value of the DataFrame.


In [160]:
df.applymap(lambda x: len(str(x)))    # applymap is called on a DataFrame and runs the
                                      # lambda on each and every value; str(x) is used
                                      # because len() is not defined for numbers.

Unnamed: 0,ORDER_DATE,REGION,REPRESENTATIVE,ITEM,UNITS,UNIT_COST,TOTAL
0,19,4,5,6,2,4,6
1,29,4,8,6,2,4,5
2,19,7,7,6,2,4,18
3,19,7,4,3,2,5,17
4,29,2,7,8,2,4,6
5,19,4,5,6,2,4,18
6,19,7,7,6,2,4,6
7,19,7,7,6,2,4,5
8,19,4,8,6,2,4,5
9,19,4,5,6,2,4,5


### change values of a dataFrame to lower/upper cases

In [161]:
df.applymap(lambda x: str(x).upper())
df

# No change shows up in df because applymap returns a NEW DataFrame and the result
# above is never assigned back to df. Note that str(x).upper() would also turn every
# number and date into a string; the next example upper-cases text only and assigns
# the result back to df.

Unnamed: 0,ORDER_DATE,REGION,REPRESENTATIVE,ITEM,UNITS,UNIT_COST,TOTAL
0,2019-01-06 00:00:00.000000000,East,Jones,Pencil,95,1.99,189.05
1,2020-04-27 15:01:24.574708992,West,Shailesh,Rubber,45,10.0,450.0
2,2019-02-09 00:00:00.000000000,Central,Jardine,Pencil,36,4.99,179.64
3,2019-02-26 00:00:00.000000000,Central,Gill,Pen,27,19.99,539.73
4,2020-04-27 15:01:24.742722048,NE,Sorvino,Sharpner,56,2.99,167.44
5,2019-04-01 00:00:00.000000000,East,Jones,Binder,60,4.99,299.4
6,2019-04-18 00:00:00.000000000,Central,Andrews,Pencil,75,1.99,149.25
7,2019-05-05 00:00:00.000000000,Central,Jardine,Pencil,90,4.99,449.1
8,2019-05-22 00:00:00.000000000,West,Thompson,Pencil,32,1.99,63.68
9,2019-06-08 00:00:00.000000000,East,Jones,Binder,60,8.99,539.4


In [162]:
def DFupperCase(x):
    """Upper-case ``x`` when it is a purely alphabetic string; otherwise return it untouched.

    Intended to be passed to ``DataFrame.applymap`` so that text cells are
    upper-cased while numbers, dates and missing values keep their original
    value and type.

    Parameters
    ----------
    x : object
        A single cell value.

    Returns
    -------
    object
        ``x.upper()`` if ``x`` is a ``str`` consisting only of alphabetic
        characters, else ``x`` itself.
    """
    # Test the value itself rather than str(x): the text form of NaN, None,
    # True or NaT is alphabetic too, and would otherwise be replaced by the
    # strings "NAN", "NONE", "TRUE", "NAT".
    if isinstance(x, str) and x.isalpha():
        return x.upper()
    return x
    
df = df.applymap(DFupperCase)   # pass only the function name (no call parentheses);
                                # applymap calls it with each value of the DataFrame,
                                # one at a time. Assigning back to df keeps the result.
df

Unnamed: 0,ORDER_DATE,REGION,REPRESENTATIVE,ITEM,UNITS,UNIT_COST,TOTAL
0,2019-01-06 00:00:00.000000000,EAST,JONES,PENCIL,95,1.99,189.05
1,2020-04-27 15:01:24.574708992,WEST,SHAILESH,RUBBER,45,10.0,450.0
2,2019-02-09 00:00:00.000000000,CENTRAL,JARDINE,PENCIL,36,4.99,179.64
3,2019-02-26 00:00:00.000000000,CENTRAL,GILL,PEN,27,19.99,539.73
4,2020-04-27 15:01:24.742722048,NE,SORVINO,SHARPNER,56,2.99,167.44
5,2019-04-01 00:00:00.000000000,EAST,JONES,BINDER,60,4.99,299.4
6,2019-04-18 00:00:00.000000000,CENTRAL,ANDREWS,PENCIL,75,1.99,149.25
7,2019-05-05 00:00:00.000000000,CENTRAL,JARDINE,PENCIL,90,4.99,449.1
8,2019-05-22 00:00:00.000000000,WEST,THOMPSON,PENCIL,32,1.99,63.68
9,2019-06-08 00:00:00.000000000,EAST,JONES,BINDER,60,8.99,539.4


## 3.map
- series.map(arg)  here we can pass a dictionary (a function or another Series also works). Returns a new Series.
- can be called by a series only not by dataframe.
- used for substituting old values with new ones. Values that have no matching key in the dictionary are replaced with a NaN value.

In [163]:
# substituting two value using map function

# substituting two values using the map function

df['ITEM'].map({'PENCIL':'myPencil' , 'RUBBER':'myRubber'})

# PENCIL and RUBBER are replaced by myPencil and myRubber; every other value has no
# key in the dictionary and therefore comes back as NaN. The returned Series is not
# assigned, so the ITEM column of df itself is not modified.

0     myPencil
1     myRubber
2     myPencil
3          NaN
4          NaN
5          NaN
6     myPencil
7     myPencil
8     myPencil
9          NaN
10    myPencil
11         NaN
12         NaN
13    myPencil
14         NaN
15         NaN
16         NaN
17         NaN
18         NaN
19         NaN
20    myPencil
21         NaN
22         NaN
23         NaN
24         NaN
25         NaN
26         NaN
27    myPencil
28         NaN
29    myPencil
30         NaN
31         NaN
32         NaN
33         NaN
34         NaN
35         NaN
36    myPencil
37         NaN
38         NaN
39    myPencil
40         NaN
41         NaN
42         NaN
Name: ITEM, dtype: object

## 4.replace
- series.replace(arg)  here we can pass a dictionary. It returns a new Series.
- can be called by a Series (as here) and also by a DataFrame.
- used for substituting old values with new ones. Unlike map, values that have no match in the dictionary remain as they are.

In [166]:
# replace() returns a new Series; on its own this line leaves df untouched
df['ITEM'].replace({'PENCIL':'myPencil' , 'RUBBER':'myRubber'})


# in order to make these changes permanent we have to assign the result back to the column
df['ITEM'] = df['ITEM'].replace({'PENCIL':'myPencil' , 'RUBBER':'myRubber'})

In [167]:
df    # ITEM now shows myPencil / myRubber; all other items are unchanged

Unnamed: 0,ORDER_DATE,REGION,REPRESENTATIVE,ITEM,UNITS,UNIT_COST,TOTAL
0,2019-01-06 00:00:00.000000000,EAST,JONES,myPencil,95,1.99,189.05
1,2020-04-27 15:01:24.574708992,WEST,SHAILESH,myRubber,45,10.0,450.0
2,2019-02-09 00:00:00.000000000,CENTRAL,JARDINE,myPencil,36,4.99,179.64
3,2019-02-26 00:00:00.000000000,CENTRAL,GILL,PEN,27,19.99,539.73
4,2020-04-27 15:01:24.742722048,NE,SORVINO,SHARPNER,56,2.99,167.44
5,2019-04-01 00:00:00.000000000,EAST,JONES,BINDER,60,4.99,299.4
6,2019-04-18 00:00:00.000000000,CENTRAL,ANDREWS,myPencil,75,1.99,149.25
7,2019-05-05 00:00:00.000000000,CENTRAL,JARDINE,myPencil,90,4.99,449.1
8,2019-05-22 00:00:00.000000000,WEST,THOMPSON,myPencil,32,1.99,63.68
9,2019-06-08 00:00:00.000000000,EAST,JONES,BINDER,60,8.99,539.4
