# Pandas

Requires cars.csv and cars_to_csv.csv files

In [173]:
import pandas as pd

In [174]:
print(pd.__version__)

1.3.4


Pandas has two main data structures: 

- Series
- Dataframes

### Series

A Pandas Series is like a column in a table. It is a one-dimensional array holding data of any type.

- Homogeneous data
- Size Immutable
- Values of Data Mutable

In [175]:
a = [1, 7, 2]
myvar = pd.Series(a)
print(myvar)

0    1
1    7
2    2
dtype: int64


In [175]:
a = [1, 7, 2]
myvar = pd.Series(a)
print(myvar)

0    1
1    7
2    2
dtype: int64


In [176]:
a = [1, 7, 2]
myvar = pd.Series(a, index = ["x", "y", "z"])
print(myvar)

x    1
y    7
z    2
dtype: int64


In [177]:
# Note: The keys of the dictionary become the labels(index)

calories = {"day1": 420, "day2": 380, "day3": 390}
myvar = pd.Series(calories)
print(myvar)

day1    420
day2    380
day3    390
dtype: int64


In [178]:
calories = {"day1": 420, "day2": 380, "day3": 390}
myvar = pd.Series(calories, index = ["day1", "day2", "day4"])
print(myvar)

day1    420.0
day2    380.0
day4      NaN
dtype: float64


### DataFrame

Series is like a column, a DataFrame is the whole table.

Data sets in Pandas are usually multi-dimensional tables, called DataFrames.

It is a 2 dimensional data structure, like a 2 dimensional array, or a table with rows and columns and heterogeneous data, with rows having an index each and columns having meaningful names.

- Heterogeneous data
- Size Mutable
- Data Mutable

In [179]:
data = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'],'Age':[28,34,29,42]}
df = pd.DataFrame(data, index=['rank1','rank2','rank3','rank4'])
df

Unnamed: 0,Name,Age
rank1,Tom,28
rank2,Jack,34
rank3,Steve,29
rank4,Ricky,42


#### Creating datframe from a dictionary

pd.DataFrame(dictionary_name) 

The ‘key’ in the dictionary acts as the column name and the ‘values’ stored are the entries under the column. 

In [180]:
mydataset = {
  'cars': ["BMW", "Volvo", "Ford"],
  'passings': [3, 7, 2]
}

myvar = pd.DataFrame(mydataset)

print(myvar)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2


In [181]:
data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

#load data into a DataFrame object
myvar = pd.DataFrame(data)
print(myvar)

   calories  duration
0       420        50
1       380        40
2       390        45


In [182]:
# Data in lists
# Construct dictionary
# Create dataframe from a dictionary

In [183]:
cars_per_cap = [809, 731, 588, 18, 200, 70, 45]
country = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
drives_right = [True, False, False, False, True, True, True]

In [184]:
data = {"cars_per_cap": cars_per_cap, "country": country, "drives_right": drives_right}

In [185]:
data

{'cars_per_cap': [809, 731, 588, 18, 200, 70, 45],
 'country': ['United States',
  'Australia',
  'Japan',
  'India',
  'Russia',
  'Morocco',
  'Egypt'],
 'drives_right': [True, False, False, False, True, True, True]}

In [186]:
cars = pd.DataFrame(data)
cars

Unnamed: 0,cars_per_cap,country,drives_right
0,809,United States,True
1,731,Australia,False
2,588,Japan,False
3,18,India,False
4,200,Russia,True
5,70,Morocco,True
6,45,Egypt,True


In [187]:
# Refers to the row index
# Returns a Pandas Series

print(cars.loc[5])

cars_per_cap         70
country         Morocco
drives_right       True
Name: 5, dtype: object


In [188]:
print(cars.loc[[5]])

   cars_per_cap  country  drives_right
5            70  Morocco          True


In [189]:
# When using [], the result is a Pandas DataFrame.

print(cars.loc[[0, 1]])

   cars_per_cap        country  drives_right
0           809  United States          True
1           731      Australia         False


In [190]:
data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

df = pd.DataFrame(data, index = ["day1", "day2", "day3"])

print(df) 

      calories  duration
day1       420        50
day2       380        40
day3       390        45


In [191]:
# Refer to the named index:
print(df.loc["day2"])

calories    380
duration     40
Name: day2, dtype: int64


In [192]:
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)
df

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


#### Creating dataframes from external files

Data may not necessarily be available in the form of lists. Mostly, we will have to load the data stored in the form of a CSV file, text file, etc

pd.read_csv(filepath, sep=',', header='infer')

specify the following details:

- separator (by default ‘,’)
- header (takes the top row by default, if not specified)
- names (list of column name)

In [193]:
# Read a file using pandas
# reads first row as header by default

cars_df = pd.read_csv('cars.csv')
cars_df

Unnamed: 0,USCA,US,United States,809,FALSE
0,ASPAC,AUS,Australia,731.0,True
1,ASPAC,JAP,Japan,588.0,True
2,ASPAC,IN,India,18.0,True
3,ASPAC,RU,Russia,200.0,False
4,LATAM,MOR,Morocco,70.0,False
5,AFR,EG,Egypt,45.0,False
6,EUR,ENG,England,,True


In [194]:
# Read file - skip header
cars_df = pd.read_csv('cars.csv', header=None)
cars_df

Unnamed: 0,0,1,2,3,4
0,USCA,US,United States,809.0,False
1,ASPAC,AUS,Australia,731.0,True
2,ASPAC,JAP,Japan,588.0,True
3,ASPAC,IN,India,18.0,True
4,ASPAC,RU,Russia,200.0,False
5,LATAM,MOR,Morocco,70.0,False
6,AFR,EG,Egypt,45.0,False
7,EUR,ENG,England,,True


In [195]:
# Returns an array of headers

cars_df.columns

Int64Index([0, 1, 2, 3, 4], dtype='int64')

##### Assign Headers

In [196]:
# Rename Headers

cars_df.columns = ['country code', 'region', 'country', 'cars_per_cap', 'drive_right']
cars_df

Unnamed: 0,country code,region,country,cars_per_cap,drive_right
0,USCA,US,United States,809.0,False
1,ASPAC,AUS,Australia,731.0,True
2,ASPAC,JAP,Japan,588.0,True
3,ASPAC,IN,India,18.0,True
4,ASPAC,RU,Russia,200.0,False
5,LATAM,MOR,Morocco,70.0,False
6,AFR,EG,Egypt,45.0,False
7,EUR,ENG,England,,True


In [197]:
df = pd.read_csv("https://media-doselect.s3.amazonaws.com/generic/A08MajL8qN4rq72EpVJbAP1Rw/marks_1.csv", sep= "|", header=None)
print(df)

     0        1            2   3   4   5
0    1   Akshay  Mathematics  50  40  80
1    2   Mahima      English  40  33  83
2    3    Vikas  Mathematics  50  42  84
3    4  Abhinav      English  40  31  78
4    5   Mahima      Science  50  40  80
5    6   Akshay      Science  50  49  98
6    7  Abhinav  Mathematics  50  47  94
7    8    Vikas      Science  50  40  80
8    9  Abhinav      Science  50  47  94
9   10    Vikas      English  40  39  98
10  11   Akshay      English  40  35  88
11  12   Mahima  Mathematics  50  43  86


#### Assign first column as index

pd.read_csv(filepath, index_col = column_number)
to change the index while loading the data from a file

In [198]:
# Read file and set 1st column as index
# Set first column as index

cars_df = pd.read_csv("cars.csv", header= None, index_col=0)
cars_df

Unnamed: 0_level_0,1,2,3,4
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
USCA,US,United States,809.0,False
ASPAC,AUS,Australia,731.0,True
ASPAC,JAP,Japan,588.0,True
ASPAC,IN,India,18.0,True
ASPAC,RU,Russia,200.0,False
LATAM,MOR,Morocco,70.0,False
AFR,EG,Egypt,45.0,False
EUR,ENG,England,,True


In [199]:
# set the column names
cars_df.columns = ['region', 'country', 'cars_per_cap', 'drive_right']
cars_df

Unnamed: 0_level_0,region,country,cars_per_cap,drive_right
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
USCA,US,United States,809.0,False
ASPAC,AUS,Australia,731.0,True
ASPAC,JAP,Japan,588.0,True
ASPAC,IN,India,18.0,True
ASPAC,RU,Russia,200.0,False
LATAM,MOR,Morocco,70.0,False
AFR,EG,Egypt,45.0,False
EUR,ENG,England,,True


In [200]:
# Print the new index
cars_df.index

Index(['USCA', 'ASPAC', 'ASPAC', 'ASPAC', 'ASPAC', 'LATAM', 'AFR', 'EUR'], dtype='object', name=0)

#### Rename the Index Name

In [201]:
cars_df.index.name = 'country_code'
cars_df

Unnamed: 0_level_0,region,country,cars_per_cap,drive_right
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
USCA,US,United States,809.0,False
ASPAC,AUS,Australia,731.0,True
ASPAC,JAP,Japan,588.0,True
ASPAC,IN,India,18.0,True
ASPAC,RU,Russia,200.0,False
LATAM,MOR,Morocco,70.0,False
AFR,EG,Egypt,45.0,False
EUR,ENG,England,,True


#### Delete the index name

In [202]:
cars_df.index.name = None
cars_df

Unnamed: 0,region,country,cars_per_cap,drive_right
USCA,US,United States,809.0,False
ASPAC,AUS,Australia,731.0,True
ASPAC,JAP,Japan,588.0,True
ASPAC,IN,India,18.0,True
ASPAC,RU,Russia,200.0,False
LATAM,MOR,Morocco,70.0,False
AFR,EG,Egypt,45.0,False
EUR,ENG,England,,True


#### Set Hierarchical index

create a multilevel indexing for your dataframe

In [203]:
# Read file and set 1st column as index
cars_df = pd.read_csv("cars.csv", header= None)

# set the column names
cars_df.columns = ['country_code','region','country','cars_per_cap','drives_right']

cars_df.set_index(['region', 'country_code'], inplace=True)
cars_df

Unnamed: 0_level_0,Unnamed: 1_level_0,country,cars_per_cap,drives_right
region,country_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
US,USCA,United States,809.0,False
AUS,ASPAC,Australia,731.0,True
JAP,ASPAC,Japan,588.0,True
IN,ASPAC,India,18.0,True
RU,ASPAC,Russia,200.0,False
MOR,LATAM,Morocco,70.0,False
EG,AFR,Egypt,45.0,False
ENG,EUR,England,,True


#### Write Data Frame to file

In [204]:
cars_df.to_csv('cars_to_csv.csv')

In [205]:
df = pd.read_csv("https://media-doselect.s3.amazonaws.com/generic/A08MajL8qN4rq72EpVJbAP1Rw/marks_1.csv", sep= "|", header=None, index_col=0)
df

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Akshay,Mathematics,50,40,80
2,Mahima,English,40,33,83
3,Vikas,Mathematics,50,42,84
4,Abhinav,English,40,31,78
5,Mahima,Science,50,40,80
6,Akshay,Science,50,49,98
7,Abhinav,Mathematics,50,47,94
8,Vikas,Science,50,40,80
9,Abhinav,Science,50,47,94
10,Vikas,English,40,39,98


In [206]:
df.columns= ["Name", "Subject", "Maximum Marks", "Marks Obtained", "Percentage"]
df

Unnamed: 0_level_0,Name,Subject,Maximum Marks,Marks Obtained,Percentage
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Akshay,Mathematics,50,40,80
2,Mahima,English,40,33,83
3,Vikas,Mathematics,50,42,84
4,Abhinav,English,40,31,78
5,Mahima,Science,50,40,80
6,Akshay,Science,50,49,98
7,Abhinav,Mathematics,50,47,94
8,Vikas,Science,50,40,80
9,Abhinav,Science,50,47,94
10,Vikas,English,40,39,98


In [207]:
df.index.name = "S.No."
print(df)

          Name      Subject  Maximum Marks  Marks Obtained  Percentage
S.No.                                                                 
1       Akshay  Mathematics             50              40          80
2       Mahima      English             40              33          83
3        Vikas  Mathematics             50              42          84
4      Abhinav      English             40              31          78
5       Mahima      Science             50              40          80
6       Akshay      Science             50              49          98
7      Abhinav  Mathematics             50              47          94
8        Vikas      Science             50              40          80
9      Abhinav      Science             50              47          94
10       Vikas      English             40              39          98
11      Akshay      English             40              35          88
12      Mahima  Mathematics             50              43          86


- dataframe_name.head(): the dataframes may hold large volumes of data, it would be an inefficient approach to load the entire data whenever an operation is performed. This method loads a limited number of entries.
- dataframe.info(): This method prints information about the dataframe, which includes the index data type and column data types, the count of non-null values and the memory used. 
- dataframe.describe(): This function produces descriptive statistics for the dataframe, that is, the central tendency (mean, median, min, max, etc.), dispersion, etc. It analyses the data and generates output for both numeric and non-numeric data types accordingly. 

In [208]:
sales = pd.read_excel('sales.xlsx')
sales

Unnamed: 0,Market,Region,No_of_Orders,Profit,Sales
0,Africa,Western Africa,251,-12901.51,78476.06
1,Africa,Southern Africa,85,11768.58,51319.5
2,Africa,North Africa,182,21643.08,86698.89
3,Africa,Eastern Africa,110,8013.04,44182.6
4,Africa,Central Africa,103,15606.3,61689.99
5,Asia Pacific,Western Asia,382,-16766.9,124312.24
6,Asia Pacific,Southern Asia,469,67998.76,351806.6
7,Asia Pacific,Southeastern Asia,533,20948.84,329751.38
8,Asia Pacific,Oceania,646,54734.02,408002.98
9,Asia Pacific,Eastern Asia,414,72805.1,315390.77


In [209]:
# Read file and set 1st two columns as index
# Set Hierarchical index


sales = pd.read_excel('sales.xlsx', index_col = [0,1])
sales

Unnamed: 0_level_0,Unnamed: 1_level_0,No_of_Orders,Profit,Sales
Market,Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,Western Africa,251,-12901.51,78476.06
Africa,Southern Africa,85,11768.58,51319.5
Africa,North Africa,182,21643.08,86698.89
Africa,Eastern Africa,110,8013.04,44182.6
Africa,Central Africa,103,15606.3,61689.99
Asia Pacific,Western Asia,382,-16766.9,124312.24
Asia Pacific,Southern Asia,469,67998.76,351806.6
Asia Pacific,Southeastern Asia,533,20948.84,329751.38
Asia Pacific,Oceania,646,54734.02,408002.98
Asia Pacific,Eastern Asia,414,72805.1,315390.77


#### Viewing the Data

In [210]:
# Default - returns top 5 rows
sales.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,No_of_Orders,Profit,Sales
Market,Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,Western Africa,251,-12901.51,78476.06
Africa,Southern Africa,85,11768.58,51319.5
Africa,North Africa,182,21643.08,86698.89
Africa,Eastern Africa,110,8013.04,44182.6
Africa,Central Africa,103,15606.3,61689.99


In [211]:
# returns last 5 rows
sales.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,No_of_Orders,Profit,Sales
Market,Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
USCA,Western US,490,44303.65,251991.83
USCA,Southern US,255,19991.83,148771.91
USCA,Eastern US,443,47462.04,264973.98
USCA,Central US,356,33697.43,170416.31
USCA,Canada,49,7246.62,26298.81


#### Info About the Data

In [212]:
# Display the information about the data stored in data frame
sales.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 23 entries, ('Africa', 'Western Africa') to ('USCA', 'Canada')
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   No_of_Orders  23 non-null     int64  
 1   Profit        23 non-null     float64
 2   Sales         23 non-null     float64
dtypes: float64(2), int64(1)
memory usage: 1.6+ KB


In [213]:
# Name of each column, with the data type
# Number of rows and columns
# Index information 
# Data column information
# Non-Null values present in each column
# how much memory your DataFrame is using

In [214]:
# Display the statistical information about the data in dataframe
sales.describe()

Unnamed: 0,No_of_Orders,Profit,Sales
count,23.0,23.0,23.0
mean,366.478261,28859.944783,206285.108696
std,246.590361,27701.193773,160589.886606
min,37.0,-16766.9,8190.74
25%,211.5,12073.085,82587.475
50%,356.0,20948.84,170416.31
75%,479.5,45882.845,290182.375
max,964.0,82091.27,656637.14


In [215]:
sales["Profit"]

Market        Region           
Africa        Western Africa      -12901.51
              Southern Africa      11768.58
              North Africa         21643.08
              Eastern Africa        8013.04
              Central Africa       15606.30
Asia Pacific  Western Asia        -16766.90
              Southern Asia        67998.76
              Southeastern Asia    20948.84
              Oceania              54734.02
              Eastern Asia         72805.10
              Central Asia         -2649.76
Europe        Western Europe       82091.27
              Southern Europe      18911.49
              Northern Europe      43237.44
              Eastern Europe       25050.69
LATAM         South America        12377.59
              Central America      74679.54
              Caribbean            13529.59
USCA          Western US           44303.65
              Southern US          19991.83
              Eastern US           47462.04
              Central US           33697.43


In [216]:
sales[["Profit"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Profit
Market,Region,Unnamed: 2_level_1
Africa,Western Africa,-12901.51
Africa,Southern Africa,11768.58
Africa,North Africa,21643.08
Africa,Eastern Africa,8013.04
Africa,Central Africa,15606.3
Asia Pacific,Western Asia,-16766.9
Asia Pacific,Southern Asia,67998.76
Asia Pacific,Southeastern Asia,20948.84
Asia Pacific,Oceania,54734.02
Asia Pacific,Eastern Asia,72805.1


In [217]:
# use to_string() to print the entire DataFrame

print(sales.to_string()) 

                                No_of_Orders    Profit      Sales
Market       Region                                              
Africa       Western Africa              251 -12901.51   78476.06
             Southern Africa              85  11768.58   51319.50
             North Africa                182  21643.08   86698.89
             Eastern Africa              110   8013.04   44182.60
             Central Africa              103  15606.30   61689.99
Asia Pacific Western Asia                382 -16766.90  124312.24
             Southern Asia               469  67998.76  351806.60
             Southeastern Asia           533  20948.84  329751.38
             Oceania                     646  54734.02  408002.98
             Eastern Asia                414  72805.10  315390.77
             Central Asia                 37  -2649.76    8190.74
Europe       Western Europe              964  82091.27  656637.14
             Southern Europe             338  18911.49  215703.93
          

In [218]:
df = pd.read_csv('https://query.data.world/s/vBDCsoHCytUSLKkLvq851k2b8JOCkF')
print(df.describe())

                X           Y        FFMC         DMC          DC         ISI  \
count  517.000000  517.000000  517.000000  517.000000  517.000000  517.000000   
mean     4.669246    4.299807   90.644681  110.872340  547.940039    9.021663   
std      2.313778    1.229900    5.520111   64.046482  248.066192    4.559477   
min      1.000000    2.000000   18.700000    1.100000    7.900000    0.000000   
25%      3.000000    4.000000   90.200000   68.600000  437.700000    6.500000   
50%      4.000000    4.000000   91.600000  108.300000  664.200000    8.400000   
75%      7.000000    5.000000   92.900000  142.400000  713.900000   10.800000   
max      9.000000    9.000000   96.200000  291.300000  860.600000   56.100000   

             temp          RH        wind        rain         area  
count  517.000000  517.000000  517.000000  517.000000   517.000000  
mean    18.889168   44.288201    4.017602    0.021663    12.847292  
std      5.806625   16.317469    1.791653    0.295959    63.655

In [219]:
print(df.columns)

Index(['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH',
       'wind', 'rain', 'area'],
      dtype='object')


In [220]:
print(df.shape)

(517, 13)


#### max_rows

The number of rows returned is defined in Pandas option settings.

In [221]:
print(pd.options.display.max_rows) 

9999


If the DataFrame contains more than 60 rows, the print(df) statement will return only the headers and the first and last 5 rows.

In [222]:
# Increase the maximum number of rows to display the entire DataFrame

pd.options.display.max_rows = 9999

In [223]:
df = pd.read_csv('https://media-doselect.s3.amazonaws.com/generic/NMgEjwkAEGGQZBoNYGr9Ld7w0/rating.csv')
df

Unnamed: 0,ID,Department,Office,Rating
0,U2F26,Finance,New Delhi,3.4
1,U2M61,Marketing,New Delhi,3.9
2,U1S15,Sales,New Delhi,2.8
3,U1H87,HR,Mumbai,2.1
4,U1S51,Sales,New Delhi,4.6
5,U1H78,HR,New Delhi,3.8
6,U3M65,Marketing,New Delhi,5.0
7,U3M44,Marketing,Bangalore,2.1
8,U3S75,Sales,Bangalore,4.3
9,U2H84,HR,Bangalore,4.3


In [224]:
# Create a hierarchical index based on two columns: Office and Department
df = pd.read_csv('https://media-doselect.s3.amazonaws.com/generic/NMgEjwkAEGGQZBoNYGr9Ld7w0/rating.csv', index_col=[2,1])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,Rating
Office,Department,Unnamed: 2_level_1,Unnamed: 3_level_1
New Delhi,Finance,U2F26,3.4
New Delhi,Marketing,U2M61,3.9
New Delhi,Sales,U1S15,2.8
Mumbai,HR,U1H87,2.1
New Delhi,Sales,U1S51,4.6
New Delhi,HR,U1H78,3.8
New Delhi,Marketing,U3M65,5.0
Bangalore,Marketing,U3M44,2.1
Bangalore,Sales,U3S75,4.3
Bangalore,HR,U2H84,4.3


In [225]:
# Print the first 5 rows as the output.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,Rating
Office,Department,Unnamed: 2_level_1,Unnamed: 3_level_1
New Delhi,Finance,U2F26,3.4
New Delhi,Marketing,U2M61,3.9
New Delhi,Sales,U1S15,2.8
Mumbai,HR,U1H87,2.1
New Delhi,Sales,U1S51,4.6


In [226]:
df[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,Rating
Office,Department,Unnamed: 2_level_1,Unnamed: 3_level_1
New Delhi,Finance,U2F26,3.4
New Delhi,Marketing,U2M61,3.9
New Delhi,Sales,U1S15,2.8
Mumbai,HR,U1H87,2.1
New Delhi,Sales,U1S51,4.6
New Delhi,HR,U1H78,3.8
New Delhi,Marketing,U3M65,5.0
Bangalore,Marketing,U3M44,2.1
Bangalore,Sales,U3S75,4.3
Bangalore,HR,U2H84,4.3


### Indexing

- Select rows from a dataframe
- Select columns from a dataframe
- Select subsets of dataframes

df[start_index:end_index] will subset the rows according to the start and end indices. 

<div class="alert alert-block alert-success">
<b>NOTE</b> 
    
- df['column'] or df.column: It returns a series
- df[['col_x', 'col_y']]: It returns a dataframe
    
</div>

In [227]:
sales = pd.read_excel('sales.xlsx', index_col = [1])
sales

Unnamed: 0_level_0,Market,No_of_Orders,Profit,Sales
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Western Africa,Africa,251,-12901.51,78476.06
Southern Africa,Africa,85,11768.58,51319.5
North Africa,Africa,182,21643.08,86698.89
Eastern Africa,Africa,110,8013.04,44182.6
Central Africa,Africa,103,15606.3,61689.99
Western Asia,Asia Pacific,382,-16766.9,124312.24
Southern Asia,Asia Pacific,469,67998.76,351806.6
Southeastern Asia,Asia Pacific,533,20948.84,329751.38
Oceania,Asia Pacific,646,54734.02,408002.98
Eastern Asia,Asia Pacific,414,72805.1,315390.77


#### Column Indexing

In [228]:
sales["Sales"]

Region
Western Africa        78476.06
Southern Africa       51319.50
North Africa          86698.89
Eastern Africa        44182.60
Central Africa        61689.99
Western Asia         124312.24
Southern Asia        351806.60
Southeastern Asia    329751.38
Oceania              408002.98
Eastern Asia         315390.77
Central Asia           8190.74
Western Europe       656637.14
Southern Europe      215703.93
Northern Europe      252969.09
Eastern Europe       108258.93
South America        210710.49
Central America      461670.28
Caribbean            116333.05
Western US           251991.83
Southern US          148771.91
Eastern US           264973.98
Central US           170416.31
Canada                26298.81
Name: Sales, dtype: float64

In [229]:
sales.Sales

Region
Western Africa        78476.06
Southern Africa       51319.50
North Africa          86698.89
Eastern Africa        44182.60
Central Africa        61689.99
Western Asia         124312.24
Southern Asia        351806.60
Southeastern Asia    329751.38
Oceania              408002.98
Eastern Asia         315390.77
Central Asia           8190.74
Western Europe       656637.14
Southern Europe      215703.93
Northern Europe      252969.09
Eastern Europe       108258.93
South America        210710.49
Central America      461670.28
Caribbean            116333.05
Western US           251991.83
Southern US          148771.91
Eastern US           264973.98
Central US           170416.31
Canada                26298.81
Name: Sales, dtype: float64

In [230]:
type(sales["Sales"])

pandas.core.series.Series

In [231]:
# Display Sales and Profit Column together

sales[["Sales", "Profit"]]

Unnamed: 0_level_0,Sales,Profit
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Western Africa,78476.06,-12901.51
Southern Africa,51319.5,11768.58
North Africa,86698.89,21643.08
Eastern Africa,44182.6,8013.04
Central Africa,61689.99,15606.3
Western Asia,124312.24,-16766.9
Southern Asia,351806.6,67998.76
Southeastern Asia,329751.38,20948.84
Oceania,408002.98,54734.02
Eastern Asia,315390.77,72805.1


In [232]:
# Print out the columns 'month', 'day', 'temp', 'area' from the dataframe 'df'.

df = pd.read_csv('https://query.data.world/s/vBDCsoHCytUSLKkLvq851k2b8JOCkF')
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [233]:
df[['month', 'day', 'temp', 'area']].head()

Unnamed: 0,month,day,temp,area
0,mar,fri,8.2,0.0
1,oct,tue,18.0,0.0
2,oct,sat,14.6,0.0
3,mar,fri,8.3,0.0
4,mar,sun,11.4,0.0


In [234]:
sales.head(10)

Unnamed: 0_level_0,Market,No_of_Orders,Profit,Sales
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Western Africa,Africa,251,-12901.51,78476.06
Southern Africa,Africa,85,11768.58,51319.5
North Africa,Africa,182,21643.08,86698.89
Eastern Africa,Africa,110,8013.04,44182.6
Central Africa,Africa,103,15606.3,61689.99
Western Asia,Asia Pacific,382,-16766.9,124312.24
Southern Asia,Asia Pacific,469,67998.76,351806.6
Southeastern Asia,Asia Pacific,533,20948.84,329751.38
Oceania,Asia Pacific,646,54734.02,408002.98
Eastern Asia,Asia Pacific,414,72805.1,315390.77


#### Row Indexing

In [235]:
# loc accessor takes row index and column index
# display data at "Southern Asia" index

sales.loc["Southern Asia"]

Market          Asia Pacific
No_of_Orders             469
Profit              67998.76
Sales               351806.6
Name: Southern Asia, dtype: object

In [236]:
# display "Sales" data at "Southern Asia" index

sales.loc["Southern Asia", "Sales"]

351806.6

In [238]:
# iloc accessor takes row number and column number
sales.iloc[6]

Market          Asia Pacific
No_of_Orders             469
Profit              67998.76
Sales               351806.6
Name: Southern Asia, dtype: object

In [239]:
sales.iloc[6,3]

351806.6

#### Selection by Label: loc function

In [240]:
# label-based indexing
# loc method to extract rows and columns from a dataframe based on the  labels
# dataframe.loc[[list_of_row_labels], [list_of_column_labels]]

In [241]:
sales.loc["Southern Asia", "Sales"]

351806.6

#### Selection by integer location: iloc function

In [242]:
# position-based indexing
# row or column number instead of labels
# dataframe.iloc[rows, columns]

In [243]:
sales.iloc[6,-1]

351806.6

### Slicing

In [244]:
# All rows, Profit and Sales columns

sales.loc[: , ["Profit" , "Sales"]].head()

Unnamed: 0_level_0,Profit,Sales
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Western Africa,-12901.51,78476.06
Southern Africa,11768.58,51319.5
North Africa,21643.08,86698.89
Eastern Africa,8013.04,44182.6
Central Africa,15606.3,61689.99


In [245]:
sales.iloc[: , [2,3]].head()

Unnamed: 0_level_0,Profit,Sales
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Western Africa,-12901.51,78476.06
Southern Africa,11768.58,51319.5
North Africa,21643.08,86698.89
Eastern Africa,8013.04,44182.6
Central Africa,15606.3,61689.99


In [246]:
# Display data for Western Africa Southern Africa and North Africa

In [247]:
sales.loc[["Western Africa", "Southern Africa", "North Africa"]]

Unnamed: 0_level_0,Market,No_of_Orders,Profit,Sales
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Western Africa,Africa,251,-12901.51,78476.06
Southern Africa,Africa,85,11768.58,51319.5
North Africa,Africa,182,21643.08,86698.89


In [248]:
sales.iloc[0:3 , :]

Unnamed: 0_level_0,Market,No_of_Orders,Profit,Sales
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Western Africa,Africa,251,-12901.51,78476.06
Southern Africa,Africa,85,11768.58,51319.5
North Africa,Africa,182,21643.08,86698.89


In [249]:
# Display Sales and Profit data for Western Africa Southern Africa and North Africa

In [250]:
sales.loc[["Western Africa", "Southern Africa", "North Africa"] , ["Sales", "Profit"]]

Unnamed: 0_level_0,Sales,Profit
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Western Africa,78476.06,-12901.51
Southern Africa,51319.5,11768.58
North Africa,86698.89,21643.08


In [251]:
sales.iloc[0:3 , -2:]

Unnamed: 0_level_0,Profit,Sales
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Western Africa,-12901.51,78476.06
Southern Africa,11768.58,51319.5
North Africa,21643.08,86698.89


In [252]:
# Print only the even numbers of rows of the dataframe 'df'.

In [253]:
df = pd.read_csv('https://query.data.world/s/vBDCsoHCytUSLKkLvq851k2b8JOCkF')
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [254]:
df.iloc[2: :2 , ].head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0
6,8,6,aug,mon,92.3,88.9,495.6,8.5,24.1,27,3.1,0.0,0.0
8,8,6,sep,tue,91.0,129.5,692.6,7.0,13.1,63,5.4,0.0,0.0
10,7,5,sep,sat,92.5,88.0,698.6,7.1,17.8,51,7.2,0.0,0.0


### Subsetting rows based on conditions

- isin() : Similar to the membership operator in lists, this function can check if the given element "is in" the collection of elements provided. 
- isna() : It checks whether the given element is null/empty. 

In [255]:
sales.head()

Unnamed: 0_level_0,Market,No_of_Orders,Profit,Sales
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Western Africa,Africa,251,-12901.51,78476.06
Southern Africa,Africa,85,11768.58,51319.5
North Africa,Africa,182,21643.08,86698.89
Eastern Africa,Africa,110,8013.04,44182.6
Central Africa,Africa,103,15606.3,61689.99


In [256]:
# Display the LATAM and Eruopean countries with sales > 250000

sales[ (sales["Market"].isin(["LATAM", "Europe"]))  & (sales["Sales"] > 250000)]

Unnamed: 0_level_0,Market,No_of_Orders,Profit,Sales
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Western Europe,Europe,964,82091.27,656637.14
Northern Europe,Europe,367,43237.44,252969.09
Central America,LATAM,930,74679.54,461670.28


In [257]:
# Print all the columns and the rows where 'area' is greater than 0, 'wind' is greater than 1 and the 'temp' is greater than 15.

In [258]:
df = pd.read_csv('https://query.data.world/s/vBDCsoHCytUSLKkLvq851k2b8JOCkF')
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [259]:
df[ (df["area"] > 0) & (df["wind"] > 1) & (df["temp"] > 15)]. head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
138,9,9,jul,tue,85.8,48.3,313.4,3.9,18.0,42,2.7,0.0,0.36
139,1,4,sep,tue,91.0,129.5,692.6,7.0,21.7,38,2.2,0.0,0.43
140,2,5,sep,mon,90.9,126.5,686.5,7.0,21.9,39,1.8,0.0,0.47
141,1,2,aug,wed,95.5,99.9,513.3,13.2,23.3,31,4.5,0.0,0.55
142,8,6,aug,fri,90.1,108.0,529.8,12.5,21.2,51,8.9,0.0,0.61


### Function Application

- Table wise Function Application: pipe()
- Row or Column Wise Function Application: apply()
- Element wise Function Application: applymap()

https://www.tutorialspoint.com/python_pandas/python_pandas_function_application.htm

In [296]:
import numpy as np
df = pd.DataFrame(np.random.randn(5,3),columns=['col1','col2','col3'])
df.apply(np.mean)

col1    0.518931
col2    0.674032
col3   -0.145885
dtype: float64

In [297]:
df = pd.DataFrame(np.random.randn(5,3),columns=['col1','col2','col3'])
df.apply(np.mean,axis=1)

0    0.019079
1   -0.190633
2    0.697530
3   -0.170316
4   -0.575338
dtype: float64

In [298]:
temp = df.apply(lambda x: x.max() - x.min())

In [299]:
type(temp)

pandas.core.series.Series

In [263]:
df=pd.DataFrame(np.arange(0,20).reshape(5,4),index=['Row1','Row2','Row3','Row4','Row5'],columns=["Column1","Column2","Column3","Coumn4"])

In [264]:
df

Unnamed: 0,Column1,Column2,Column3,Coumn4
Row1,0,1,2,3
Row2,4,5,6,7
Row3,8,9,10,11
Row4,12,13,14,15
Row5,16,17,18,19


In [265]:
type(df.loc["Row1"])

pandas.core.series.Series

In [266]:
df.iloc[0:2 , 0:2]

Unnamed: 0,Column1,Column2
Row1,0,1
Row2,4,5


In [267]:
type(df.iloc[0:2 , 0:2])

pandas.core.frame.DataFrame

In [268]:
df.iloc[0:2 , 0]

Row1    0
Row2    4
Name: Column1, dtype: int32

In [269]:
type(df.iloc[0:2 , 0])

pandas.core.series.Series

You are provided with the dataset of a company which has offices across three cities - Mumbai, Bangalore and New Delhi. The dataset contains the rating (out of 5) of all the employees from different departments (Finance, HR, Marketing and Sales). 



The company has come up with a new policy that any individual with a rating equal to or below 3.5 needs to attend a training. Using dataframes, load the dataset and then derive the column ‘Training’ which shows ‘Yes’ for people who require training and ‘No’ for those who do not.

In [270]:
df = pd.read_csv('https://media-doselect.s3.amazonaws.com/generic/NMgEjwkAEGGQZBoNYGr9Ld7w0/rating.csv')
df.head()

Unnamed: 0,ID,Department,Office,Rating
0,U2F26,Finance,New Delhi,3.4
1,U2M61,Marketing,New Delhi,3.9
2,U1S15,Sales,New Delhi,2.8
3,U1H87,HR,Mumbai,2.1
4,U1S51,Sales,New Delhi,4.6


In [271]:
df["Training"] = df["Rating"].apply(lambda x: "Yes" if x<=3.5 else "No")
df.head()

Unnamed: 0,ID,Department,Office,Rating,Training
0,U2F26,Finance,New Delhi,3.4,Yes
1,U2M61,Marketing,New Delhi,3.9,No
2,U1S15,Sales,New Delhi,2.8,Yes
3,U1H87,HR,Mumbai,2.1,Yes
4,U1S51,Sales,New Delhi,4.6,No


In [272]:
df.Department.value_counts()

Marketing    138
Finance      134
HR           131
Sales        130
Name: Department, dtype: int64

In [273]:
# Most efficient team

for i in ['Finance', 'HR', 'Sales', 'Marketing']:
    print(i, len(df[(df['Training'] == 'No') & (df['Department'] == i)]) / len(df[df['Department'] == i]) * 100)

Finance 50.0
HR 57.25190839694656
Sales 49.23076923076923
Marketing 46.3768115942029


In [274]:
# Group the dataframe 'df' by 'month' and 'day' and find the mean value for column 'rain' and 'wind'.

df = pd.read_csv('https://query.data.world/s/vBDCsoHCytUSLKkLvq851k2b8JOCkF')
groupby_df = df.groupby(by =["month","day"]).mean()

In [275]:
groupby_df[["rain", "wind"]].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rain,wind
month,day,Unnamed: 2_level_1,Unnamed: 3_level_1
apr,fri,0.0,3.1
apr,mon,0.0,3.1
apr,sat,0.0,4.5
apr,sun,0.0,5.666667
apr,thu,0.0,5.8


In [276]:
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [277]:
# Apply Aggregation on a Single Column of a Dataframe

df['temp'].aggregate(np.sum)

9765.699999999999

In [278]:
# Apply Aggregation on Multiple Columns of a DataFrame

df[['temp', 'RH']].aggregate(np.sum)

temp     9765.7
RH      22897.0
dtype: float64

In [279]:
# Apply Multiple Functions on a Single Column of a DataFrame

df['temp'].aggregate([np.sum, np.mean])

sum     9765.700000
mean      18.889168
Name: temp, dtype: float64

In [280]:
# Apply Multiple Functions on Multiple Columns of a DataFrame

df[['temp', 'RH']].aggregate([np.sum, np.mean])

Unnamed: 0,temp,RH
sum,9765.7,22897.0
mean,18.889168,44.288201


In [281]:
# Apply Different Functions to Different Columns of a Dataframe

df.aggregate({"temp": np.sum,"RH": np.mean})

temp    9765.700000
RH        44.288201
dtype: float64

### GroupBy

In [301]:
ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
   'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
   'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
   'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
   'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
df = pd.DataFrame(ipl_data)

In [302]:
df.head()

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
2,Devils,2,2014,863
3,Devils,3,2015,673
4,Kings,3,2014,741


Split Data into Groups


Pandas object can be split into any of their objects. There are multiple ways to split an object like −

- obj.groupby('key')
- obj.groupby(['key1','key2'])
- obj.groupby(key,axis=1)

In [303]:
# View Groups
df.groupby('Team').groups

{'Devils': [2, 3], 'Kings': [4, 6, 7], 'Riders': [0, 1, 8, 11], 'Royals': [9, 10], 'kings': [5]}

In [304]:
df.groupby(['Team','Year']).groups

{('Devils', 2014): [2], ('Devils', 2015): [3], ('Kings', 2014): [4], ('Kings', 2016): [6], ('Kings', 2017): [7], ('Riders', 2014): [0], ('Riders', 2015): [1], ('Riders', 2016): [8], ('Riders', 2017): [11], ('Royals', 2014): [9], ('Royals', 2015): [10], ('kings', 2015): [5]}

In [305]:
# get_group() method -> select a single group

df.groupby('Year').get_group(2014)

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
2,Devils,2,2014,863
4,Kings,3,2014,741
9,Royals,4,2014,701


In [306]:
df.groupby('Year')['Points'].agg(np.mean)

Year
2014    795.25
2015    769.50
2016    725.00
2017    739.00
Name: Points, dtype: float64

In [307]:
df.groupby('Team')['Points'].agg([np.sum, np.mean, np.std])

Unnamed: 0_level_0,sum,mean,std
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,1536,768.0,134.350288
Kings,2285,761.666667,24.006943
Riders,3049,762.25,88.567771
Royals,1505,752.5,72.831998
kings,812,812.0,


In [309]:
df.head()

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
2,Devils,2,2014,863
3,Devils,3,2015,673
4,Kings,3,2014,741


In [308]:
df.groupby('Team').groups

{'Devils': [2, 3], 'Kings': [4, 6, 7], 'Riders': [0, 1, 8, 11], 'Royals': [9, 10], 'kings': [5]}

In [289]:
# return the teams which have participated three or more times in IPL

df.groupby('Team').filter(lambda x: len(x) >= 3)

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
4,Kings,3,2014,741
6,Kings,1,2016,756
7,Kings,1,2017,788
8,Riders,2,2016,694
11,Riders,2,2017,690


In [290]:
grouped = df.groupby('Team')
score = lambda x: (x - x.mean()) / x.std()*10
grouped.transform(score)

Unnamed: 0,Rank,Year,Points
0,-15.0,-11.61895,12.843272
1,5.0,-3.872983,3.020286
2,-7.071068,-7.071068,7.071068
3,7.071068,7.071068,-7.071068
4,11.547005,-10.910895,-8.608621
5,,,
6,-5.773503,2.182179,-2.360428
7,-5.773503,8.728716,10.969049
8,5.0,3.872983,-7.705963
9,7.071068,-7.071068,-7.071068


### Reindexing

Reindexing changes the row labels and column labels of a DataFrame. To reindex means to conform the data to match a given set of labels along a particular axis.

In [311]:
N=20

df = pd.DataFrame({
   'A': pd.date_range(start='2016-01-01',periods=N,freq='D'),
   'x': np.linspace(0,stop=N-1,num=N),
   'y': np.random.rand(N),
   'C': np.random.choice(['Low','Medium','High'],N).tolist(),
   'D': np.random.normal(100, 10, size=(N)).tolist()
})

In [312]:
df.head()

Unnamed: 0,A,x,y,C,D
0,2016-01-01,0.0,0.412513,High,95.720793
1,2016-01-02,1.0,0.959497,Low,117.481701
2,2016-01-03,2.0,0.578357,Low,103.980667
3,2016-01-04,3.0,0.666903,Medium,107.104719
4,2016-01-05,4.0,0.075866,High,97.281215


In [314]:
df_reindexed = df.reindex(index=[0,2,5], columns=['A', 'C', 'B'])
df_reindexed.head()

Unnamed: 0,A,C,B
0,2016-01-01,High,
2,2016-01-03,Low,
5,2016-01-06,High,


In [315]:
# the df1 DataFrame is altered and reindexed like df2. 
# The column names should be matched or else NAN will be added for the entire column label.

df1 = pd.DataFrame(np.random.randn(10,3),columns=['col1','col2','col3'])
df2 = pd.DataFrame(np.random.randn(7,3),columns=['col1','col2','col3'])

df1 = df1.reindex_like(df2)

In [316]:
df1

Unnamed: 0,col1,col2,col3
0,0.548546,0.276981,1.08461
1,0.510689,0.708925,0.797318
2,-0.214159,0.569153,1.076981
3,-0.651547,1.392989,-0.479085
4,-1.594416,-1.37682,-0.552338
5,0.530467,-1.671947,0.848765
6,0.620069,1.062799,-0.364201


In [318]:
df1 = pd.DataFrame(np.random.randn(6,3),columns=['col1','col2','col3'])
df2 = pd.DataFrame(np.random.randn(2,3),columns=['col1','col2','col3'])
df2

Unnamed: 0,col1,col2,col3
0,0.238447,-0.097355,-0.617779
1,-0.189583,-0.169457,0.298964


In [320]:
df1

Unnamed: 0,col1,col2,col3
0,0.857821,1.730052,1.164488
1,-1.302571,-0.536478,0.040074
2,1.43663,0.724463,-0.93022
3,2.107922,-1.214441,1.968762
4,-0.449474,-0.361504,0.760734
5,0.234362,0.973634,0.109009


In [319]:
df2.reindex_like(df1)

Unnamed: 0,col1,col2,col3
0,0.238447,-0.097355,-0.617779
1,-0.189583,-0.169457,0.298964
2,,,
3,,,
4,,,
5,,,


In [321]:
# Now Fill the NAN's with preceding Values
print ("Data Frame with Forward Fill:")
df2.reindex_like(df1,method='ffill')

Data Frame with Forward Fill:


Unnamed: 0,col1,col2,col3
0,0.238447,-0.097355,-0.617779
1,-0.189583,-0.169457,0.298964
2,-0.189583,-0.169457,0.298964
3,-0.189583,-0.169457,0.298964
4,-0.189583,-0.169457,0.298964
5,-0.189583,-0.169457,0.298964


In [322]:
# Limit specifies the maximum count of consecutive matches. 
# Now Fill the NAN's with preceding Values
# only the 2nd row is filled by the preceding 1st row. Then, the rows are left as they are.

print ("Data Frame with Forward Fill limiting to 1:")
df2.reindex_like(df1,method='ffill',limit=1)

Data Frame with Forward Fill limiting to 1:


Unnamed: 0,col1,col2,col3
0,0.238447,-0.097355,-0.617779
1,-0.189583,-0.169457,0.298964
2,-0.189583,-0.169457,0.298964
3,,,
4,,,
5,,,


### Renaming

In [329]:
df1 = pd.DataFrame(np.random.randn(6,3),columns=['col1','col2','col3'])
df1

Unnamed: 0,col1,col2,col3
0,-1.081591,0.114238,-0.632456
1,0.360762,0.065571,-2.074396
2,0.074566,0.081261,-0.273432
3,0.618863,0.140536,0.66669
4,-0.21295,1.164272,-1.063245
5,1.249912,-1.129504,-0.89848


In [331]:
df1.rename(columns={'col1' : 'c1', 'col2' : 'c2'},index = {0 : 'apple', 1 : 'banana', 2 : 'durian'}, inplace=True)

In [332]:
df1

Unnamed: 0,c1,c2,col3
apple,-1.081591,0.114238,-0.632456
banana,0.360762,0.065571,-2.074396
durian,0.074566,0.081261,-0.273432
3,0.618863,0.140536,0.66669
4,-0.21295,1.164272,-1.063245
5,1.249912,-1.129504,-0.89848


### Sorting

In [335]:
unsorted_df=pd.DataFrame(np.random.randn(10,2),index=[1,4,6,2,3,5,9,8,0,7],columns=['col2','col1'])

In [336]:
unsorted_df

Unnamed: 0,col2,col1
1,1.061745,-0.344866
4,-0.847985,-0.235025
6,0.229354,0.809589
2,-1.21128,-0.887225
3,-0.240628,-0.255975
5,-1.620696,0.223087
9,0.876784,-1.654611
8,-0.348569,-0.079838
0,0.770913,-0.72705
7,-0.665182,0.087365


#### By label

Using the sort_index() method, by passing the axis arguments and the order of sorting, DataFrame can be sorted. By default, sorting is done on row labels in ascending order.

In [337]:
sorted_df=unsorted_df.sort_index()
sorted_df

Unnamed: 0,col2,col1
0,0.770913,-0.72705
1,1.061745,-0.344866
2,-1.21128,-0.887225
3,-0.240628,-0.255975
4,-0.847985,-0.235025
5,-1.620696,0.223087
6,0.229354,0.809589
7,-0.665182,0.087365
8,-0.348569,-0.079838
9,0.876784,-1.654611


In [338]:
sorted_df = unsorted_df.sort_index(ascending=False)
sorted_df

Unnamed: 0,col2,col1
9,0.876784,-1.654611
8,-0.348569,-0.079838
7,-0.665182,0.087365
6,0.229354,0.809589
5,-1.620696,0.223087
4,-0.847985,-0.235025
3,-0.240628,-0.255975
2,-1.21128,-0.887225
1,1.061745,-0.344866
0,0.770913,-0.72705


By passing the axis argument with a value 0 or 1, the sorting can be done on the column labels. By default, axis=0, sort by row.

In [342]:
unsorted_df = pd.DataFrame(np.random.randn(10,2),index=[1,4,6,2,3,5,9,8,0,7],columns = ['col2','col1'])
sorted_df=unsorted_df.sort_index(axis=0)
sorted_df

Unnamed: 0,col2,col1
0,-0.195912,-0.004858
1,-1.217805,0.540311
2,0.258384,1.123267
3,-1.504843,1.86121
4,-1.310063,-0.443032
5,1.111717,0.301951
6,0.36021,-0.071769
7,-0.130248,-0.561206
8,-1.148398,1.030422
9,-0.061878,-1.026474


#### By Value

'by' argument takes a list of column values

In [343]:
unsorted_df = pd.DataFrame({'col1':[2,1,1,1],'col2':[1,3,2,4]})
sorted_df = unsorted_df.sort_values(by='col1')
sorted_df

Unnamed: 0,col1,col2
1,1,3
2,1,2
3,1,4
0,2,1


In [344]:
unsorted_df = pd.DataFrame({'col1':[2,1,1,1],'col2':[1,3,2,4]})
sorted_df = unsorted_df.sort_values(by=['col1','col2'])
sorted_df

Unnamed: 0,col1,col2
2,1,2
1,1,3
3,1,4
0,2,1


#### Sorting Algorithm

sort_values() provides a provision to choose the algorithm from mergesort, heapsort and quicksort. <b>Mergesort</b> is the only stable algorithm.

In [346]:
unsorted_df = pd.DataFrame({'col1':[2,1,1,1],'col2':[1,3,2,4]})
sorted_df = unsorted_df.sort_values(by='col1' ,kind='mergesort')
sorted_df

Unnamed: 0,col1,col2
1,1,3
2,1,2
3,1,4
0,2,1


### Missing Data

NaN means Not a Number.

In [347]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f',
'h'],columns=['one', 'two', 'three'])

df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df

Unnamed: 0,one,two,three
a,-1.598605,-0.453441,-0.868623
b,,,
c,-0.89689,-2.445449,-0.581742
d,,,
e,-1.135431,-0.619777,1.469648
f,-0.619504,-0.120285,-0.428429
g,,,
h,0.089201,0.730162,1.539979


In [348]:
df['one'].isnull()

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [349]:
df['one'].isnull().sum()

3

In [350]:
df['one'].notnull()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool

#### Calculations with Missing Data
- When summing data, NA will be treated as Zero
- If the data are all NA, then the result will be 0

In [353]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df

Unnamed: 0,one,two,three
a,1.342219,-0.313041,-1.421305
b,,,
c,0.170004,-0.229328,-0.166713
d,,,
e,0.86313,-1.384935,-0.003466
f,1.522088,-0.886376,-1.243997
g,,,
h,-1.240876,1.642807,0.256493


In [354]:
df['one'].sum()

2.6565651034260913

In [355]:
df = pd.DataFrame(index=[0,1,2,3,4,5],columns=['one','two'])
df

Unnamed: 0,one,two
0,,
1,,
2,,
3,,
4,,
5,,


In [356]:
df['one'].sum()

0

#### Cleaning / Filling Missing Data

In [387]:
# Replace NaN with a Scalar Value

df = pd.DataFrame(np.random.randn(3, 3), index=['a', 'c', 'e'],columns=['one','two', 'three'])
df = df.reindex(['a', 'b', 'c'])
df

Unnamed: 0,one,two,three
a,-0.363998,-0.384976,0.21056
b,,,
c,-0.281227,-1.680371,-1.648155


In [388]:
# dealing with empty cells -> insert a new value instead
# The fillna() method allows us to replace empty cells with a value
df.fillna(0)

Unnamed: 0,one,two,three
a,-0.363998,-0.384976,0.21056
b,0.0,0.0,0.0
c,-0.281227,-1.680371,-1.648155


In [389]:
df["one"].fillna(130, inplace = True)
df

Unnamed: 0,one,two,three
a,-0.363998,-0.384976,0.21056
b,130.0,,
c,-0.281227,-1.680371,-1.648155


In [390]:
df = pd.DataFrame(np.random.randn(3, 3), index=['a', 'c', 'e'],columns=['one','two', 'three'])
df = df.reindex(['a', 'b', 'c'])
df

Unnamed: 0,one,two,three
a,0.244682,-0.591482,1.674011
b,,,
c,-1.568754,-0.518786,0.147924


In [393]:
# Replace Using Mean, Median, or Mode

x= df["one"].mean()
df["one"].fillna(x, inplace = True)
df

Unnamed: 0,one,two,three
a,0.244682,-0.591482,1.674011
b,-0.662036,,
c,-1.568754,-0.518786,0.147924


In [395]:
x= df["two"].median()
df["two"].fillna(x, inplace = True)
df

Unnamed: 0,one,two,three
a,0.244682,-0.591482,1.674011
b,-0.662036,-0.555134,
c,-1.568754,-0.518786,0.147924


In [398]:
x= df["three"].mode()[0]
df["three"].fillna(x, inplace = True)
df

Unnamed: 0,one,two,three
a,0.244682,-0.591482,1.674011
b,-0.662036,-0.555134,0.147924
c,-1.568754,-0.518786,0.147924


In [400]:
df["three"].mode()

0    0.147924
dtype: float64

In [401]:
df["three"].mode()[0]

0.14792387626802458

In [359]:
# Drop Missing Values

df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df

Unnamed: 0,one,two,three
a,0.821219,0.166081,0.949285
b,,,
c,-1.388269,-0.309105,2.100169
d,,,
e,-1.099827,1.085305,-1.004876
f,1.425172,-0.507467,-0.48381
g,,,
h,-2.408526,-1.054944,-1.739009


In [360]:
# By default, the dropna() method returns a new DataFrame, and will not change the original.
# If you want to change the original DataFrame, use the inplace = True argument
df.dropna()

Unnamed: 0,one,two,three
a,0.821219,0.166081,0.949285
c,-1.388269,-0.309105,2.100169
e,-1.099827,1.085305,-1.004876
f,1.425172,-0.507467,-0.48381
h,-2.408526,-1.054944,-1.739009


In [361]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],columns=['one', 'two', 'three'])
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df

Unnamed: 0,one,two,three
a,0.22008,0.346756,0.310301
b,,,
c,0.269169,0.54551,-0.199716
d,,,
e,1.317899,-0.027829,-0.322301
f,-1.503127,-0.425026,-0.872601
g,,,
h,-0.848553,0.710432,-0.348109


In [362]:
df.dropna(axis=1)

a
b
c
d
e
f
g
h


In [411]:
# Replace Missing (or) Generic Values

df = pd.DataFrame({'one':[10,10,30,40,50,2000], 'two':[1000,1000,30,40,50,60]})
df

Unnamed: 0,one,two
0,10,1000
1,10,1000
2,30,30
3,40,40
4,50,50
5,2000,60


In [412]:
df.replace({1000:10,2000:60})

Unnamed: 0,one,two
0,10,10
1,10,10
2,30,30
3,40,40
4,50,50
5,60,60


#### Removing Duplicates

The duplicated() method returns a Boolean values for each row. Returns True for every row that is a duplicate, othwerwise False

In [413]:
df.duplicated()

0    False
1     True
2    False
3    False
4    False
5    False
dtype: bool

In [414]:
df.drop_duplicates(inplace = True)
df

Unnamed: 0,one,two
0,10,1000
2,30,30
3,40,40
4,50,50
5,2000,60


###  Merging DataFrames

Merge and append are two of the most common operations that are performed in data analysis.

pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,left_index=False, right_index=False, sort=True)

- left − A DataFrame object.
- right − Another DataFrame object.
- on − Columns (names) to join on. Must be found in both the left and right DataFrame objects.
- left_on − Columns from the left DataFrame to use as keys. Can either be column names or arrays with length equal to the length of the DataFrame.
- right_on − Columns from the right DataFrame to use as keys. Can either be column names or arrays with length equal to the length of the DataFrame.
- left_index − If True, use the index (row labels) from the left DataFrame as its join key(s). In case of a DataFrame with a MultiIndex (hierarchical), the number of levels must match the number of join keys from the right DataFrame.
- right_index − Same usage as left_index for the right DataFrame.
- how − One of 'left', 'right', 'outer', 'inner'. Defaults to inner. Each method has been described below.
- sort − Sort the result DataFrame by the join keys in lexicographical order. Defaults to True, setting to False will improve the performance substantially in many cases.

In [366]:
left = pd.DataFrame({
   'id':[1,2,3,4,5],
   'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
   'subject_id':['sub1','sub2','sub4','sub6','sub5']})
right = pd.DataFrame(
   {'id':[1,2,3,4,5],
   'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
   'subject_id':['sub2','sub4','sub3','sub6','sub5']})

In [367]:
left

Unnamed: 0,id,Name,subject_id
0,1,Alex,sub1
1,2,Amy,sub2
2,3,Allen,sub4
3,4,Alice,sub6
4,5,Ayoung,sub5


In [368]:
right

Unnamed: 0,id,Name,subject_id
0,1,Billy,sub2
1,2,Brian,sub4
2,3,Bran,sub3
3,4,Bryce,sub6
4,5,Betty,sub5


In [369]:
pd.merge(left,right,on='id')

Unnamed: 0,id,Name_x,subject_id_x,Name_y,subject_id_y
0,1,Alex,sub1,Billy,sub2
1,2,Amy,sub2,Brian,sub4
2,3,Allen,sub4,Bran,sub3
3,4,Alice,sub6,Bryce,sub6
4,5,Ayoung,sub5,Betty,sub5


In [370]:
pd.merge(left,right,on=['id','subject_id'])

Unnamed: 0,id,Name_x,subject_id,Name_y
0,4,Alice,sub6,Bryce
1,5,Ayoung,sub5,Betty


- left  : LEFT OUTER JOIN  : Use keys from left object
- right : RIGHT OUTER JOIN : Use keys from right object
- outer : FULL OUTER JOIN  : Use union of keys
- inner : INNER JOIN       : Use intersection of keys

In [371]:
pd.merge(left, right, on='subject_id', how='left')

Unnamed: 0,id_x,Name_x,subject_id,id_y,Name_y
0,1,Alex,sub1,,
1,2,Amy,sub2,1.0,Billy
2,3,Allen,sub4,2.0,Brian
3,4,Alice,sub6,4.0,Bryce
4,5,Ayoung,sub5,5.0,Betty


In [372]:
pd.merge(left, right, on='subject_id', how='right')

Unnamed: 0,id_x,Name_x,subject_id,id_y,Name_y
0,2.0,Amy,sub2,1,Billy
1,3.0,Allen,sub4,2,Brian
2,,,sub3,3,Bran
3,4.0,Alice,sub6,4,Bryce
4,5.0,Ayoung,sub5,5,Betty


In [373]:
pd.merge(left, right, how='outer', on='subject_id')

Unnamed: 0,id_x,Name_x,subject_id,id_y,Name_y
0,1.0,Alex,sub1,,
1,2.0,Amy,sub2,1.0,Billy
2,3.0,Allen,sub4,2.0,Brian
3,4.0,Alice,sub6,4.0,Bryce
4,5.0,Ayoung,sub5,5.0,Betty
5,,,sub3,3.0,Bran


In [374]:
pd.merge(left, right, on='subject_id', how='inner')

Unnamed: 0,id_x,Name_x,subject_id,id_y,Name_y
0,2,Amy,sub2,1,Billy
1,3,Allen,sub4,2,Brian
2,4,Alice,sub6,4,Bryce
3,5,Ayoung,sub5,5,Betty


In [375]:
one = pd.DataFrame({
   'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
   'subject_id':['sub1','sub2','sub4','sub6','sub5'],
   'Marks_scored':[98,90,87,69,78]},
   index=[1,2,3,4,5])

two = pd.DataFrame({
   'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
   'subject_id':['sub2','sub4','sub3','sub6','sub5'],
   'Marks_scored':[89,80,79,97,88]},
   index=[1,2,3,4,5])

In [376]:
one

Unnamed: 0,Name,subject_id,Marks_scored
1,Alex,sub1,98
2,Amy,sub2,90
3,Allen,sub4,87
4,Alice,sub6,69
5,Ayoung,sub5,78


In [377]:
two

Unnamed: 0,Name,subject_id,Marks_scored
1,Billy,sub2,89
2,Brian,sub4,80
3,Bran,sub3,79
4,Bryce,sub6,97
5,Betty,sub5,88


In [378]:
pd.concat([one,two])

Unnamed: 0,Name,subject_id,Marks_scored
1,Alex,sub1,98
2,Amy,sub2,90
3,Allen,sub4,87
4,Alice,sub6,69
5,Ayoung,sub5,78
1,Billy,sub2,89
2,Brian,sub4,80
3,Bran,sub3,79
4,Bryce,sub6,97
5,Betty,sub5,88


In [379]:
pd.concat([one,two],keys=['x','y'])

Unnamed: 0,Unnamed: 1,Name,subject_id,Marks_scored
x,1,Alex,sub1,98
x,2,Amy,sub2,90
x,3,Allen,sub4,87
x,4,Alice,sub6,69
x,5,Ayoung,sub5,78
y,1,Billy,sub2,89
y,2,Brian,sub4,80
y,3,Bran,sub3,79
y,4,Bryce,sub6,97
y,5,Betty,sub5,88


In [380]:
pd.concat([one,two],keys=['x','y'],ignore_index=True)

Unnamed: 0,Name,subject_id,Marks_scored
0,Alex,sub1,98
1,Amy,sub2,90
2,Allen,sub4,87
3,Alice,sub6,69
4,Ayoung,sub5,78
5,Billy,sub2,89
6,Brian,sub4,80
7,Bran,sub3,79
8,Bryce,sub6,97
9,Betty,sub5,88


In [381]:
pd.concat([one,two],axis=1)

Unnamed: 0,Name,subject_id,Marks_scored,Name.1,subject_id.1,Marks_scored.1
1,Alex,sub1,98,Billy,sub2,89
2,Amy,sub2,90,Brian,sub4,80
3,Allen,sub4,87,Bran,sub3,79
4,Alice,sub6,69,Bryce,sub6,97
5,Ayoung,sub5,78,Betty,sub5,88


In [384]:
pd.concat([one,two],axis=0)

Unnamed: 0,Name,subject_id,Marks_scored
1,Alex,sub1,98
2,Amy,sub2,90
3,Allen,sub4,87
4,Alice,sub6,69
5,Ayoung,sub5,78
1,Billy,sub2,89
2,Brian,sub4,80
3,Bran,sub3,79
4,Bryce,sub6,97
5,Betty,sub5,88


In [382]:
one.append(two)

Unnamed: 0,Name,subject_id,Marks_scored
1,Alex,sub1,98
2,Amy,sub2,90
3,Allen,sub4,87
4,Alice,sub6,69
5,Ayoung,sub5,78
1,Billy,sub2,89
2,Brian,sub4,80
3,Bran,sub3,79
4,Bryce,sub6,97
5,Betty,sub5,88


In [383]:
one.append([two,one,two])

Unnamed: 0,Name,subject_id,Marks_scored
1,Alex,sub1,98
2,Amy,sub2,90
3,Allen,sub4,87
4,Alice,sub6,69
5,Ayoung,sub5,78
1,Billy,sub2,89
2,Brian,sub4,80
3,Bran,sub3,79
4,Bryce,sub6,97
5,Betty,sub5,88
