# Introduction to Pandas Library Part-1

## 1. Basic Concepts

#### Pandas is a column-oriented data analysis API. It's a great tool for handling and analyzing input data, and many ML frameworks support pandas data structures as inputs.
#### The primary data structures in pandas are:
#### - DataFrame, which is nothing but a relational table consisting of rows and columns
#### - Series, which is a single column. So, a DataFrame is basically made up of many series

In [32]:
import pandas as pd
pd.__version__  # last expression in the cell, so the notebook echoes the installed pandas version

'0.21.0'

In [33]:
# Build a Series (one labelled column of values) from a plain Python list.
names = pd.Series(data=['Sam', 'Bill', 'John'])
names

0     Sam
1    Bill
2    John
dtype: object

In [94]:
# Assemble a DataFrame from a dict that maps each column label to a Series.
age = pd.Series(data=[12, 26, 40])
df = pd.DataFrame(data={'Name': names, 'Age': age})
df

Unnamed: 0,Age,Name
0,12,Sam
1,26,Bill
2,40,John


In [93]:
# Load the California housing training set straight from its public URL.
california_df = pd.read_csv(
    "https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv",
    sep=",",
)
california_df.head()  # with no argument, head() shows the first 5 records
# california_df.head(20) would show the first 20 records instead

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [73]:
california_df.tail()      # shows the bottom records (5 when no count is given)
california_df.tail(n=10)  # explicitly request the last 10 records; only this final result is echoed

Unnamed: 0_level_0,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
41.73,-124.22,28.0,3003.0,699.0,1530.0,653.0,1.7038,78300.0
41.75,-124.23,11.0,3159.0,616.0,1343.0,479.0,2.4805,100000.0
40.81,-124.23,52.0,1112.0,209.0,544.0,172.0,3.3462,50800.0
40.54,-124.23,52.0,2694.0,453.0,1152.0,435.0,3.0806,100000.0
40.28,-124.25,32.0,1430.0,419.0,434.0,187.0,1.9417,76100.0
40.58,-124.26,52.0,2217.0,394.0,907.0,369.0,2.3571,111400.0
40.69,-124.27,36.0,2349.0,528.0,1194.0,465.0,2.5179,100000.0
41.84,-124.3,17.0,2677.0,531.0,1244.0,456.0,3.0313,100000.0
41.8,-124.3,19.0,2672.0,552.0,1298.0,478.0,1.9797,85800.0
40.54,-124.35,52.0,1820.0,300.0,806.0,270.0,3.0147,94600.0


In [75]:
# List the data type of every column (the result is a Series keyed by column name).
california_df.dtypes

longitude             float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
dtype: object

In [76]:
# Get the column names; the result is a pandas Index object.
california_df.columns 

Index(['longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms',
       'population', 'households', 'median_income', 'median_house_value'],
      dtype='object')

In [80]:
# Transpose the data: rows become columns and columns become rows.
# transpose() is the spelled-out form of the .T shorthand.
california_df.transpose()

latitude,34.19,34.4,33.69,33.64,33.57,33.63,33.61,34.83,33.61.1,34.83.1,...,41.73,41.75,40.81,40.54,40.28,40.58,40.69,41.84,41.8,40.54.1
longitude,-114.31,-114.47,-114.56,-114.57,-114.57,-114.58,-114.58,-114.59,-114.59,-114.6,...,-124.22,-124.23,-124.23,-124.23,-124.25,-124.26,-124.27,-124.3,-124.3,-124.35
housing_median_age,15.0,19.0,17.0,14.0,20.0,29.0,25.0,41.0,34.0,46.0,...,28.0,11.0,52.0,52.0,32.0,52.0,36.0,17.0,19.0,52.0
total_rooms,5612.0,7650.0,720.0,1501.0,1454.0,1387.0,2907.0,812.0,4789.0,1497.0,...,3003.0,3159.0,1112.0,2694.0,1430.0,2217.0,2349.0,2677.0,2672.0,1820.0
total_bedrooms,1283.0,1901.0,174.0,337.0,326.0,236.0,680.0,168.0,1175.0,309.0,...,699.0,616.0,209.0,453.0,419.0,394.0,528.0,531.0,552.0,300.0
population,1015.0,1129.0,333.0,515.0,624.0,671.0,1841.0,375.0,3134.0,787.0,...,1530.0,1343.0,544.0,1152.0,434.0,907.0,1194.0,1244.0,1298.0,806.0
households,472.0,463.0,117.0,226.0,262.0,239.0,633.0,158.0,1056.0,271.0,...,653.0,479.0,172.0,435.0,187.0,369.0,465.0,456.0,478.0,270.0
median_income,1.4936,1.82,1.6509,3.1917,1.925,3.3438,2.6768,1.7083,2.1782,2.1908,...,1.7038,2.4805,3.3462,3.0806,1.9417,2.3571,2.5179,3.0313,1.9797,3.0147
median_house_value,66900.0,80100.0,85700.0,73400.0,65500.0,74000.0,100000.0,48500.0,100000.0,48100.0,...,78300.0,100000.0,50800.0,100000.0,76100.0,111400.0,100000.0,100000.0,85800.0,94600.0


In [83]:
# Order the rows by a single column, smallest values first, then preview the top of the result.
california_df.sort_values('median_house_value', ascending=True).head()

Unnamed: 0_level_0,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
34.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
california_df.describe()  # per-column summary statistics (count, mean, std, min, quartiles, max); by default only numeric columns are shown

# Use california_df.describe(include='all') to describe columns of every type

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


## 2. Accessing/ Indexing Data

In [37]:
# Select a single column with bracket notation; the result is a Series.
california_df['latitude']
# california_df.latitude is an alternate (attribute-style) way to access a single column
california_df['latitude'][:5]  # a Series can be sliced much like a Python list; here, the first five values

0    34.19
1    34.40
2    33.69
3    33.64
4    33.57
Name: latitude, dtype: float64

In [38]:
california_df[['latitude', 'longitude']]  # pass a list of column names to select more than one column
california_df[['latitude', 'longitude']].head(10)  # same selection, limited to the first 10 rows

Unnamed: 0,latitude,longitude
0,34.19,-114.31
1,34.4,-114.47
2,33.69,-114.56
3,33.64,-114.57
4,33.57,-114.57
5,33.63,-114.58
6,33.61,-114.58
7,34.83,-114.59
8,33.61,-114.59
9,34.83,-114.6


### loc Method

In [39]:
''' Using loc for selecting rows by labels/index
we need to set an index column to access the rows if not set'''
# Make 'latitude' the row index so rows can be looked up by their latitude label.
california_df.set_index('latitude', inplace=True)  # inplace=True modifies california_df itself instead of returning a new DataFrame
california_df.head(5)  # rows are now labelled by latitude, which is no longer a regular column

Unnamed: 0_level_0,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
34.19,-114.31,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
34.4,-114.47,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
33.69,-114.56,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
33.64,-114.57,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
33.57,-114.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [43]:
# Latitude labels are not unique, so a single label can match several rows.
california_df.loc[34.19].head()  # all rows whose index label (latitude) is 34.19; head() previews the first five

Unnamed: 0_level_0,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
34.19,-114.31,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
34.19,-116.88,38.0,898.0,259.0,106.0,52.0,1.6875,225000.0
34.19,-117.32,6.0,1068.0,182.0,999.0,188.0,4.7222,109000.0
34.19,-118.11,50.0,1430.0,186.0,620.0,201.0,9.532,483300.0
34.19,-118.12,52.0,2405.0,299.0,970.0,319.0,8.7835,444100.0


In [44]:
# Locate rows for several latitude values by passing a list of labels; returns a DataFrame.
# head() keeps only the first five rows, which is why the preview shows just the 34.19 matches.
california_df.loc[[34.19, 34.40]].head() 

Unnamed: 0_level_0,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
34.19,-114.31,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
34.19,-116.88,38.0,898.0,259.0,106.0,52.0,1.6875,225000.0
34.19,-117.32,6.0,1068.0,182.0,999.0,188.0,4.7222,109000.0
34.19,-118.11,50.0,1430.0,186.0,620.0,201.0,9.532,483300.0
34.19,-118.12,52.0,2405.0,299.0,970.0,319.0,8.7835,444100.0


In [47]:
# Select specific columns for specific index values: loc[row_labels, column_labels]
california_df.loc[[34.19, 34.40], ['longitude','housing_median_age','total_rooms']].head()

Unnamed: 0_level_0,longitude,housing_median_age,total_rooms
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34.19,-114.31,15.0,5612.0
34.19,-116.88,38.0,898.0
34.19,-117.32,6.0,1068.0
34.19,-118.11,50.0,1430.0
34.19,-118.12,52.0,2405.0


In [None]:
# Select every column from 'longitude' through 'total_rooms' for specific index values.
# A label slice in loc includes both endpoints, unlike a positional Python slice.
california_df.loc[[34.19, 34.40], 'longitude':'total_rooms'].head()

### Conditional Selection with loc (the iloc Method follows below)

In [53]:
'''Using boolean logic / conditional statements for selecting rows using loc'''
# california_df.columns
california_df.loc[california_df.population > 500].head()  # rows where population > 500, all columns
# Same row filter, restricted to the columns from 'longitude' through 'households' (both ends included)
california_df.loc[california_df.population > 500, 'longitude':'households'].head()

Unnamed: 0_level_0,longitude,housing_median_age,total_rooms,total_bedrooms,population,households
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
34.19,-114.31,15.0,5612.0,1283.0,1015.0,472.0
34.4,-114.47,19.0,7650.0,1901.0,1129.0,463.0
33.64,-114.57,14.0,1501.0,337.0,515.0,226.0
33.57,-114.57,20.0,1454.0,326.0,624.0,262.0
33.63,-114.58,29.0,1387.0,236.0,671.0,239.0


In [58]:
# Pass a callable to loc: it receives the DataFrame and returns the boolean row mask.
california_df.loc[lambda frame: frame['population'] > 100].head()

Unnamed: 0_level_0,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
34.19,-114.31,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
34.4,-114.47,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
33.69,-114.56,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
33.64,-114.57,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
33.57,-114.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [59]:
# Select rows whose population lies within a range of values (both bounds included).
california_df.loc[(california_df['population'] >= 500) & (california_df['population'] <= 1000)].head()

# Equivalent shorthand: Series.between() includes both endpoints by default,
# which is why the comparison above uses >= and <= rather than > and <.
california_df.loc[california_df['population'].between(500, 1000)].head()

Unnamed: 0_level_0,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
33.64,-114.57,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
33.57,-114.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0
33.63,-114.58,29.0,1387.0,236.0,671.0,239.0,3.3438,74000.0
34.83,-114.6,46.0,1497.0,309.0,787.0,271.0,2.1908,48100.0
34.84,-114.61,48.0,1291.0,248.0,580.0,211.0,2.1571,48600.0


In [60]:
# Setting values using loc.
# NOTE(review): no column selector is given, so 0 is assigned to EVERY column of the
# matching rows, not just median_income — confirm that overwriting whole rows is intended.
california_df.loc[california_df['median_income'].between(0,1)] = 0

In [70]:
# Conditional assignment: for rows matching both conditions, overwrite only the
# 'median_house_value' column and leave every other column untouched.
california_df.loc[
    (california_df['median_income'] > 2) & (california_df['population'] > 1000),
    'median_house_value'
] = 100000

In [62]:
'''Using iloc for indexing/selecting records using pandas. iloc uses integer-location based indexing unlike loc which uses
label based indexing'''

california_df.iloc[0].head()  # first row (position 0); a single row comes back as a Series, head() shows its first five entries
california_df.iloc[1].head()  # second row (position 1)
california_df.iloc[-1].head() # last row (negative positions count from the end); only this final expression is echoed

longitude             -124.35
housing_median_age      52.00
total_rooms           1820.00
total_bedrooms         300.00
population             806.00
Name: 40.54, dtype: float64

In [64]:
# selecting columns by position; each result is a Series
california_df.iloc[:,0].head() # first column of the data frame
california_df.iloc[:,1].head() # second column of the data frame
california_df.iloc[:,-1].head() # last column of the data frame (only this final expression is echoed)

latitude
34.19    66900.0
34.40    80100.0
33.69    85700.0
33.64    73400.0
33.57    65500.0
Name: median_house_value, dtype: float64

In [65]:
# selecting rows by positional slice (rows and columns can also be sliced together, as the following cells show)
california_df.iloc[0:5].head() # first five rows of the dataframe (the stop position 5 is excluded)

Unnamed: 0_level_0,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
34.19,-114.31,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
34.4,-114.47,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
33.69,-114.56,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
33.64,-114.57,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
33.57,-114.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [66]:
# All rows, first two columns: an omitted slice start defaults to position 0.
california_df.iloc[:, :2].head()

Unnamed: 0_level_0,longitude,housing_median_age
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1
34.19,-114.31,15.0
34.4,-114.47,19.0
33.69,-114.56,17.0
33.64,-114.57,14.0
33.57,-114.57,20.0


In [68]:
# Pick rows and columns by lists of integer positions (0-based):
# 1st, 4th, 7th, 25th rows + 1st, 6th, 7th columns.
california_df.iloc[[0,3,6,24], [0,5,6]].head()

Unnamed: 0_level_0,longitude,households,median_income
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34.19,-114.31,472.0,1.4936
33.64,-114.57,226.0,3.1917
33.61,-114.58,633.0,2.6768
33.54,-115.22,283.0,1.625


In [84]:
## first 5 rows and the 6th, 7th, 8th columns of the data frame (0-based positions 5, 6, 7; the slice stop is excluded)
california_df.iloc[0:5, 5:8]

Unnamed: 0_level_0,households,median_income,median_house_value
latitude,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34.19,472.0,1.4936,66900.0
34.4,463.0,1.82,80100.0
33.69,117.0,1.6509,85700.0
33.64,226.0,3.1917,73400.0
33.57,262.0,1.925,65500.0


### The image below summarizes the syntax for the loc and iloc methods

<img src="Pandas-selections-and-indexing.png">

#### For more examples you can check out the following links
https://pandas.pydata.org/pandas-docs/stable/10min.html                                                                          
https://pandas.pydata.org/pandas-docs/stable/cookbook.html