# Introduction to Pandas

In [None]:
import pandas as pd

In [None]:
import numpy as np

## Fundamental pandas data structures

### `Series` objects in pandas

In [None]:
# Build a Series from a plain list; no index is given, so pandas supplies one.
series_example = pd.Series(data=[-0.5, 0.75, 1.0, -2])
series_example

In [None]:
# The underlying data of the Series, without its index.
series_example.values

In [None]:
# The index object that labels the entries of the Series.
series_example.index

In [None]:
# Look up a single entry with square-bracket notation.
series_example[1]

In [None]:
# Slice out a sub-Series with Python slice notation.
series_example[1:3]

### Explicit Indices

In [None]:
# Attach explicit string labels 'a'..'d' as the index instead of the default one.
series_example2 = pd.Series(data=[-0.5, 0.75, 1.0, -2], index=list('abcd'))
series_example2

In [None]:
# Look up a single entry by its string label.
series_example2['b']

### Exercise:

In [None]:
# Do explicit Series indices work *exactly* the way you might expect?
# Try slicing series_example2 using its explicit index and find out.


### Series vs Dictionary

**Think, Pair, Share** 

In [None]:
# Populations keyed by country name, held in an ordinary Python dictionary.
population_dict = dict(France=65429495, Germany=82408706,
                       Russia=143910127, Japan=126922333)
population_dict

In [None]:
# Convert the dictionary to a Series: the keys become the index, the values the data.
population = pd.Series(population_dict)
population

### Interacting with Series

In [None]:
# Dictionary-style lookup by label.
population['Russia']

### Exercise

In [None]:
# Try slicing on the population Series on your own.
# Would slicing be possible if Series keys were not ordered?
# One possible slice, using index labels as the endpoints:
population['Germany':'Russia']

In [None]:
# Try running population['Albania'] = 2937590 (or another country of your choice)
# What order do the keys appear in when you run population? Is it what you expected?


In [None]:
# Display the current contents of population.
population

In [None]:
pop2

In [None]:
pop2 = pd.Series({'Spain': 46432074, 'France': 102321, 'Albania': 50532})
population + pop2

### `DataFrame` object in pandas

In [None]:
# Areas keyed by country name, first as a dictionary and then as a Series.
area_dict = dict(Albania=28748, France=643801, Germany=357386,
                 Japan=377972, Russia=17125200)
area = pd.Series(area_dict)
area

In [None]:
# Assemble a DataFrame from the two Series; each dictionary key becomes a column name.
countries = pd.DataFrame({'Population': population, 'Area': area})
countries

In [None]:
# Add a column from a plain list. A list carries no labels, so the values are
# assigned to the rows by position.
# NOTE(review): assumes the rows are ordered Albania, France, Germany, Japan,
# Russia — confirm against the frame displayed in the previous cell.
countries['Capital'] = ['Tirana', 'Paris', 'Berlin', 'Tokyo', 'Moscow']
countries

In [None]:
# Reorder the columns by selecting them in the desired order.
# The explicit .copy() makes `countries` an independent DataFrame, so adding
# columns to it in later cells does not trigger a SettingWithCopyWarning.
countries = countries[['Capital', 'Area', 'Population']].copy()
countries

In [None]:
# Derive a new column element-wise from two existing columns.
countries['Population Density'] = countries['Population'] / countries['Area']
countries

In [None]:
# Dictionary-style access by column name.
countries['Area']

### Exercise

In [None]:
# Now try accessing row data with a command like countries['Japan']


**Think, Pair, Share**

In [None]:
# Select the row labelled 'Japan' with the label-based .loc indexer.
countries.loc['Japan']

In [None]:
# Chain a second lookup to pull the 'Area' entry out of that row.
countries.loc['Japan']['Area']

### Exercise

In [None]:
# Can you think of a way to return the area of Japan without using .loc?
# Hint: Try putting the column index first.
# Can you slice along these indices as well?


### DataSeries Creation

In [None]:
# Add a new column with every entry set to NaN.
countries['Debt-to-GDP Ratio'] = np.nan
countries

In [None]:
# Build a Series that has entries for only two of the countries and assign it
# to the column; the values are matched to rows by index label.
debt = pd.Series([0.19, 2.36], index=['Russia', 'Japan'])
countries['Debt-to-GDP Ratio'] = debt
countries

In [None]:
# Remove the 'Capital' column from the DataFrame in place.
del countries['Capital']
countries

In [None]:
# Transpose: swap rows and columns.
countries.T

In [None]:
# Build a DataFrame from a 3x2 NumPy array of random floats, naming the
# columns and the rows explicitly.
# A seeded generator keeps the displayed values identical on every re-run.
random_frame = pd.DataFrame(np.random.RandomState(0).rand(3, 2),
                            columns=['foo', 'bar'],
                            index=['a', 'b', 'c'])
random_frame

## Manipulating data in pandas

### Index objects in pandas

In [None]:
# Rebuild series_example with string labels and keep a reference to its Index object.
series_example = pd.Series([-0.5, 0.75, 1.0, -2], index=['a', 'b', 'c', 'd'])
ind = series_example.index
ind

In [None]:
# Array-style access to a single label of the Index.
ind[1]

In [None]:
# Slicing works too; here, every second label.
ind[::2]

**Share**

In [None]:
# Index objects are immutable, so this assignment raises a TypeError.
# Catch it and show the message so that "Run All" can continue past this cell.
try:
    ind[1] = 0
except TypeError as err:
    print(f'TypeError: {err}')

### Set Properties

In [None]:
# Two integer Index objects to compare as sets: odd numbers and primes.
ind_odd = pd.Index([1, 3, 5, 7, 9])
ind_prime = pd.Index([2, 3, 5, 7, 11])

**Think, Pair, Share**  
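In the code cell below, try out the intersection (`ind_odd.intersection(ind_prime)`), union (`ind_odd.union(ind_prime)`), and symmetric difference (`ind_odd.symmetric_difference(ind_prime)`) of `ind_odd` and `ind_prime`. (In pandas 2.0 and later, the `&`, `|`, and `^` operators on an `Index` perform element-wise logical operations rather than set operations, so use the methods instead.)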

### Data Selection in Series

In [None]:
# Recreate the labelled example Series for the selection examples that follow.
series_example2 = pd.Series([-0.5, 0.75, 1.0, -2], index=['a', 'b', 'c', 'd'])
series_example2

In [None]:
# Dictionary-style lookup by label.
series_example2['b']

In [None]:
# Membership tests check the index labels.
'a' in series_example2

In [None]:
# keys() returns the index of the Series.
series_example2.keys()

In [None]:
# items() yields (label, value) pairs; list() collects them for display.
list(series_example2.items())

In [None]:
# Assigning to a label that does not exist yet extends the Series.
series_example2['e'] = 1.25
series_example2

### Indexers: `loc` and `iloc`

**Think, Pair, Share**

In [None]:
# .loc refers to the explicit index labels.
series_example2.loc['a']

In [None]:
# Slice by label with .loc.
series_example2.loc['a':'c']

**Share**

In [None]:
# .iloc refers to integer positions.
series_example2.iloc[0]

In [None]:
# Slice by position with .iloc.
series_example2.iloc[0:2]

### Data Selection in DataFrames

In [None]:
# Area and population for the same five countries, combined into one DataFrame
# with one column per Series.
area = pd.Series(dict(Albania=28748, France=643801, Germany=357386,
                      Japan=377972, Russia=17125200))
population = pd.Series(dict(Albania=2937590, France=65429495,
                            Germany=82408706, Russia=143910127,
                            Japan=126922333))
countries = pd.DataFrame(dict(Area=area, Population=population))
countries

In [None]:
# Select the 'Area' column.
countries['Area']

In [None]:
# Add a derived column computed element-wise from the two existing ones.
countries['Population Density'] = countries['Population'] / countries['Area']
countries

### DataFrame as two-dimensional array

In [None]:
# The underlying data as a two-dimensional NumPy array.
countries.values

In [None]:
# Transpose: swap rows and columns.
countries.T

In [None]:
# Position-based selection: first three rows, first two columns.
countries.iloc[:3, :2]

In [None]:
# Label-based selection: rows through 'Germany', columns through 'Population'.
countries.loc[:'Germany', :'Population']

### Exercise

In [None]:
# Can you think of how to combine masking and fancy indexing in one line?
# Your masking could be something like countries['Population Density'] > 200
# Your fancy indexing could be something like ['Population', 'Population Density']
# Be sure to put the masking and fancy indexing inside the square brackets: countries.loc[]


# Operating on Data in Pandas

**Think, Pair, Share** for each of these sections.

## Index alignment with Series

For our first example, suppose we are combining two different data sources and find only the top five countries by *area* and the top five countries by *population*:

In [None]:
# Two named Series with only partly overlapping country labels:
# one holding areas, the other holding populations.
area = pd.Series([17075400, 9984670, 9826675, 9598094, 8514877],
                 index=['Russia', 'Canada', 'USA', 'China', 'Brazil'],
                 name='area')
population = pd.Series([1409517397, 1339180127, 324459463, 322179605, 207652865],
                       index=['China', 'India', 'USA', 'Indonesia', 'Brazil'],
                       name='population')

In [None]:
# Now divide these to compute the population density (people per unit of area).
# The two Series are matched by index label before dividing.
pop_density = population / area
pop_density

In [None]:
# Two Series whose integer indexes only partly overlap (labels 1 and 2 are shared).
series1 = pd.Series([2, 4, 6], index=[0, 1, 2])
series2 = pd.Series([3, 5, 7], index=[1, 2, 3])
# Add them; entries are matched by index label.
series1 + series2

In [None]:
# Same addition, but a label missing from one Series is treated as 0 there.
series1.add(series2, fill_value=0)

Much better!

## Index alignment with DataFrames

In [None]:
# Seeded generator, so the random example frames come out the same on every run.
rng = np.random.RandomState(seed=42)
# 2x2 frame of random integers in [0, 20) with columns 'A' and 'B'.
df1 = pd.DataFrame(rng.randint(low=0, high=20, size=(2, 2)), columns=['A', 'B'])
df1

In [None]:
# A second frame with a different shape (3x3) and column order ('B', 'A', 'C'),
# drawn from the same generator.
df2 = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                   columns=list('BAC'))
df2

In [None]:
# Add df1 and df2. Is the output what you expected?
# (The two frames differ in both shape and column order.)
df1 + df2

In [None]:
# Use the mean of all values in df1 as the fill value for entries that exist
# in only one of the two frames.
fill = df1.stack().mean()
df1.add(df2, fill_value=fill)

## Operations between DataFrames and Series

Index and column alignment gets maintained in operations between a `DataFrame` and a `Series` as well. To see this, consider a common operation in data science, wherein we find the difference of a `DataFrame` and one of its rows. Because pandas inherits ufuncs from NumPy, pandas will compute the difference row-wise by default:

In [None]:
# A 3x4 frame of random integers below 10, drawn from the same generator.
df3 = pd.DataFrame(rng.randint(10, size=(3, 4)), columns=list('WXYZ'))
df3

In [None]:
# Subtract the first row from every row.
df3 - df3.iloc[0]

In [None]:
# Subtract column 'X' from every column by matching on the row index (axis=0).
df3.subtract(df3['X'], axis=0)

In [None]:
# Take every second entry of the first row.
halfrow = df3.iloc[0, ::2]
halfrow

In [None]:
# Subtract the partial row; the operands are aligned on column labels.
df3 - halfrow