# Extracting, filtering, and transforming data from DataFrames; advanced indexing with multiple levels

# Positional and labeled indexing

In [None]:
# You can use a number of different methods
# Indexing using square brackets - using this method you put column first in square brackets then row!
df['salt']['Jan']
# Using column attribute and row label - here eggs is the column attribute
df.eggs['Mar']
# Using the .loc accessor - here use row first then column!
df.loc['May','spam']
# Using the .iloc accessor - again row index first then column index
df.iloc[4, 2]

In [None]:
# Selecting only some columns - to return a dataframe you must use a nested list within square brackets!!
df_new = df[['spam','eggs']]

In [None]:
# Indexing and column rearrangement

# Import pandas
import pandas as pd

# Read in filename and set the index: election
election = pd.read_csv(filename, index_col='county')

# Create a separate dataframe with the columns ['winner', 'total', 'voters']: results
results = election[['winner', 'total', 'voters']]

# Print the output of results.head()
print(results.head())

In [None]:
# Selecting a column, i.e. a Series
df['eggs']

In [None]:
# Selecting part of a column, i.e. slicing and indexing a Series by position
# Use .iloc for positional access: the index here holds labels ('Jan', 'Feb', ...),
# and a plain integer key in [] on such a Series is deprecated in pandas 2.1+
df['eggs'].iloc[1:4] # selects 2nd, 3rd and 4th rows
df['eggs'].iloc[4] # selects 5th row

In [None]:
# Selecting all rows and some columns - the slice is inclusive of right hand argument
df.loc[:,'eggs':'spam']
# Selecting all columns and some rows - the slice is inclusive of right hand argument
df.loc['Jan':'Feb',:]
# Slicing both rows and columns - the slice is inclusive of right hand argument
df.loc['Jan':'Feb', 'eggs':'spam']

In [None]:
# Using iloc to slice rows and columns
df.iloc[2:5,1:]

In [None]:
# You can also use lists rather than slices!!
df.loc['Jan':'Feb', ['eggs','spam']]
# Note with iloc the slice 0:2 only selects 2 columns!
df.iloc[[0,4,5],0:2]

In [None]:
# Selecting a Series vs. a 1 column dataframe
df['eggs'] # A series
df[['eggs']] # A dataframe

# Slicing rows

In [None]:
# Slice the row labels 'Perry' to 'Potter': p_counties
p_counties = election.loc['Perry':'Potter']

# Print the p_counties DataFrame
print(p_counties)

# Slice the row labels 'Potter' to 'Perry' in reverse order: p_counties_rev
p_counties_rev = election.loc['Potter':'Perry':-1]

# Print the p_counties_rev DataFrame
print(p_counties_rev)

# Slicing columns

In [None]:
# Slice the columns from the starting column to 'Obama': left_columns
left_columns = election.loc[:, :'Obama']

# Print the output of left_columns.head()
print(left_columns.head())

# Slice the columns from 'Obama' to 'winner': middle_columns
middle_columns = election.loc[:,'Obama':'winner']

# Print the output of middle_columns.head()
print(middle_columns.head())

# Slice the columns from 'Romney' to the end: 'right_columns'
right_columns = election.loc[:,'Romney':]

# Print the output of right_columns.head()
print(right_columns.head())

# Subselecting DataFrames with lists

In [None]:
# Create the list of row labels: rows
rows = ['Philadelphia', 'Centre', 'Fulton']

# Create the list of column labels: cols
cols = ['winner', 'Obama', 'Romney']

# Create the new DataFrame: three_counties
three_counties = election.loc[rows,cols]

# Print the three_counties DataFrame
print(three_counties)

# Filtering data

In [None]:
# Creating a boolean series (mask)

In [None]:
df.salt>60 # gives true or false responses
df[df.salt>60]
# Alternatively you can assign the filter to another variable
enough_salt_sold = df.salt > 60
df[enough_salt_sold]

In [None]:
# Create the boolean array: high_turnout
high_turnout = election.turnout > 70

# Filter the election DataFrame with the high_turnout array: high_turnout_df
high_turnout_df = election[high_turnout]

# Print the high_turnout_df DataFrame
print(high_turnout_df)

In [None]:
# Combining filters
df[(df.salt >= 50) & (df.eggs < 200)] # Both conditions
df[(df.salt >= 50) | (df.eggs < 200)] # Either condition

# Filtering columns using other columns

In [None]:
# Filtering a column based on another column!!
df.eggs[df.salt > 55]
# Modifying a column based on another column
# Do the row filter and the column selection in ONE .loc call: the chained form
# df.eggs[df.salt > 55] += 5 may write to a temporary copy and leave df unchanged
df.loc[df.salt > 55, 'eggs'] += 5

In [None]:
# Import numpy
import numpy as np

# Create the boolean array: too_close
too_close = election['margin'] < 1

# Assign np.nan to the 'winner' column where the results were too close to call
election.loc[too_close, 'winner'] = np.nan

# Display the summary produced by election.info()
# (.info() prints the summary itself and returns None, so wrapping it in
# print() would only add a stray "None" line)
election.info()

# Filtering using NaNs

In [None]:
# Select columns with all nonzeros
df2.loc[:, df2.all()]
# Select columns with any nonzeros
df2.loc[:, df2.any()]
# Select columns with any NaNs
df.loc[:, df.isnull().any()]
# Select columns without NaNs
df.loc[:, df.notnull().all()]
# In certain scenarios, it may be necessary to remove rows and columns with missing data from a DataFrame
# The .dropna() method is used to perform this action
# Drop rows with any NaNs
df.dropna(how='any')

In [None]:
# Select the 'age' and 'cabin' columns: df
df = titanic.loc[:,['age','cabin']]

# Print the shape of df
print(df.shape)

# Drop rows in df with how='any' and print the shape
print(df.dropna(how='any').shape)

# Drop rows in df with how='all' and print the shape
print(df.dropna(how='all').shape)

# Call .dropna() with thresh=1000 and axis='columns' and display the .info() summary from titanic
# (.info() prints the summary itself and returns None, so it is not wrapped in print())
titanic.dropna(thresh=1000, axis='columns').info()

# Transforming data

# Using apply() to transform a column

In [None]:
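# The .apply() method can be used on a pandas DataFrame to apply an arbitrary Python function to each column (or row);
# when that function uses element-wise arithmetic, as below, every element ends up transformed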

# Write a function to convert degrees Fahrenheit to degrees Celsius: to_celsius
def to_celsius(F):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Args:
        F: Temperature in degrees Fahrenheit. Any object that supports
            subtraction of and multiplication by a number works.

    Returns:
        The equivalent temperature in degrees Celsius, computed as
        (5/9) * (F - 32).
    """
    # Ratio between the size of one Fahrenheit degree and one Celsius degree.
    scale = 5 / 9
    # Shift so that the freezing point (32 F) maps to 0 before scaling.
    return scale * (F - 32)

# Apply the function over 'Mean TemperatureF' and 'Mean Dew PointF': df_celsius
df_celsius = weather[['Mean TemperatureF','Mean Dew PointF']].apply(to_celsius)

# Reassign the columns df_celsius
df_celsius.columns = ['Mean TemperatureC', 'Mean Dew PointC']

# Print the output of df_celsius.head()
print(df_celsius.head())

# Using .map() with a dictionary

In [None]:
# The .map() method is used to transform values according to a Python dictionary look-up

# Create the dictionary: red_vs_blue
red_vs_blue = {'Obama':'blue', 'Romney':'red'}

# Use the dictionary to map the 'winner' column to the new column: election['color']
election['color'] = election.winner.map(red_vs_blue)

# Print the output of election.head()
print(election.head())

# Using vectorized functions (called Universal Functions or UFuncs in Numpy)

In [None]:
# When performance is paramount, you should avoid using .apply() and .map() because those constructs perform Python for-loops over the data stored in a pandas Series or DataFrame
# By using vectorized functions instead, you can loop over the data at the same speed as compiled code (C, Fortran, etc.)! 
# NumPy, SciPy and pandas come with a variety of vectorized functions (called Universal Functions or UFuncs in NumPy)

# You can even write your own vectorized functions, but for now we will focus on the ones distributed by NumPy and pandas.


In [None]:
# Import zscore from scipy.stats
from scipy.stats import zscore

# Call zscore with election['turnout'] as input: turnout_zscore
turnout_zscore = zscore(election['turnout'])

# Print the type of turnout_zscore
print(type(turnout_zscore))

# Assign turnout_zscore to a new column: election['turnout_zscore']
election['turnout_zscore'] = turnout_zscore

# Print the output of election.head()
print(election.head())

# Index objects and labeled data

In [1]:
# Creating a Series
import pandas as pd
prices = [10.70, 10.86, 10.74, 10.71, 10.79]
shares = pd.Series(prices)
print(shares)

0    10.70
1    10.86
2    10.74
3    10.71
4    10.79
dtype: float64


In [2]:
# Creating an index for the Series
days = ['Mon', 'Tue', 'Wed', 'Thur', 'Fri']
shares = pd.Series(prices, index=days)
print(shares)

Mon     10.70
Tue     10.86
Wed     10.74
Thur    10.71
Fri     10.79
dtype: float64


In [3]:
# Examining the index
print(shares.index)
print(shares.index[2])
print(shares.index[:2])
print(shares.index[-2:])
print(shares.index.name)

Index(['Mon', 'Tue', 'Wed', 'Thur', 'Fri'], dtype='object')
Wed
Index(['Mon', 'Tue'], dtype='object')
Index(['Thur', 'Fri'], dtype='object')
None


In [4]:
# Modifying the index name
shares.index.name = 'weekday'
print(shares)

weekday
Mon     10.70
Tue     10.86
Wed     10.74
Thur    10.71
Fri     10.79
dtype: float64


# Changing index of a DataFrame

In [5]:
# Indexes are immutable objects. 
# This means that if you want to change or modify the index in a dataframe, then you need to change the whole index. 

# A list comprehension is a succinct way to generate a list in one line. 
# For example, the following list comprehension generates a list that contains the cubes of all numbers from 0 to 9: 
cubes = [i**3 for i in range(10)]
print(cubes)

[0, 1, 8, 27, 64, 125, 216, 343, 512, 729]


In [None]:
# Create the list of new indexes: new_idx
new_idx = [month.upper() for month in sales.index]

# Assign new_idx to sales.index
sales.index = new_idx

# Print the sales DataFrame
print(sales)

In [None]:
# Changing index name labels

# Assign the string 'MONTHS' to sales.index.name
sales.index.name = 'MONTHS'

# Print the sales DataFrame
print(sales)

# Assign the string 'PRODUCTS' to sales.columns.name 
sales.columns.name = 'PRODUCTS'

# Print the sales dataframe again
print(sales)

In [None]:
# Building an index, then a DataFrame
# You can also build the DataFrame and index independently, and then put them together. 
# If you take this route, be careful, as any mistakes in generating the DataFrame or the index can cause the data and the index to be aligned incorrectly.
# Generate the list of months: months
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']

# Assign months to sales.index
sales.index = months

# Print the modified sales DataFrame
print(sales)

# Hierarchical indexing (or multiindex)

In [None]:
# Setting & sorting a MultiIndex

# Set the index to be the columns ['state', 'month']: sales
sales = sales.set_index(['state', 'month'])

# Sort the MultiIndex: sales
sales = sales.sort_index()

# Print the sales DataFrame
print(sales)

In [None]:
# Using .loc[] with nonunique indexes
# It is always preferable to have a meaningful index that uniquely identifies each row
# Even though pandas does not require unique index values in DataFrames, it works better if the index values are indeed unique

# Set the index to the column 'state': sales
sales = sales.set_index(['state'])

# Print the sales DataFrame
print(sales)

# Access the data from 'NY'
print(sales.loc['NY'])

In [None]:
# Indexing multiple levels of a MultiIndex
# Looking up indexed data is fast and efficient
# And you have already seen that lookups based on the outermost level of a MultiIndex work just like lookups on DataFrames that have a single-level Index.

# The trickiest of all these lookups is when you want to access some inner levels of the index
# In this case, you need to use slice(None) in the slicing parameter for the outermost dimension(s) instead of the usual :, or use pd.IndexSlice. 
# You can refer to the pandas documentation for more details

# Look up data for NY in month 1: NY_month1
NY_month1 = sales.loc[('NY', 1), :]

# Look up data for CA and TX in month 2: CA_TX_month2
CA_TX_month2 = sales.loc[(['CA', 'TX'], 2), :]

# Look up data for all states in month 2: all_month2
all_month2 = sales.loc[(slice(None), 2), :]

In [None]:

# Tidying, rearranging and restructuring your data
# Pivoting, melting, and stacking DataFrames
# Identifying and splitting DataFrames by groups