The filter() function in Python is used to filter elements from an iterable (like list, tuple, set) based on a condition (a function that returns True or False).


In [1]:
# Integers 1 through 6 serve as the input sequence.
nums = list(range(1, 7))

# filter() is lazy: it hands back an iterator that yields only the items for
# which the predicate is truthy (here: no remainder when divided by 2).
evens = filter(lambda n: not n % 2, nums)

# Materialise the iterator into a list so its contents can be printed.
print(list(evens))  # [2, 4, 6]

[2, 4, 6]


The `map()` function in Python takes in a function and an iterable (such as a list).

The function is called on every item of the iterable. In Python 3, `map()` returns a lazy iterator over the results rather than a list; wrap it in `list()` to get a new list that contains the value returned by the function for each item.

Here is an example that uses the `map()` function to double all the items in a list.

In [1]:
# Input values that will each be doubled.
my_list = [1, 5, 4, 6, 8, 11, 3, 12]

# map() applies the lambda to every element lazily; list() collects the
# results into a concrete list.
doubled = map(lambda item: 2 * item, my_list)
new_list = list(doubled)

print(new_list)

[2, 10, 8, 12, 16, 22, 6, 24]


In [None]:
import pandas as pd

# Column-oriented sample data: each key becomes a column, each list position a row.
students = {
    "name": ["Tim Voss", "Nicole Johnson", "Elsa Williams", "John James", "Catherine Jones"],
    "age": [19, 20, 21, 20, 23],
    "favorite_color": ["red", "yellow", "green", "blue", "green"],
    "grade": [91, 95, 82, 75, 93],
}

# Build the DataFrame used by the filtering exercise below.
students_df = pd.DataFrame(data=students)


In [None]:

# Write a function named grades_colors to select only the rows where the student's favorite color is green or red and their grade is above 90.


def grades_colors(students_df):
    """Return the students who like green or red and scored above 90.

    Parameters
    ----------
    students_df : pandas.DataFrame
        Must contain a 'favorite_color' column and a 'grade' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows with all columns and their original index labels.
    """
    likes_green_or_red = students_df['favorite_color'].isin(['green', 'red'])
    scored_above_90 = students_df['grade'] > 90
    # Note this pattern: element-wise conditions are combined with `&`
    # (not `and`), and the combined boolean mask is handed to .loc to pick rows.
    return students_df.loc[likes_green_or_red & scored_above_90]


# Last expression of the cell, so the filtered frame is displayed.
grades_colors(students_df)

In [15]:
import pandas as pd

# Build a small 3x3 demo frame with string row labels.
df = pd.DataFrame(
    [[10, 40, 70],
     [20, 50, 80],
     [30, 60, 90]],
    index=['row1', 'row2', 'row3'],
    columns=['A', 'B', 'C'],
)

# Position 0 -> the first row, returned as a Series
print(df.iloc[0])  # Output: A=10, B=40, C=70

# Row positions 0-1 and column positions 0-1 (the end of a positional slice is exclusive)
print(df.iloc[:2, :2])  # Output: A and B columns of row1 and row2

# Scalar lookup: second row (position 1), third column (position 2)
print(df.iloc[1, 2])  # Output: 80

# First two rows with every column
print(df.iloc[:2])

A    10
B    40
C    70
Name: row1, dtype: int64
       A   B
row1  10  40
row2  20  50
80
       A   B   C
row1  10  40  70
row2  20  50  80


In [7]:
#select columns with index positions 1 and 2
df.iloc[:, [1, 2]]
#select the first two columns (positions 0 and 1; the end of the slice is exclusive)
df.iloc[:, 0:2]

#select the first row
# (only this last expression is shown as the cell's output; the two column
#  selections above are evaluated but not displayed)
df.iloc[0]

A    10
B    40
C    70
Name: row1, dtype: int64

In [None]:
# using iloc (keep all rows except the last one; all columns are kept)
df.iloc[:-1, :]

In [None]:
# using iloc (keep all columns except the last one; all rows are kept)
df.iloc[:, :-1]

In [19]:
# using iloc (skip the first two rows: keep every row from position 2 onward, all columns)
df.iloc[2:, :]

Unnamed: 0,A,X,B,C,D
row3,30,7,60,90,300
row4,11,8,22,33,400
row_new,99,9,88,77,500


# 2. .loc[] (Label-based selection)

In [None]:
# .loc addresses rows and columns by their labels rather than by position.

# Selecting a row by its label (a single row label returns a Series)
print(df.loc['row1'])  # Output: A=10, B=40, C=70

# Selecting a specific value using row and column labels
print(df.loc['row2', 'C'])  # Output: 80

# Selecting multiple rows and columns by labels (one list of labels per axis)
print(df.loc[['row1', 'row3'], ['A', 'C']])  # Output: A and C columns of row1 and row3


# Selecting col A to C for every row
# (unlike positional slices, a label slice includes both endpoints, so 'C' is part of the result)
print(df.loc[:, 'A':'C'])  

A    10
B    40
C    70
Name: row1, dtype: int64
80
       A   C
row1  10  70
row3  30  90
       A   B   C
row1  10  40  70
row2  20  50  80
row3  30  60  90


In [14]:
# Display the current contents of df as the cell's output.
df

Unnamed: 0,A,X,B,C,D
row1,10,5.0,40,70,
row2,20,6.0,50,80,
row3,30,7.0,60,90,
row4,11,22.0,33,44,
row_new,99,,88,77,66.0


# 3. Insert rows and columns

In [16]:
# Add a row in place: assigning through .loc to a label that does not exist
# yet appends it. The list must supply exactly one value per existing column.
df.loc['row4'] = [11, 22, 33]

# Add a row by concatenation: build a one-row frame whose index is the new
# label, then stack it underneath the existing rows.
new_row = pd.DataFrame([[99, 88, 77]], index=['row_new'], columns=['A', 'B', 'C'])
df = pd.concat([df, new_row], axis=0)

# Show the enlarged frame.
df

Unnamed: 0,A,B,C
row1,10,40,70
row2,20,50,80
row3,30,60,90
row4,11,22,33
row_new,99,88,77


In [17]:
# Place a new column 'X' at column position 1 (between the first and second
# existing columns). insert() modifies df in place and needs one value per row.
df.insert(loc=1, column='X', value=[5, 6, 7, 8, 9])

# Plain item assignment appends a new column at the right-hand end.
df['D'] = [100, 200, 300, 400, 500]

# Show the widened frame.
df

Unnamed: 0,A,X,B,C,D
row1,10,5,40,70,100
row2,20,6,50,80,200
row3,30,7,60,90,300
row4,11,8,22,33,400
row_new,99,9,88,77,500


# Drop 

In [None]:
# Drop a single column. DataFrame.drop returns a new frame, so the result is
# kept under its own name instead of overwriting df.
df_without_d = df.drop(columns='D')

# Drop several columns at once by passing a list of labels. df still holds
# every original column at this point, so 'D' is found; previously the
# single-column drop had been assigned back to df, which made this second
# call raise KeyError for the already-removed 'D'.
df = df.drop(columns=['D', 'X'])

In [None]:
# Boolean-mask filter: keep only the rows whose value in column 'A' is greater than 15.
df = df[df['A'] > 15] 

In [None]:
# using iloc (keep all columns except the last 3; all rows are kept)
df = df.iloc[:, :-3]

In [None]:
# using iloc (keep all rows except the last one; all columns are kept)
df.iloc[:-1, :]

Unnamed: 0,A,X,B,C,D
row1,10,5,40,70,100
row2,20,6,50,80,200
row3,30,7,60,90,300
row4,11,8,22,33,400


# Other important subset skills

In [None]:
# Reference snippets: n and the frames used below (df, data2, titanic,
# wenjuan1) are not defined in this cell and must already exist.

#To drop last n rows:
# The rows are removed by index label (the labels of the last n rows) and df is modified in place.
# NOTE(review): if index labels repeat, drop() presumably removes every row
# sharing a label with the last n rows — confirm the index is unique.
df.drop(df.tail(n).index,inplace=True) # drop last n rows


# Keep the rows whose "treatment" value is one of the five listed codes.
data3 = data2[data2["treatment"].isin(["uu1", "uu2" ,"uu3" ,"uu4" , "uu5"])]

# Keep the rows where "Age" is greater than 35.
above_35 = titanic[titanic["Age"] > 35]


#select rows with and without NA
# wenjuan3: rows where 'question_tags_true' is missing
wenjuan3 = wenjuan1[wenjuan1['question_tags_true'].isnull()]
# wenjuan2: rows where 'question_tags_true' is present
wenjuan2 = wenjuan1.dropna(subset=['question_tags_true'])

In [None]:
#Get all rows that contain a specific substring
# NOTE(review): if 'month' can hold missing values the mask will contain NaN
# and cannot be used for indexing — consider passing na=False; confirm against the data.
contain_values = df[df['month'].str.contains('Ju')]
#Get all rows that contain one substring OR another substring
# (str.contains treats the pattern as a regular expression by default, so '|' means "or")
contain_values = df[df['month'].str.contains('Ju|Ma')]



In [None]:

# Keep the rows with points of at least 95 AND a price below 1000.
# Each comparison is parenthesised because `&` binds tighter than `>=` and `<`.
df = wine_reviews[(wine_reviews['points']>=95) & (wine_reviews['price']<1000)]

Handling Missing Values:

In [None]:
# Strategy 1 - drop: discard every row that has NaN in 'points' or 'price'.
df = wine_reviews.dropna(subset=['points', 'price'])

# Strategy 2 - impute: replace missing prices with the median price.
# fillna(..., inplace=True) returns None, so assigning its result used to
# leave df set to None. The non-inplace form below returns the filled frame
# and leaves wine_reviews itself untouched.
# Note: this assignment replaces the frame produced by strategy 1; keep
# whichever of the two strategies the analysis needs.
df = wine_reviews.fillna({'price': wine_reviews['price'].median()})


You’re given two dataframes: transactions and products.

The transactions dataframe contains transaction ids, product ids, and the total amount of each product sold.

The products dataframe contains product ids and prices.

Write a function to return a dataframe containing every transaction with a total value of over $100. Include the total value of the transaction as a new column in the dataframe.

In [1]:
import pandas as pd

# Raw column data for the transactions table: one list entry per transaction.
transactions = dict(
    transaction_id=[1, 2, 3, 4, 5],
    product_id=[101, 102, 103, 104, 105],
    amount=[3, 5, 8, 3, 2],
)

# Raw column data for the products table: a price for each product id.
products = dict(
    product_id=[101, 102, 103, 104, 105],
    price=[20.00, 21.00, 15.00, 16.00, 52.00],
)

# Turn both column dictionaries into DataFrames (one column per key).
df_transactions = pd.DataFrame(data=transactions)
df_products = pd.DataFrame(data=products)

In [3]:
# Pipeline: attach each transaction's product price, compute the value of the
# transaction, then keep only the transactions worth more than 100.
df2 = (
    df_transactions
    .merge(df_products, how='inner', on='product_id')
    # total_value = unit price x quantity sold
    .assign(total_value=lambda merged: merged['price'] * merged['amount'])
    # callable row selector: evaluated on the frame produced by the step above
    .loc[lambda merged: merged['total_value'] > 100]
)

# Show the qualifying transactions.
df2

Unnamed: 0,transaction_id,product_id,amount,price,total_value
1,2,102,5,21.0,105.0
2,3,103,8,15.0,120.0
4,5,105,2,52.0,104.0
