30 days of random Pandas

In [2]:
import pandas as pd

In [8]:
# Day 1 
# Remove all countries that do not meet the requirement of area >= 3000000 or population >= 25000000
# Iterate over / apply this filter to all rows
# Sample rows for the Day 1 exercise: name, continent, area, population, gdp.
data = [
    ['Afghanistan', 'Asia', 652230, 25500100, 20343000000],
    ['Albania', 'Europe', 28748, 2831741, 12960000000],
    ['Algeria', 'Africa', 2381741, 37100000, 188681000000],
    ['Andorra', 'Europe', 468, 78115, 3712000000],
    ['Angola', 'Africa', 1246700, 20609294, 100990000000],
]
# Build the frame, then cast the numeric columns to the nullable Int64 dtype.
world = (
    pd.DataFrame(
        data,
        columns=['name', 'continent', 'area', 'population', 'gdp'],
    )
    .astype(
        {
            'name': 'object',
            'continent': 'object',
            'area': 'Int64',
            'population': 'Int64',
            'gdp': 'Int64',
        }
    )
)
world.head()

Unnamed: 0,name,continent,area,population,gdp
0,Afghanistan,Asia,652230,25500100,20343000000
1,Albania,Europe,28748,2831741,12960000000
2,Algeria,Africa,2381741,37100000,188681000000
3,Andorra,Europe,468,78115,3712000000
4,Angola,Africa,1246700,20609294,100990000000


In [13]:
def big_countries(
    world: pd.DataFrame,
    min_area: int = 3000000,
    min_population: int = 25000000,
) -> pd.DataFrame:
    """Return the name, population and area of every "big" country.

    A country is big when ``area >= min_area`` or
    ``population >= min_population``.

    Rows are selected with a boolean mask instead of
    ``drop(<index labels>)``: dropping by label removes every row that shares
    the label, so on a frame with a non-unique index it would also discard
    countries that qualify as big.

    Args:
        world: Frame with at least the ``area`` and ``population`` columns.
        min_area: Area threshold (inclusive) for a country to count as big.
        min_population: Population threshold (inclusive) for a country to
            count as big.

    Returns:
        A new frame restricted to the ``name``, ``population`` and ``area``
        columns (in that order), keeping the index labels of the input rows.
    """
    is_small = (world["area"] < min_area) & (world["population"] < min_population)
    # On nullable dtypes a comparison with a missing value yields <NA>.
    # Count those rows as "not small" so they are kept, exactly as the
    # previous drop-based implementation did.
    is_small = is_small.fillna(False)
    return world[~is_small].filter(["name", "population", "area"], axis=1)
big_countries(world)

# Accepted solution RT: 410ms 61% MEM: 68mb 90%

Unnamed: 0,name,population,area
0,Afghanistan,25500100,652230
2,Algeria,37100000,2381741


In [2]:
# Day 2
import pandas as pd
# Sample rows for the Day 2 exercise: product_id, low_fats, recyclable.
data = [
    ['0', 'Y', 'N'],
    ['1', 'Y', 'Y'],
    ['2', 'N', 'Y'],
    ['3', 'Y', 'Y'],
    ['4', 'N', 'N'],
]
# The ids arrive as strings and are cast to int64; the Y/N flags become categoricals.
products = (
    pd.DataFrame(data, columns=['product_id', 'low_fats', 'recyclable'])
    .astype(
        {
            'product_id': 'int64',
            'low_fats': 'category',
            'recyclable': 'category',
        }
    )
)

# Find the ids of products that are low fat and recyclable
# Filter by conditions that we have low_fat and recyclable both equal to Y
# thus drop any rows which have those fields equal to N 

In [6]:
# Select rows based on a condition - the condition is applied to every row as a boolean mask
def find_products(products: pd.DataFrame) -> pd.DataFrame:
    """Return the ids of products that are both low fat and recyclable.

    Args:
        products: Frame with ``product_id``, ``low_fats`` and ``recyclable``
            columns, where the two flag columns hold "Y" or "N".

    Returns:
        A one-column frame (``product_id``) with the matching rows, keeping
        their original index labels.
    """
    is_low_fat = products["low_fats"] == "Y"
    is_recyclable = products["recyclable"] == "Y"
    matching = products[is_low_fat & is_recyclable]
    # filter() selects columns here; only product_id is kept.
    return matching.filter(items=["product_id"])
find_products(products)
# Accepted solution, same as the editorial

Unnamed: 0,product_id
1,1
3,3


In [7]:
# Day 3 customer who never orders
import pandas as pd
# Sample customers for the Day 3 exercise: id, name.
data = [
    [1, 'Joe'],
    [2, 'Henry'],
    [3, 'Sam'],
    [4, 'Max'],
]
customers = (
    pd.DataFrame(data, columns=['id', 'name'])
    .astype({'id': 'Int64', 'name': 'object'})
)
# Sample orders: id, customerId (the customer who placed the order).
data = [
    [1, 3],
    [2, 1],
]
orders = (
    pd.DataFrame(data, columns=['id', 'customerId'])
    .astype({'id': 'Int64', 'customerId': 'Int64'})
)

In [33]:

# For each customer, check whether there exists an entry in orders
# Filter orders for that customer id; if the result is empty, add the customer to the end list
def find_customers(customers: pd.DataFrame, orders: pd.DataFrame) -> pd.DataFrame:
    """Return the names of customers that never placed an order.

    Args:
        customers: Frame with ``id`` and ``name`` columns.
        orders: Frame whose ``customerId`` column references ``customers.id``.

    Returns:
        A one-column frame named ``Customers`` holding the names of the
        customers whose id does not appear in ``orders.customerId``; the
        original index labels are kept.
    """
    # True for every customer whose id shows up in at least one order.
    has_ordered = customers["id"].isin(orders["customerId"])
    return (
        customers[~has_ordered]      # boolean mask keeps the non-buyers
        .filter(items=["name"])      # filter() works on columns: keep only "name"
        .rename(columns={"name": "Customers"})
    )
find_customers(customers, orders)
# Accepted solution 448ms 59% and 67mb 52%

Unnamed: 0,Customers
1,Henry
3,Max


In [49]:
# Day4
import pandas as pd

# Sample rows for the Day 4 exercise: article_id, author_id, viewer_id, view_date.
data = [
    [1, 3, 5, '2019-08-01'],
    [1, 3, 6, '2019-08-02'],
    [2, 7, 7, '2019-08-01'],
    [2, 7, 6, '2019-08-02'],
    [4, 7, 1, '2019-07-22'],
    [3, 4, 4, '2019-07-21'],
    [3, 4, 4, '2019-07-21'],
]
# Ids use the nullable Int64 dtype; the date strings are parsed to datetime64[ns].
views = (
    pd.DataFrame(
        data,
        columns=['article_id', 'author_id', 'viewer_id', 'view_date'],
    )
    .astype(
        {
            'article_id': 'Int64',
            'author_id': 'Int64',
            'viewer_id': 'Int64',
            'view_date': 'datetime64[ns]',
        }
    )
)

In [58]:
# Find all authors who viewed their own article i.e., rows where author_id == viewer_id
# Then filter authors and remove duplicates
def article_views(views: pd.DataFrame) -> pd.DataFrame:
    """Return the ids of authors who viewed at least one of their own articles.

    Args:
        views: Frame with ``author_id`` and ``viewer_id`` columns, one row
            per article view.

    Returns:
        A one-column frame named ``id`` with each such author listed once,
        sorted by ``id`` in ascending order.
    """
    # A self-view is a row where the viewer is the author.
    self_viewed = views[views["author_id"] == views["viewer_id"]]
    return (
        self_viewed.filter(items=["author_id"])
        .rename(columns={"author_id": "id"})
        .drop_duplicates()
        .sort_values(by="id")
    )
article_views(views)
# Accepted 315ms 85% and 67mb 48%

Unnamed: 0,id
5,4
2,7


In [59]:
# day5
import pandas as pd
# Sample rows for the Day 5 exercise: tweet_id, content.
data = [
    [1, 'Let us Code'],
    [2, 'More than fifteen chars are here!'],
]
tweets = (
    pd.DataFrame(data, columns=['tweet_id', 'content'])
    .astype({'tweet_id': 'Int64', 'content': 'object'})
)

In [71]:
import pandas as pd

# An invalid tweet is one whose content length is greater than 15 characters

# Optimized for pandas since it uses a vectorized operation instead of apply and lambda
def invalid_tweets(tweets: pd.DataFrame) -> pd.DataFrame:
    """Return the ids of tweets whose content is longer than 15 characters.

    The lengths are held in a local Series rather than written back as a
    ``len`` column, so the caller's frame is left unmodified.

    Args:
        tweets: Frame with ``tweet_id`` and ``content`` columns.

    Returns:
        A one-column frame (``tweet_id``) with the invalid tweets, keeping
        their original index labels.
    """
    # Vectorized string length of every tweet.
    content_length = tweets["content"].str.len()
    return tweets[content_length > 15][["tweet_id"]]
    # Accepted 512ms 46 and 66mb 82% (recorded for the earlier version that added a "len" column)

# This is not optimal since it doesn't utilize the vectorized operations which pandas is optimized for
def invalid_tweets_notoptimal(tweets: pd.DataFrame) -> pd.DataFrame:
    """Return the ids of tweets longer than 15 characters, using a row-wise apply.

    Kept as the slower counterpart of ``invalid_tweets`` for comparison. The
    lengths are held in a local Series rather than written back as a ``len``
    column, so the caller's frame is left unmodified.

    Args:
        tweets: Frame with ``tweet_id`` and ``content`` columns.

    Returns:
        A one-column frame (``tweet_id``) with the invalid tweets, keeping
        their original index labels.
    """
    # df.apply with axis=1 calls the lambda once per row.
    content_length = tweets.apply(lambda row: len(row["content"]), axis=1)
    return tweets[content_length > 15].filter(["tweet_id"])
    # Accepted 445ms 66% and 67mb 13% (recorded for the earlier version that added a "len" column)

res = invalid_tweets(tweets)
res

# Accepted 512ms 46 and 66mb 82%

Unnamed: 0,tweet_id
1,2
