### Consider a scenario where you want to get the top features from a given dataset. We'll explore the **movies** dataset in this example.

In [1]:
import pandas as pd 
import numpy as np 

In [4]:
# Load the movie dataset from disk, then keep only the three columns used by the
# cells that follow: movie_title, imdb_score, and budget.

movie = pd.read_csv(filepath_or_buffer="../data/movie.csv")
movie2 = movie.loc[:, ["movie_title", "imdb_score", "budget"]]
movie2.head(n=5)

Unnamed: 0,movie_title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates of the Caribbean: At World's End,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark Knight Rises,8.5,250000000.0
4,Star Wars: Episode VII - The Force Awakens,7.1,


In [5]:
# Pick the 100 rows with the highest imdb_score and preview the first five of them

movie2.nlargest(n=100, columns="imdb_score").head(n=5)

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxer: Vengeance,9.1,17000000.0


In [7]:
# Chain the two selections: first the 100 rows with the highest imdb_score,
# then, within those, the five rows with the smallest budget

movie2.nlargest(n=100, columns="imdb_score").nsmallest(n=5, columns="budget")

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


In [13]:
# Order the movies by title_year, earliest first (ascending is the default direction).
# Rows with a missing title_year are placed at the bottom.
(
    movie.loc[:, ["movie_title", "title_year", "imdb_score"]]
    .sort_values(by="title_year")
)

Unnamed: 0,movie_title,title_year,imdb_score
4695,Intolerance: Love's Struggle Throughout the Ages,1916.0,8.0
4833,Over the Hill to the Poorhouse,1920.0,4.8
4767,The Big Parade,1925.0,8.3
2694,Metropolis,1927.0,8.3
4697,The Broadway Melody,1929.0,6.3
...,...,...,...
4683,Heroes,,7.7
4688,Home Movies,,8.2
4704,Revolution,,6.7
4752,Happy Valley,,8.5


In [14]:
# The previous sort ordered the rows by year only. Passing a list of column names
# sorts by several columns at once: here title_year first, then imdb_score within
# each year, both from highest to lowest.

(
    movie.loc[:, ["movie_title", "title_year", "imdb_score"]]
    .sort_values(by=["title_year", "imdb_score"], ascending=False)
)

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
4277,A Beginner's Guide to Snuff,2016.0,8.7
3798,Airlift,2016.0,8.5
27,Captain America: Civil War,2016.0,8.2
98,Godzilla Resurgence,2016.0,8.2
...,...,...,...
1391,Rush Hour,,5.8
4031,Creature,,5.0
2165,Meet the Browns,,3.5
3246,The Bold and the Beautiful,,3.5


In [15]:
# After sorting by year and score (both descending), the first row of each year is
# that year's highest-scoring movie. .drop_duplicates keeps only that first row for
# every distinct title_year.

(
    movie.loc[:, ["movie_title", "title_year", "imdb_score"]]
    .sort_values(by=["title_year", "imdb_score"], ascending=False)
    .drop_duplicates(subset=["title_year"])
)

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
3745,Running Forever,2015.0,8.6
4369,Queen of the Mountains,2014.0,8.7
3935,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
3,The Dark Knight Rises,2012.0,8.5
...,...,...,...
2694,Metropolis,1927.0,8.3
4767,The Big Parade,1925.0,8.3
4833,Over the Hill to the Poorhouse,1920.0,4.8
4695,Intolerance: Love's Struggle Throughout the Ages,1916.0,8.0


In [20]:
# The same "highest-scoring movie per year" table can also be built with groupby.
# Sorting by score first and then taking the first row of every title_year group
# avoids groupby(...).apply(...), whose handling of the grouping column is deprecated
# in recent pandas releases (the applied frame no longer contains title_year, so the
# final sort on that column would fail).

(
    movie[["movie_title", "title_year", "imdb_score"]]
    # groupby skips rows whose key is missing; dropping them up front makes that explicit
    .dropna(subset=["title_year"])
    # stable sort: movies tied on imdb_score keep their original relative order,
    # so the row picked for each year is deterministic
    .sort_values("imdb_score", ascending=False, kind="stable")
    .groupby("title_year")
    # first row of each group == that year's highest imdb_score; original index is kept
    .head(1)
    .sort_values("title_year", ascending=False)
)


Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
3745,Running Forever,2015.0,8.6
4369,Queen of the Mountains,2014.0,8.7
3935,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
3,The Dark Knight Rises,2012.0,8.5
...,...,...,...
4555,Pandora's Box,1929.0,8.0
2694,Metropolis,1927.0,8.3
4767,The Big Parade,1925.0,8.3
4833,Over the Hill to the Poorhouse,1920.0,4.8


### It is possible to sort one column in ascending order while simultaneously sorting another column in descending order. To accomplish this, pass in a list of Booleans to the ascending parameter that corresponds to how you would like each column sorted. 

In [21]:
# Mixed sort directions: title_year and content_rating from high to low, budget from
# low to high. Keeping the first row of every (title_year, content_rating) pair then
# yields the lowest-budget movie of that pair (missing budgets sort last).
(
    movie.loc[:, ["movie_title", "title_year", "content_rating", "budget"]]
    .sort_values(
        by=["title_year", "content_rating", "budget"],
        ascending=[False, False, True],
    )
    .drop_duplicates(subset=["title_year", "content_rating"])
)

Unnamed: 0,movie_title,title_year,content_rating,budget
4026,Compadres,2016.0,R,3000000.0
4658,Fight to the Finish,2016.0,PG-13,150000.0
4661,Rodeo Girl,2016.0,PG,500000.0
3252,The Wailing,2016.0,Not Rated,
4659,Alleluia! The Devil's Carnival,2016.0,,500000.0
...,...,...,...,...
2558,Lilyhammer,,TV-MA,34000000.0
807,"Sabrina, the Teenage Witch",,TV-G,3000000.0
848,Stargate SG-1,,TV-14,1400000.0
2436,Carlos,,Not Rated,


### By default, .drop_duplicates keeps the very first appearance of a value, but this behavior may be modified by passing keep='last' to select the last row of each group or keep=False to drop all duplicates entirely.