In [1]:
import pandas as pd

# DataFrames vs Series
- A Series is a one-dimensional data structure (one identifier to extract a value, such as a index position or index value)
- A DataFrame is a two-dimensional data structure (two identifiers to extract a value, such as a row and column)

In [2]:
# import nba data as dataframe
nba = pd.read_csv('data/nba.csv')

# impot revenue data as dataframe
revenue = pd.read_csv('data/revenue.csv', index_col = 'Date') # using the data column as a row index

# Methods & Attributes

In [3]:
# head method
nba.head() # default n = 5

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [4]:
# tail method
nba.head() # default n = 5

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [5]:
# index attribute (range of index positions)
nba.index

RangeIndex(start=0, stop=458, step=1)

In [6]:
# values attributes (multi-dimensional array)
nba.values

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [7]:
# shape attribute (object dimensions: rows, columns)
nba.shape

(458, 9)

In [8]:
# dtype attribute (underlying data types for each column)
nba.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [9]:
# columns attribute (collection of column index positions)
nba.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [10]:
# axes attribute (collection of row and column index positions)
nba.axes

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

In [11]:
# info method (displays information of dataframe object)
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [12]:
# describe method (summary statistics)
nba.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [13]:
# notna method (boolean values for non-NaN values)
nba.notna()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,False
3,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,False,True
...,...,...,...,...,...,...,...,...,...
453,True,True,True,True,True,True,True,True,True
454,True,True,True,True,True,True,True,False,True
455,True,True,True,True,True,True,True,False,True
456,True,True,True,True,True,True,True,True,True


In [14]:
# sum method (sums each col vertically by default))
revenue.sum() # axis = 'index' or axis = 0 (default)

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [15]:
# sum method by row (sums each row horizontall)
revenue.sum(axis = 'columns') # alternative value is 1 for columns, 0 for rows

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

# Basic DataFrame Manipulations

In [16]:
# selecting a single column from a data frame (result is a series object)
nba.Team # will not work if col name has a space in it

# preferred method (result is also a series object)
nba['Name']

0      Avery Bradley
1        Jae Crowder
2       John Holland
3        R.J. Hunter
4      Jonas Jerebko
           ...      
453     Shelvin Mack
454        Raul Neto
455     Tibor Pleiss
456      Jeff Withey
457              NaN
Name: Name, Length: 458, dtype: object

In [17]:
# selecting a single column and method chaining head
nba['Name'].head(n = 3)

0    Avery Bradley
1      Jae Crowder
2     John Holland
Name: Name, dtype: object

In [18]:
# selecting multiple columns (requries a list of column names, in the order you wish to display the dataframe)
nba[['Name', 'Team', 'Salary']].head()

Unnamed: 0,Name,Team,Salary
0,Avery Bradley,Boston Celtics,7730337.0
1,Jae Crowder,Boston Celtics,6796117.0
2,John Holland,Boston Celtics,
3,R.J. Hunter,Boston Celtics,1148640.0
4,Jonas Jerebko,Boston Celtics,5000000.0


In [19]:
# using a variable to select multiple names
cols = ['Name', 'Team', 'Salary']

nba[cols].head() # do not need to use double square brackets with this syntax

Unnamed: 0,Name,Team,Salary
0,Avery Bradley,Boston Celtics,7730337.0
1,Jae Crowder,Boston Celtics,6796117.0
2,John Holland,Boston Celtics,
3,R.J. Hunter,Boston Celtics,1148640.0
4,Jonas Jerebko,Boston Celtics,5000000.0


In [20]:
# add new column (using a constant value)
nba['Sport'] = 'Basketball'
nba['League'] = 'NBA'
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,NBA
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,NBA
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball,NBA
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball,NBA
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,Basketball,NBA


In [21]:
# re-import nba data as dataframe
nba = pd.read_csv('data/nba.csv')

# using the insert method to add new column (specify insert position with index)
nba.insert(loc = 2, column = 'Sport', value = 'Basketball')
nba.insert(loc = 3, column = 'League', value = 'NBA')
nba.head()

Unnamed: 0,Name,Team,Sport,League,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,Basketball,NBA,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,Basketball,NBA,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,Basketball,NBA,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,Basketball,NBA,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,Basketball,NBA,8.0,PF,29.0,6-10,231.0,,5000000.0


In [22]:
# re-import nba data as dataframe
nba = pd.read_csv('data/nba.csv')

# add new column using index method (using dynamic values based on another column)
nba.insert(
    loc = 5, 
    column = 'Age in 10 Yrs', 
    value = nba['Age'].add(10)
)

nba.head()

# alternate syntax (not preferred, less intuitive and you can't specify location)
# nba['Age in 20 Yrs'] = nba['Age'] + 20
# nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Age in 10 Yrs,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,35.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,35.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,37.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,32.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,39.0,6-10,231.0,,5000000.0


In [23]:
# re-import nba data as dataframe
nba = pd.read_csv('data/nba.csv')

# another example of adding a new column using the index method
nba.insert(
    loc = 7,
    column = 'Weight (kg)',
    value = nba['Weight'].multiply(0.4535) # can also use mul method
)

nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,81.63,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,106.5725,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,92.9675,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,83.8975,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,104.7585,,5000000.0


In [24]:
# value counts method on a specific column (distinct counts)
nba_position_counts = nba['Position'].value_counts()
nba_position_percent = nba['Position'].value_counts(normalize = True) * 100 # relative percentages

print(nba_position_counts)
print('\n')
print(nba_position_percent)

SG    102
PF    100
PG     92
SF     85
C      78
Name: Position, dtype: int64


SG    22.319475
PF    21.881838
PG    20.131291
SF    18.599562
C     17.067834
Name: Position, dtype: float64


In [25]:
# identifying counts of null values
nba.isna().sum()

Name            1
Team            1
Number          1
Position        1
Age             1
Height          1
Weight          1
Weight (kg)     1
College        85
Salary         12
dtype: int64

In [26]:
# drop ALL rows with a single null value
nba.dropna(how = 'any') # add inplace parameter to overwrite existing dataframe

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,81.6300,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,106.5725,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,83.8975,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,106.5725,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,107.9330,Gonzaga,2165160.0
...,...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,6-8,206.0,93.4210,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,93.4210,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,106.1190,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,92.0605,Butler,2433333.0


In [27]:
# drop rows with ALL null values (the last row in this example)
nba.dropna(how = 'all') # add inplace parameter to overwrite existing dataframe

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,81.6300,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,106.5725,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,92.9675,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,83.8975,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,104.7585,,5000000.0
...,...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,106.1190,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,92.0605,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,81.1765,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,116.0960,,2900000.0


In [28]:
# drop rows with a null value in a specified column(s)
nba.dropna(subset = ['College', 'Salary']) # add inplace parameter to overwrite existing dataframe

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,81.6300,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,106.5725,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,83.8975,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,106.5725,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,107.9330,Gonzaga,2165160.0
...,...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,6-8,206.0,93.4210,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,93.4210,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,106.1190,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,92.0605,Butler,2433333.0


In [29]:
# filling missing values
nba.fillna(0) # demonstration purposes

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,81.6300,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,106.5725,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,92.9675,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,83.8975,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,104.7585,0,5000000.0
...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,92.0605,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,81.1765,0,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,116.0960,0,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,104.7585,Kansas,947276.0


In [30]:
# filling missing values for specified column(s)
nba['College'] = nba['College'].fillna('Unknown') # replacing all NaN values with strings may be helpful for a categorical variable
nba.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,92.0605,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,81.1765,Unknown,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,116.096,Unknown,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,104.7585,Kansas,947276.0
457,,,,,,,,,Unknown,


In [31]:
# filling missing values for specified column(s) using a method
nba['Salary'] = nba['Salary'].fillna(nba['Salary'].mean())
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,81.63,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,106.5725,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,92.9675,Boston University,4842684.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,83.8975,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,104.7585,Unknown,5000000.0


In [32]:
# converting data types using astype (drop NaN row in nba data frame first)
nba_clean = nba.dropna(how = 'any')

# age is a floating point number because Pandas converted it due to a NaN value
# if you have a missing value in a numeric colum, Pandas will convert all to a floating point

# check for NaN
nba_clean['Age'].hasnans

False

In [33]:
# astype method
nba_clean['Age'] = nba_clean['Age'].astype(int) # int, float, str
nba_clean['Salary'] = nba_clean['Salary'].astype(int)
nba_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_clean['Age'] = nba_clean['Age'].astype(int) # int, float, str
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_clean['Salary'] = nba_clean['Salary'].astype(int)


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25,6-2,180.0,81.63,Texas,7730337
1,Jae Crowder,Boston Celtics,99.0,SF,25,6-6,235.0,106.5725,Marquette,6796117
2,John Holland,Boston Celtics,30.0,SG,27,6-5,205.0,92.9675,Boston University,4842684
3,R.J. Hunter,Boston Celtics,28.0,SG,22,6-5,185.0,83.8975,Georgia State,1148640
4,Jonas Jerebko,Boston Celtics,8.0,PF,29,6-10,231.0,104.7585,Unknown,5000000


In [34]:
# astype for categorical varible (no visual difference but reduces memory cost)
# check memory usage of dataframe before and after to see comparison
nba_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         457 non-null    object 
 1   Team         457 non-null    object 
 2   Number       457 non-null    float64
 3   Position     457 non-null    object 
 4   Age          457 non-null    int32  
 5   Height       457 non-null    object 
 6   Weight       457 non-null    float64
 7   Weight (kg)  457 non-null    float64
 8   College      457 non-null    object 
 9   Salary       457 non-null    int32  
dtypes: float64(3), int32(2), object(5)
memory usage: 35.7+ KB


In [35]:
# update position variable with astype for category
nba_clean['Position'] = nba['Position'].astype('category')
nba_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Name         457 non-null    object  
 1   Team         457 non-null    object  
 2   Number       457 non-null    float64 
 3   Position     457 non-null    category
 4   Age          457 non-null    int32   
 5   Height       457 non-null    object  
 6   Weight       457 non-null    float64 
 7   Weight (kg)  457 non-null    float64 
 8   College      457 non-null    object  
 9   Salary       457 non-null    int32   
dtypes: category(1), float64(3), int32(2), object(4)
memory usage: 32.8+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_clean['Position'] = nba['Position'].astype('category')


In [36]:
# apply same transformation to team variable
nba_clean['Team'] = nba_clean['Team'].astype('category')
nba_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Name         457 non-null    object  
 1   Team         457 non-null    category
 2   Number       457 non-null    float64 
 3   Position     457 non-null    category
 4   Age          457 non-null    int32   
 5   Height       457 non-null    object  
 6   Weight       457 non-null    float64 
 7   Weight (kg)  457 non-null    float64 
 8   College      457 non-null    object  
 9   Salary       457 non-null    int32   
dtypes: category(2), float64(3), int32(2), object(3)
memory usage: 30.9+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nba_clean['Team'] = nba_clean['Team'].astype('category')


In [37]:
# sorting values (default ascending)
# when calling on a series, Pandas knows which column to sort as there is only one
nba['Name'].sort_values()

152      Aaron Brooks
356      Aaron Gordon
328    Aaron Harrison
404     Adreian Payne
312        Al Horford
            ...      
270    Xavier Munford
402       Zach LaVine
271     Zach Randolph
237     Zaza Pachulia
457               NaN
Name: Name, Length: 458, dtype: object

In [38]:
# sorting values on a data frame
nba.sort_values(by = 'Name', ascending = True).head() # ascending is default but it is a best practice to be explicit

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
152,Aaron Brooks,Chicago Bulls,0.0,PG,31.0,6-0,161.0,73.0135,Oregon,2250000.0
356,Aaron Gordon,Orlando Magic,0.0,PF,20.0,6-9,220.0,99.77,Arizona,4171680.0
328,Aaron Harrison,Charlotte Hornets,9.0,SG,21.0,6-6,210.0,95.235,Kentucky,525093.0
404,Adreian Payne,Minnesota Timberwolves,33.0,PF,25.0,6-10,237.0,107.4795,Michigan State,1938840.0
312,Al Horford,Atlanta Hawks,15.0,C,30.0,6-10,245.0,111.1075,Florida,12000000.0


In [39]:
# sorting values on a data frame (descending)
nba.sort_values(by = 'Age', ascending = False).head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
304,Andre Miller,San Antonio Spurs,24.0,PG,40.0,6-3,200.0,90.7,Utah,250750.0
400,Kevin Garnett,Minnesota Timberwolves,21.0,PF,40.0,6-11,240.0,108.84,Unknown,8500000.0
298,Tim Duncan,San Antonio Spurs,21.0,C,40.0,6-11,250.0,113.375,Wake Forest,5250000.0
261,Vince Carter,Memphis Grizzlies,15.0,SG,39.0,6-6,220.0,99.77,North Carolina,4088019.0
102,Pablo Prigioni,Los Angeles Clippers,9.0,PG,39.0,6-3,185.0,83.8975,Unknown,947726.0


In [40]:
# sorting values on a data frame when there are NaN values
# re-import nba data frame to include NaN values that were previously removed
nba = pd.read_csv('data/nba.csv')
nba.sort_values(by = 'Salary', ascending = True, na_position = 'first').head() # na_position default is 'last'

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
46,Elton Brand,Philadelphia 76ers,42.0,PF,37.0,6-9,254.0,Duke,
171,Dahntay Jones,Cleveland Cavaliers,30.0,SG,35.0,6-6,225.0,Duke,
264,Jordan Farmar,Memphis Grizzlies,4.0,PG,29.0,6-2,180.0,UCLA,
269,Ray McCallum,Memphis Grizzlies,5.0,PG,24.0,6-3,190.0,Detroit,


In [41]:
# sorting values on a data frame across multiple columns
# when sorting by multiple columns, we need to provide a list of col names
nba_clean.sort_values(by = ['Team', 'Name'], ascending = True) # acending parameter will apply to both col names specified

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
312,Al Horford,Atlanta Hawks,15.0,C,30,6-10,245.0,111.1075,Florida,12000000
318,Dennis Schroder,Atlanta Hawks,17.0,PG,22,6-1,172.0,78.0020,Unknown,1763400
323,Jeff Teague,Atlanta Hawks,0.0,PG,27,6-2,186.0,84.3510,Wake Forest,8000000
309,Kent Bazemore,Atlanta Hawks,24.0,SF,26,6-5,201.0,91.1535,Old Dominion,2000000
311,Kirk Hinrich,Atlanta Hawks,12.0,SG,35,6-4,190.0,86.1650,Kansas,2854940
...,...,...,...,...,...,...,...,...,...,...
381,Marcus Thornton,Washington Wizards,15.0,SF,29,6-4,205.0,92.9675,LSU,200600
376,Markieff Morris,Washington Wizards,5.0,PF,26,6-10,245.0,111.1075,Kansas,8000000
375,Nene Hilario,Washington Wizards,42.0,C,33,6-11,250.0,113.3750,Unknown,13000000
378,Otto Porter Jr.,Washington Wizards,22.0,SF,23,6-8,198.0,89.7930,Georgetown,4662960


In [42]:
# sorting values on a data frame across multiple columns with different sort directions
# both lists must be the same length and will associate with the col names in order
# saving to original data frame name will overwrite and maintain sort order
nba_clean = nba_clean.sort_values(by = ['Team', 'Name'], ascending = [True, False])
nba_clean

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
322,Walter Tavares,Atlanta Hawks,22.0,C,24,7-3,260.0,117.9100,Unknown,1000000
310,Tim Hardaway Jr.,Atlanta Hawks,10.0,SG,24,6-6,205.0,92.9675,Michigan,1304520
321,Tiago Splitter,Atlanta Hawks,11.0,C,31,6-11,245.0,111.1075,Unknown,9756250
320,Thabo Sefolosha,Atlanta Hawks,25.0,SF,32,6-7,220.0,99.7700,Unknown,4000000
315,Paul Millsap,Atlanta Hawks,4.0,PF,31,6-8,246.0,111.5610,Louisiana Tech,18671659
...,...,...,...,...,...,...,...,...,...,...
374,JJ Hickson,Washington Wizards,21.0,C,27,6-9,242.0,109.7470,North Carolina State,273038
380,Garrett Temple,Washington Wizards,17.0,SG,30,6-6,195.0,88.4325,LSU,1100602
372,Drew Gooden,Washington Wizards,90.0,PF,34,6-10,250.0,113.3750,Kansas,3300000
369,Bradley Beal,Washington Wizards,3.0,SG,22,6-5,207.0,93.8745,Florida,5694674


In [43]:
# sorting a dataframe by index (using the previously sorted/saved nba_clean data frame so the index values are out of order)
nba_clean.sort_index(axis = 'index', ascending = True) # axis = 'columns' would sort the column headers (if you're in to that sort of thing)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25,6-2,180.0,81.6300,Texas,7730337
1,Jae Crowder,Boston Celtics,99.0,SF,25,6-6,235.0,106.5725,Marquette,6796117
2,John Holland,Boston Celtics,30.0,SG,27,6-5,205.0,92.9675,Boston University,4842684
3,R.J. Hunter,Boston Celtics,28.0,SG,22,6-5,185.0,83.8975,Georgia State,1148640
4,Jonas Jerebko,Boston Celtics,8.0,PF,29,6-10,231.0,104.7585,Unknown,5000000
...,...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41.0,PF,20,6-10,234.0,106.1190,Kentucky,2239800
453,Shelvin Mack,Utah Jazz,8.0,PG,26,6-3,203.0,92.0605,Butler,2433333
454,Raul Neto,Utah Jazz,25.0,PG,24,6-1,179.0,81.1765,Unknown,900000
455,Tibor Pleiss,Utah Jazz,21.0,C,26,7-3,256.0,116.0960,Unknown,2900000


In [44]:
# ranking values with rank method (using nba_clean since NaN values have been removed)
nba_clean['Salary'].rank(ascending = False).astype(int) # result is the rank by row index position

322    353
310    313
321     73
320    184
315     16
      ... 
374    436
380    341
372    209
369    125
368    184
Name: Salary, Length: 457, dtype: int32

In [45]:
# appending rank column to nba_clean data frame
nba_clean['Salary Rank'] = nba_clean['Salary'].rank(ascending = False).astype(int)

# output top 20 salaries
nba_clean.sort_values(by = 'Salary Rank', ascending = True).head(n = 10) # rank will repeat rankings for value ties

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,Weight (kg),College,Salary,Salary Rank
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37,6-6,212.0,96.142,Unknown,25000000,1
169,LeBron James,Cleveland Cavaliers,23.0,SF,31,6-8,250.0,113.375,Unknown,22970500,2
33,Carmelo Anthony,New York Knicks,7.0,SF,32,6-8,240.0,108.84,Syracuse,22875000,3
251,Dwight Howard,Houston Rockets,12.0,C,30,6-11,265.0,120.1775,Unknown,22359364,4
339,Chris Bosh,Miami Heat,1.0,PF,32,6-11,235.0,106.5725,Georgia Tech,22192730,5
100,Chris Paul,Los Angeles Clippers,3.0,PG,31,6-0,175.0,79.3625,Wake Forest,21468695,6
414,Kevin Durant,Oklahoma City Thunder,35.0,SF,27,6-9,240.0,108.84,Texas,20158622,7
164,Derrick Rose,Chicago Bulls,1.0,PG,27,6-3,190.0,86.165,Memphis,20093064,8
349,Dwyane Wade,Miami Heat,3.0,SG,34,6-4,220.0,99.77,Marquette,20000000,9
98,DeAndre Jordan,Los Angeles Clippers,6.0,C,27,6-11,265.0,120.1775,Texas A&M,19689000,11
