## INTRO TO DATAFRAMES

Two-dimensional, size-mutable, potentially heterogeneous tabular data.

Data structure also contains labeled axes (rows and columns). Arithmetic operations align on both row and column labels. Can be thought of as a dict-like container for Series objects. The primary pandas data structure.

In [2]:
import pandas as pd

In [7]:
nba=pd.read_csv('nba.csv')
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


### Methods and Attributes between Series and DataFrames

In [8]:
a=pd.Series([1,2,3,4,5,6])
a

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [9]:
a.head()

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [14]:
nba.head() #head and tail same as in series

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [11]:
nba.head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [12]:
nba.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [13]:
nba.tail(1)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
457,,,,,,,,,


In [15]:
a.index

RangeIndex(start=0, stop=6, step=1)

In [16]:
nba.index #range object for index

RangeIndex(start=0, stop=458, step=1)

In [17]:
a.values

array([1, 2, 3, 4, 5, 6], dtype=int64)

In [18]:
nba.values #multdimensional array  or lists storing lists

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [19]:
a.shape #1dimensional

(6,)

In [20]:
nba.shape #458 rows and 9 cols

(458, 9)

In [21]:
a.dtype

dtype('int64')

In [23]:
nba.dtypes #for each col we get datatype (heterogenous data present)

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [26]:
a.hasnans #exclusive to series - tells about missing values

False

In [28]:
nba.hasnans #this attribute not available for data frame

AttributeError: 'DataFrame' object has no attribute 'hasnans'

In [29]:
a.columns #not present for series as series has only 1col

AttributeError: 'Series' object has no attribute 'columns'

In [30]:
nba.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [31]:
a.axes

[RangeIndex(start=0, stop=6, step=1)]

In [32]:
nba.axes  #two indexes: two axes

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

In [33]:
a.info()

<class 'pandas.core.series.Series'>
RangeIndex: 6 entries, 0 to 5
Series name: None
Non-Null Count  Dtype
--------------  -----
6 non-null      int64
dtypes: int64(1)
memory usage: 176.0 bytes


In [34]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


### Difference between Shared Methods

In [35]:
pd.read_csv('revenue.csv')

Unnamed: 0,Date,New York,Los Angeles,Miami
0,1/1/16,985,122,499
1,1/2/16,738,788,534
2,1/3/16,14,20,933
3,1/4/16,730,904,885
4,1/5/16,114,71,253
5,1/6/16,936,502,497
6,1/7/16,123,996,115
7,1/8/16,935,492,886
8,1/9/16,846,954,823
9,1/10/16,54,285,216


In [36]:
# we want to set date col as index
revenue=pd.read_csv('revenue.csv',index_col='Date' )
revenue

Unnamed: 0_level_0,New York,Los Angeles,Miami
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/16,985,122,499
1/2/16,738,788,534
1/3/16,14,20,933
1/4/16,730,904,885
1/5/16,114,71,253
1/6/16,936,502,497
1/7/16,123,996,115
1/8/16,935,492,886
1/9/16,846,954,823
1/10/16,54,285,216


In [37]:
a.sum()

21

In [39]:
#df can have sum of col or of row
revenue.sum() #default behaviour is column 

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [43]:
revenue.sum(axis="index") #Axis for the function to be applied on.

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [44]:
revenue.sum(axis=0)

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [None]:
# if we want sum per row i.e. iterate row wise

In [40]:
revenue.sum(axis='columns') #column consideres as second axis

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

In [41]:
revenue.sum(axis=1)

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

### Select One column from a DataFrame

In [47]:
nba.Name #returns series

0      Avery Bradley
1        Jae Crowder
2       John Holland
3        R.J. Hunter
4      Jonas Jerebko
           ...      
453     Shelvin Mack
454        Raul Neto
455     Tibor Pleiss
456      Jeff Withey
457              NaN
Name: Name, Length: 458, dtype: object

In [50]:
type(nba.Name) #dataframe is a bunch of series implemented under the hood

pandas.core.series.Series

In [51]:
nba.First Name # we cant type cols with space

SyntaxError: invalid syntax (2196836854.py, line 1)

In [53]:
nba['Name'] #this syntax works everytime

0      Avery Bradley
1        Jae Crowder
2       John Holland
3        R.J. Hunter
4      Jonas Jerebko
           ...      
453     Shelvin Mack
454        Raul Neto
455     Tibor Pleiss
456      Jeff Withey
457              NaN
Name: Name, Length: 458, dtype: object

In [54]:
nba['Salary'] #pay attention to case of letters

0      7730337.0
1      6796117.0
2            NaN
3      1148640.0
4      5000000.0
         ...    
453    2433333.0
454     900000.0
455    2900000.0
456     947276.0
457          NaN
Name: Salary, Length: 458, dtype: float64

In [56]:
nba['salary'] #pay attention to case of letters

KeyError: 'salary'

In [57]:
nba['Salary'].head(7)

0     7730337.0
1     6796117.0
2           NaN
3     1148640.0
4     5000000.0
5    12000000.0
6     1170960.0
Name: Salary, dtype: float64

In [58]:
nba['Salary'].head(7).values #chaining

array([ 7730337.,  6796117.,       nan,  1148640.,  5000000., 12000000.,
        1170960.])

### Select Two or More Columns from a DataFrame

In [60]:
nba[['Name','Salary']] #we provide list of string 
#returns dataframe

Unnamed: 0,Name,Salary
0,Avery Bradley,7730337.0
1,Jae Crowder,6796117.0
2,John Holland,
3,R.J. Hunter,1148640.0
4,Jonas Jerebko,5000000.0
...,...,...
453,Shelvin Mack,2433333.0
454,Raul Neto,900000.0
455,Tibor Pleiss,2900000.0
456,Jeff Withey,947276.0


In [61]:
nba[['Team','Name']] #we want to display teams col first

Unnamed: 0,Team,Name
0,Boston Celtics,Avery Bradley
1,Boston Celtics,Jae Crowder
2,Boston Celtics,John Holland
3,Boston Celtics,R.J. Hunter
4,Boston Celtics,Jonas Jerebko
...,...,...
453,Utah Jazz,Shelvin Mack
454,Utah Jazz,Raul Neto
455,Utah Jazz,Tibor Pleiss
456,Utah Jazz,Jeff Withey


In [62]:
nba[['Salary','Team','Name']]

Unnamed: 0,Salary,Team,Name
0,7730337.0,Boston Celtics,Avery Bradley
1,6796117.0,Boston Celtics,Jae Crowder
2,,Boston Celtics,John Holland
3,1148640.0,Boston Celtics,R.J. Hunter
4,5000000.0,Boston Celtics,Jonas Jerebko
...,...,...,...
453,2433333.0,Utah Jazz,Shelvin Mack
454,900000.0,Utah Jazz,Raul Neto
455,2900000.0,Utah Jazz,Tibor Pleiss
456,947276.0,Utah Jazz,Jeff Withey


In [63]:
col_to_select=['Salary','Team','Name']
nba[col_to_select] 

Unnamed: 0,Salary,Team,Name
0,7730337.0,Boston Celtics,Avery Bradley
1,6796117.0,Boston Celtics,Jae Crowder
2,,Boston Celtics,John Holland
3,1148640.0,Boston Celtics,R.J. Hunter
4,5000000.0,Boston Celtics,Jonas Jerebko
...,...,...,...
453,2433333.0,Utah Jazz,Shelvin Mack
454,900000.0,Utah Jazz,Raul Neto
455,2900000.0,Utah Jazz,Tibor Pleiss
456,947276.0,Utah Jazz,Jeff Withey


### Add New Column to DataFrame

In [64]:
nba['Sport'] #this col doesnt exist will give error

KeyError: 'Sport'

In [65]:
nba['Sport']='Basketball' #modifies original dataframe

In [66]:
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,Basketball


In [67]:
nba['League']='National Basketball Association'

In [68]:
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,National Basketball Association
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,National Basketball Association
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball,National Basketball Association
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball,National Basketball Association
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,Basketball,National Basketball Association


In [73]:
nba.insert(loc=3,column='newSport',value='BasketBall')
# nba.insert(
#     loc: 'int',
#     column: 'Hashable',
#     value: 'Scalar | AnyArrayLike',
#     allow_duplicates: 'bool | lib.NoDefault' = <no_default>,
# ) -> 'None'
# Docstring:
# Insert column into DataFrame at specified location.


ValueError: cannot insert newSport, already exists

In [74]:
nba.head()

Unnamed: 0,Name,Team,Number,newSport,Position,Age,Height,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,BasketBall,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,National Basketball Association
1,Jae Crowder,Boston Celtics,99.0,BasketBall,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,National Basketball Association
2,John Holland,Boston Celtics,30.0,BasketBall,SG,27.0,6-5,205.0,Boston University,,Basketball,National Basketball Association
3,R.J. Hunter,Boston Celtics,28.0,BasketBall,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball,National Basketball Association
4,Jonas Jerebko,Boston Celtics,8.0,BasketBall,PF,29.0,6-10,231.0,,5000000.0,Basketball,National Basketball Association


In [76]:
nba.insert(loc=7, column='Newleague', value='National Basketball Association')
nba.head()

Unnamed: 0,Name,Team,Number,newSport,Position,Age,Height,Newleague,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,BasketBall,PG,25.0,6-2,National Basketball Association,180.0,Texas,7730337.0,Basketball,National Basketball Association
1,Jae Crowder,Boston Celtics,99.0,BasketBall,SF,25.0,6-6,National Basketball Association,235.0,Marquette,6796117.0,Basketball,National Basketball Association
2,John Holland,Boston Celtics,30.0,BasketBall,SG,27.0,6-5,National Basketball Association,205.0,Boston University,,Basketball,National Basketball Association
3,R.J. Hunter,Boston Celtics,28.0,BasketBall,SG,22.0,6-5,National Basketball Association,185.0,Georgia State,1148640.0,Basketball,National Basketball Association
4,Jonas Jerebko,Boston Celtics,8.0,BasketBall,PF,29.0,6-10,National Basketball Association,231.0,,5000000.0,Basketball,National Basketball Association


In [78]:
nba.DotLeague='new basketball association' #this doesn't works

In [79]:
nba.head()

Unnamed: 0,Name,Team,Number,newSport,Position,Age,Height,Newleague,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,BasketBall,PG,25.0,6-2,National Basketball Association,180.0,Texas,7730337.0,Basketball,new basketball association
1,Jae Crowder,Boston Celtics,99.0,BasketBall,SF,25.0,6-6,National Basketball Association,235.0,Marquette,6796117.0,Basketball,new basketball association
2,John Holland,Boston Celtics,30.0,BasketBall,SG,27.0,6-5,National Basketball Association,205.0,Boston University,,Basketball,new basketball association
3,R.J. Hunter,Boston Celtics,28.0,BasketBall,SG,22.0,6-5,National Basketball Association,185.0,Georgia State,1148640.0,Basketball,new basketball association
4,Jonas Jerebko,Boston Celtics,8.0,BasketBall,PF,29.0,6-10,National Basketball Association,231.0,,5000000.0,Basketball,new basketball association


### Create New Column From Existing Column

In [80]:
nba_df=pd.read_csv('nba.csv')

In [86]:
nba_df.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [87]:
nba_df['Age']+10 ##added 10 to each age

0      35.0
1      35.0
2      37.0
3      32.0
4      39.0
       ... 
453    36.0
454    34.0
455    36.0
456    36.0
457     NaN
Name: Age, Length: 458, dtype: float64

In [88]:
nba_df['Age in a Decade']=nba['Age']+10

In [89]:
nba_df.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Age in a Decade
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,35.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,35.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,37.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,32.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,39.0


In [90]:
nba_df['Salary']

0      7730337.0
1      6796117.0
2            NaN
3      1148640.0
4      5000000.0
         ...    
453    2433333.0
454     900000.0
455    2900000.0
456     947276.0
457          NaN
Name: Salary, Length: 458, dtype: float64

In [91]:
nba_df['new salary']=nba_df['Salary'].sub(500)
nba_df.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Age in a Decade,new salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,35.0,7729837.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,35.0,6795617.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,37.0,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,32.0,1148140.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,39.0,4999500.0


In [93]:
#convert weights from pounds to kgs
#nba_df['Weight'].mul(0.4535) or nba['Weight']*04535
nba_df['Weight in Kgs']=nba_df['Weight'].mul(0.4535)
#we can even overwrite existing columns
#nba_df['Weight']=nba_df['Weight'].mul(0.4535) #this works well
nba_df.head() 

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Age in a Decade,new salary,Weight in Kgs
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,35.0,7729837.0,81.63
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,35.0,6795617.0,106.5725
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,37.0,,92.9675
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,32.0,1148140.0,83.8975
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,39.0,4999500.0,104.7585


### A Review of the value_counts Method

In [94]:
nba.head()

Unnamed: 0,Name,Team,Number,newSport,Position,Age,Height,Newleague,Weight,College,Salary,Sport,League,Age in a Decade
0,Avery Bradley,Boston Celtics,0.0,BasketBall,PG,25.0,6-2,National Basketball Association,180.0,Texas,7730337.0,Basketball,new basketball association,35.0
1,Jae Crowder,Boston Celtics,99.0,BasketBall,SF,25.0,6-6,National Basketball Association,235.0,Marquette,6796117.0,Basketball,new basketball association,35.0
2,John Holland,Boston Celtics,30.0,BasketBall,SG,27.0,6-5,National Basketball Association,205.0,Boston University,,Basketball,new basketball association,37.0
3,R.J. Hunter,Boston Celtics,28.0,BasketBall,SG,22.0,6-5,National Basketball Association,185.0,Georgia State,1148640.0,Basketball,new basketball association,32.0
4,Jonas Jerebko,Boston Celtics,8.0,BasketBall,PF,29.0,6-10,National Basketball Association,231.0,,5000000.0,Basketball,new basketball association,39.0


In [101]:
nba.drop(['Age in a Decade'],axis=1,inplace=True)

In [102]:
nba.drop(['newSport','Newleague','League','Sport'],axis=1,inplace=True)


In [103]:
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [105]:
nba.value_counts() #useless like this

Name                    Team                   Number  Position  Age   Height  Weight  College         Salary    
Aaron Brooks            Chicago Bulls          0.0     PG        31.0  6-0     161.0   Oregon          2250000.0     1
Mike Muscala            Atlanta Hawks          31.0    PF        24.0  6-11    240.0   Bucknell        947276.0      1
Mike Dunleavy           Chicago Bulls          34.0    SG        35.0  6-9     230.0   Duke            4500000.0     1
Mike Conley             Memphis Grizzlies      11.0    PG        28.0  6-1     175.0   Ohio State      9588426.0     1
Michael Kidd-Gilchrist  Charlotte Hornets      14.0    SF        22.0  6-7     232.0   Kentucky        6331404.0     1
                                                                                                                    ..
Hassan Whiteside        Miami Heat             21.0    C         26.0  7-0     265.0   Marshall        981348.0      1
Harrison Barnes         Golden State Warriors  40.0  

In [106]:
nba['Position'].value_counts() #for a particular column

SG    102
PF    100
PG     92
SF     85
C      78
Name: Position, dtype: int64

In [107]:
nba['Position'].value_counts(normalize=True)

SG    0.223195
PF    0.218818
PG    0.201313
SF    0.185996
C     0.170678
Name: Position, dtype: float64

In [108]:
nba['Position'].value_counts(normalize=True)*100

SG    22.319475
PF    21.881838
PG    20.131291
SF    18.599562
C     17.067834
Name: Position, dtype: float64

In [109]:
nba['Salary'].value_counts()

947276.0      31
845059.0      18
525093.0      13
981348.0       6
16407500.0     5
              ..
2100000.0      1
1252440.0      1
2891760.0      1
3272091.0      1
900000.0       1
Name: Salary, Length: 309, dtype: int64

# Drop Rows With Null Values

In [111]:
nba.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [112]:
#last row is all empty
#we need to remove this
nba.dropna() #delete Nan values , remove rows with missing values
# removes any row having any colmn having missing value

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,6-8,206.0,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


In [114]:
nba.dropna(how='any') #any row has any col with missing values

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,6-8,206.0,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


In [115]:
# we want to be stricter we want to remove rows having all cols values as nan
nba.dropna(how='all')

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0


In [116]:
#somewhere in between
#we want to remove rows having missing value in College Column
nba.dropna(subset=['College']) #drops rows having Nan in college column


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,6-8,206.0,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


In [117]:
#for two cols- 
nba.dropna(subset=['College','Salary'])

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,6-8,206.0,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


In [118]:
#dropna returns new dataframe- we can store that, doesnt modify original dataframe

### FIll in Missing DataFrame Values with the Fillna Method

In [119]:
nba.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [120]:
nba.fillna(0)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,0,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,0,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,0,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [123]:
#lesser the missing values better the analysis
nba['College'].fillna('Unknown') #change isnt inplace i.e. a new dataframe is returned that can be stored
#to modify original df do inplace=True



0                  Texas
1              Marquette
2      Boston University
3          Georgia State
4                Unknown
             ...        
453               Butler
454              Unknown
455              Unknown
456               Kansas
457              Unknown
Name: College, Length: 458, dtype: object

In [124]:
nba.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [125]:
# inplace is not recommended,to us it appears original df is modified under the hood pandas is still creating a copy
#hence memory wastage is there

In [126]:
nba['Salary'].fillna(value=0,inplace=True)

In [127]:
nba.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,0.0


In [133]:
nba['Salary']=nba['Salary'].fillna(value=0)

In [134]:
nba.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,0.0


### The astype Method

In [142]:
nba_new=pd.read_csv('nba.csv').dropna(how='all')
nba_new.tail()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [146]:
nba_new['Age'].hasnans #if the col has missing values any type of conversion will fail

False

In [148]:
nba_new['Age'].astype('int') #this hasn't modified original df till now

0      25
1      25
2      27
3      22
4      29
       ..
452    20
453    26
454    24
455    26
456    26
Name: Age, Length: 457, dtype: int32

In [150]:
nba_new['Age'].astype(int) #this also works as int in python is a reserved keyword

0      25
1      25
2      27
3      22
4      29
       ..
452    20
453    26
454    24
455    26
456    26
Name: Age, Length: 457, dtype: int32

In [151]:
nba_new['Age'].tail()

452    20.0
453    26.0
454    24.0
455    26.0
456    26.0
Name: Age, dtype: float64

In [153]:
# in nba salaray col there is no missing values
print(nba['Salary'].hasnans)
nba['Salary'].astype(int).astype('str')

False


0      7730337
1      6796117
2            0
3      1148640
4      5000000
        ...   
453    2433333
454     900000
455    2900000
456     947276
457          0
Name: Salary, Length: 458, dtype: object

### The astype Method II

In [154]:
nba.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [158]:
len(nba_new)

457

In [160]:
nba_new['Position'].unique() #only 5 unique values
#these are repeated again and again
#this can be made as category column as it significantly reduces memory consumption
#what pandas does under the hood is rather than replicating or duplicating values over and over again
#it simply creates one copy of each unique value and then references that one unique copy 
#each time that value is referenced in a given column
#there is no visual difference when we're looking at the dataframe but behind the scenes pandas employs memory cutting initiative and basically allows to reduce how much memory the dataset takes up on your computer.

array(['PG', 'SF', 'SG', 'PF', 'C'], dtype=object)

In [161]:
nba_new['Position'].nunique() 

5

In [164]:
nba_new.info() #memory usage

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 35.7+ KB


In [165]:
nba_new['Position'].astype('category')

0      PG
1      SF
2      SG
3      SG
4      PF
       ..
452    PF
453    PG
454    PG
455     C
456     C
Name: Position, Length: 457, dtype: category
Categories (5, object): ['C', 'PF', 'PG', 'SF', 'SG']

In [166]:
nba_new['Position']=nba_new['Position'].astype('category')

In [167]:
nba_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Name      457 non-null    object  
 1   Team      457 non-null    object  
 2   Number    457 non-null    float64 
 3   Position  457 non-null    category
 4   Age       457 non-null    float64 
 5   Height    457 non-null    object  
 6   Weight    457 non-null    float64 
 7   College   373 non-null    object  
 8   Salary    446 non-null    float64 
dtypes: category(1), float64(4), object(4)
memory usage: 32.8+ KB


In [169]:
nba['Team'].nunique()

30

In [170]:
nba['Team']=nba['Team'].astype('category')

In [171]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Name      457 non-null    object  
 1   Team      457 non-null    category
 2   Number    457 non-null    float64 
 3   Position  457 non-null    object  
 4   Age       457 non-null    float64 
 5   Height    457 non-null    object  
 6   Weight    457 non-null    float64 
 7   College   373 non-null    object  
 8   Salary    458 non-null    float64 
dtypes: category(1), float64(4), object(4)
memory usage: 30.5+ KB


### Sort a DataFrame with the sort_values Method I

In [172]:
nba['Name'].sort_values() #sorts in ascending order

152      Aaron Brooks
356      Aaron Gordon
328    Aaron Harrison
404     Adreian Payne
312        Al Horford
            ...      
270    Xavier Munford
402       Zach LaVine
271     Zach Randolph
237     Zaza Pachulia
457               NaN
Name: Name, Length: 458, dtype: object

In [173]:
nba['Salary'].sort_values() #same as sorting series as single colmn sorted


457           0.0
2             0.0
397           0.0
171           0.0
409           0.0
          ...    
339    22192730.0
251    22359364.0
33     22875000.0
169    22970500.0
109    25000000.0
Name: Salary, Length: 458, dtype: float64

In [174]:
#to sort the df
nba.sort_values(by='Name') #by default ascending

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
152,Aaron Brooks,Chicago Bulls,0.0,PG,31.0,6-0,161.0,Oregon,2250000.0
356,Aaron Gordon,Orlando Magic,0.0,PF,20.0,6-9,220.0,Arizona,4171680.0
328,Aaron Harrison,Charlotte Hornets,9.0,SG,21.0,6-6,210.0,Kentucky,525093.0
404,Adreian Payne,Minnesota Timberwolves,33.0,PF,25.0,6-10,237.0,Michigan State,1938840.0
312,Al Horford,Atlanta Hawks,15.0,C,30.0,6-10,245.0,Florida,12000000.0
...,...,...,...,...,...,...,...,...,...
270,Xavier Munford,Memphis Grizzlies,14.0,PG,24.0,6-3,180.0,Rhode Island,0.0
402,Zach LaVine,Minnesota Timberwolves,8.0,PG,21.0,6-5,189.0,UCLA,2148360.0
271,Zach Randolph,Memphis Grizzlies,50.0,PF,34.0,6-9,260.0,Michigan State,9638555.0
237,Zaza Pachulia,Dallas Mavericks,27.0,C,32.0,6-11,275.0,,5200000.0


In [175]:
nba.sort_values(by='Name',ascending=False) #sorts in descending order

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
237,Zaza Pachulia,Dallas Mavericks,27.0,C,32.0,6-11,275.0,,5200000.0
271,Zach Randolph,Memphis Grizzlies,50.0,PF,34.0,6-9,260.0,Michigan State,9638555.0
402,Zach LaVine,Minnesota Timberwolves,8.0,PG,21.0,6-5,189.0,UCLA,2148360.0
270,Xavier Munford,Memphis Grizzlies,14.0,PG,24.0,6-3,180.0,Rhode Island,0.0
386,Wilson Chandler,Denver Nuggets,21.0,SF,29.0,6-8,225.0,DePaul,10449438.0
...,...,...,...,...,...,...,...,...,...
404,Adreian Payne,Minnesota Timberwolves,33.0,PF,25.0,6-10,237.0,Michigan State,1938840.0
328,Aaron Harrison,Charlotte Hornets,9.0,SG,21.0,6-6,210.0,Kentucky,525093.0
356,Aaron Gordon,Orlando Magic,0.0,PF,20.0,6-9,220.0,Arizona,4171680.0
152,Aaron Brooks,Chicago Bulls,0.0,PG,31.0,6-0,161.0,Oregon,2250000.0


In [176]:
nba.sort_values(by='Age')

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
226,Rashad Vaughn,Milwaukee Bucks,20.0,SG,19.0,6-6,202.0,UNLV,1733040.0
122,Devin Booker,Phoenix Suns,1.0,SG,19.0,6-6,206.0,Kentucky,2127840.0
40,Kristaps Porzingis,New York Knicks,6.0,PF,20.0,7-3,240.0,,4131720.0
401,Tyus Jones,Minnesota Timberwolves,1.0,PG,20.0,6-2,195.0,Duke,1282080.0
427,Cliff Alexander,Portland Trail Blazers,34.0,PF,20.0,6-8,240.0,Kansas,525093.0
...,...,...,...,...,...,...,...,...,...
102,Pablo Prigioni,Los Angeles Clippers,9.0,PG,39.0,6-3,185.0,,947726.0
298,Tim Duncan,San Antonio Spurs,21.0,C,40.0,6-11,250.0,Wake Forest,5250000.0
400,Kevin Garnett,Minnesota Timberwolves,21.0,PF,40.0,6-11,240.0,,8500000.0
304,Andre Miller,San Antonio Spurs,24.0,PG,40.0,6-3,200.0,Utah,250750.0


In [177]:
#nan at last by default, if we want nan at first
nba.sort_values('Salary',na_position='first')

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
457,,,,,,,,,0.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
397,Axel Toupane,Denver Nuggets,6.0,SG,23.0,6-7,210.0,,0.0
171,Dahntay Jones,Cleveland Cavaliers,30.0,SG,35.0,6-6,225.0,Duke,0.0
409,Greg Smith,Minnesota Timberwolves,4.0,PF,25.0,6-10,250.0,Fresno State,0.0
...,...,...,...,...,...,...,...,...,...
339,Chris Bosh,Miami Heat,1.0,PF,32.0,6-11,235.0,Georgia Tech,22192730.0
251,Dwight Howard,Houston Rockets,12.0,C,30.0,6-11,265.0,,22359364.0
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000.0
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500.0


In [178]:
nba.sort_values('Salary',na_position='first',ascending=False)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000.0
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500.0
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000.0
251,Dwight Howard,Houston Rockets,12.0,C,30.0,6-11,265.0,,22359364.0
339,Chris Bosh,Miami Heat,1.0,PF,32.0,6-11,235.0,Georgia Tech,22192730.0
...,...,...,...,...,...,...,...,...,...
269,Ray McCallum,Memphis Grizzlies,5.0,PG,24.0,6-3,190.0,Detroit,0.0
409,Greg Smith,Minnesota Timberwolves,4.0,PF,25.0,6-10,250.0,Fresno State,0.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
264,Jordan Farmar,Memphis Grizzlies,4.0,PG,29.0,6-2,180.0,UCLA,0.0


### Sort a DataFrame with the sort_values Method II

In [179]:
# we want to sort df first by team and then by name

In [180]:
nba.sort_values(by=['Team','Name'])
# first sorted by team , then for every identical team value name is sorted 


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
312,Al Horford,Atlanta Hawks,15.0,C,30.0,6-10,245.0,Florida,12000000.0
318,Dennis Schroder,Atlanta Hawks,17.0,PG,22.0,6-1,172.0,,1763400.0
323,Jeff Teague,Atlanta Hawks,0.0,PG,27.0,6-2,186.0,Wake Forest,8000000.0
309,Kent Bazemore,Atlanta Hawks,24.0,SF,26.0,6-5,201.0,Old Dominion,2000000.0
311,Kirk Hinrich,Atlanta Hawks,12.0,SG,35.0,6-4,190.0,Kansas,2854940.0
...,...,...,...,...,...,...,...,...,...
376,Markieff Morris,Washington Wizards,5.0,PF,26.0,6-10,245.0,Kansas,8000000.0
375,Nene Hilario,Washington Wizards,42.0,C,33.0,6-11,250.0,,13000000.0
378,Otto Porter Jr.,Washington Wizards,22.0,SF,23.0,6-8,198.0,Georgetown,4662960.0
379,Ramon Sessions,Washington Wizards,7.0,PG,30.0,6-3,190.0,Nevada,2170465.0


In [181]:
nba.sort_values(by=['Team','Name'],ascending=False)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
379,Ramon Sessions,Washington Wizards,7.0,PG,30.0,6-3,190.0,Nevada,2170465.0
378,Otto Porter Jr.,Washington Wizards,22.0,SF,23.0,6-8,198.0,Georgetown,4662960.0
375,Nene Hilario,Washington Wizards,42.0,C,33.0,6-11,250.0,,13000000.0
376,Markieff Morris,Washington Wizards,5.0,PF,26.0,6-10,245.0,Kansas,8000000.0
381,Marcus Thornton,Washington Wizards,15.0,SF,29.0,6-4,205.0,LSU,200600.0
...,...,...,...,...,...,...,...,...,...
309,Kent Bazemore,Atlanta Hawks,24.0,SF,26.0,6-5,201.0,Old Dominion,2000000.0
323,Jeff Teague,Atlanta Hawks,0.0,PG,27.0,6-2,186.0,Wake Forest,8000000.0
318,Dennis Schroder,Atlanta Hawks,17.0,PG,22.0,6-1,172.0,,1763400.0
312,Al Horford,Atlanta Hawks,15.0,C,30.0,6-10,245.0,Florida,12000000.0


In [182]:
#if we want to sort team in ascending order but name in descending
nba.sort_values(by=['Team','Name'],ascending=[True,False])

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
322,Walter Tavares,Atlanta Hawks,22.0,C,24.0,7-3,260.0,,1000000.0
310,Tim Hardaway Jr.,Atlanta Hawks,10.0,SG,24.0,6-6,205.0,Michigan,1304520.0
321,Tiago Splitter,Atlanta Hawks,11.0,C,31.0,6-11,245.0,,9756250.0
320,Thabo Sefolosha,Atlanta Hawks,25.0,SF,32.0,6-7,220.0,,4000000.0
315,Paul Millsap,Atlanta Hawks,4.0,PF,31.0,6-8,246.0,Louisiana Tech,18671659.0
...,...,...,...,...,...,...,...,...,...
380,Garrett Temple,Washington Wizards,17.0,SG,30.0,6-6,195.0,LSU,1100602.0
372,Drew Gooden,Washington Wizards,90.0,PF,34.0,6-10,250.0,Kansas,3300000.0
369,Bradley Beal,Washington Wizards,3.0,SG,22.0,6-5,207.0,Florida,5694674.0
368,Alan Anderson,Washington Wizards,6.0,SG,33.0,6-6,220.0,Michigan State,4000000.0


In [183]:
nba_new.sort_values(by=['Position','Salary'],ascending=[True,True])

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
135,Alan Williams,Phoenix Suns,15.0,C,23.0,6-8,260.0,UC Santa Barbara,83397.0
420,Nazr Mohammed,Oklahoma City Thunder,13.0,C,38.0,6-10,250.0,Kentucky,222888.0
374,JJ Hickson,Washington Wizards,21.0,C,27.0,6-9,242.0,North Carolina State,273038.0
235,Salah Mejri,Dallas Mavericks,50.0,C,29.0,7-2,245.0,,525093.0
107,Tarik Black,Los Angeles Lakers,28.0,C,24.0,6-9,250.0,Kansas,845059.0
...,...,...,...,...,...,...,...,...,...
233,Wesley Matthews,Dallas Mavericks,23.0,SG,29.0,6-5,220.0,Marquette,16407500.0
349,Dwyane Wade,Miami Heat,3.0,SG,34.0,6-4,220.0,Marquette,20000000.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
171,Dahntay Jones,Cleveland Cavaliers,30.0,SG,35.0,6-6,225.0,Duke,


In [184]:
nba_new.sort_values(by=['Position','Salary'],ascending=[False,True])

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
291,Orlando Johnson,New Orleans Pelicans,0.0,SG,27.0,6-5,220.0,UC Santa Barbara,55722.0
175,Jordan McRae,Cleveland Cavaliers,12.0,SG,25.0,6-5,179.0,Tennessee,111196.0
21,Sean Kilpatrick,Brooklyn Nets,6.0,SG,26.0,6-4,219.0,Cincinnati,134215.0
45,Tony Wroten,New York Knicks,5.0,SG,23.0,6-6,205.0,Washington,167406.0
282,Bryce Dejean-Jones,New Orleans Pelicans,31.0,SG,23.0,6-6,203.0,Iowa State,169883.0
...,...,...,...,...,...,...,...,...,...
418,Enes Kanter,Oklahoma City Thunder,11.0,C,24.0,6-11,245.0,Kentucky,16407500.0
265,Marc Gasol,Memphis Grizzlies,33.0,C,31.0,7-1,255.0,,19688000.0
23,Brook Lopez,Brooklyn Nets,11.0,C,28.0,7-0,275.0,Stanford,19689000.0
98,DeAndre Jordan,Los Angeles Clippers,6.0,C,27.0,6-11,265.0,Texas A&M,19689000.0


### Sort a DataFrame by its Index

In [185]:
nba=nba.sort_values(['Team','Name'])

In [186]:
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
312,Al Horford,Atlanta Hawks,15.0,C,30.0,6-10,245.0,Florida,12000000.0
318,Dennis Schroder,Atlanta Hawks,17.0,PG,22.0,6-1,172.0,,1763400.0
323,Jeff Teague,Atlanta Hawks,0.0,PG,27.0,6-2,186.0,Wake Forest,8000000.0
309,Kent Bazemore,Atlanta Hawks,24.0,SF,26.0,6-5,201.0,Old Dominion,2000000.0
311,Kirk Hinrich,Atlanta Hawks,12.0,SG,35.0,6-4,190.0,Kansas,2854940.0


In [189]:
nba.sort_index() #inplace parameter to sort in place

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [188]:
nba.sort_index(ascending=False) 

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
457,,,,,,,,,0.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
...,...,...,...,...,...,...,...,...,...
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0


### Rank Values with the rank Method

In [204]:
# we want to assign highest salary as rank1 and second highest as rank2
# and so on ...
nba_new['Salary']=nba_new['Salary'].fillna(0).astype(int)

In [205]:
nba_new.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000


In [206]:
nba_new['Salary'].rank()

0      361.0
1      348.0
2        6.0
3      136.0
4      311.0
       ...  
452    208.0
453    217.0
454     75.0
455    243.5
456     91.0
Name: Salary, Length: 457, dtype: float64

In [211]:
#lowest salary rank1 and highest rank largest
nba_new['Salary'].rank(ascending=True)

0      361.0
1      348.0
2        6.0
3      136.0
4      311.0
       ...  
452    208.0
453    217.0
454     75.0
455    243.5
456     91.0
Name: Salary, Length: 457, dtype: float64

In [212]:
#we want to assign highest salary as rank1
nba_new['Salary'].rank(ascending=False).astype('int')

0       97
1      110
2      452
3      322
4      147
      ... 
452    250
453    241
454    383
455    214
456    367
Name: Salary, Length: 457, dtype: int32

In [213]:
nba_new['Salary rank']=nba_new['Salary'].rank(ascending=False).astype('int')

In [214]:
nba_new.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary rank
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337,97
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117,110
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0,452
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640,322
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000,147


In [218]:
nba_new.sort_values('Salary',ascending=False)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary rank
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000,1
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500,2
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000,3
251,Dwight Howard,Houston Rockets,12.0,C,30.0,6-11,265.0,,22359364,4
339,Chris Bosh,Miami Heat,1.0,PF,32.0,6-11,235.0,Georgia Tech,22192730,5
...,...,...,...,...,...,...,...,...,...,...
353,Dorell Wright,Miami Heat,11.0,SF,30.0,6-9,205.0,,0,452
264,Jordan Farmar,Memphis Grizzlies,4.0,PG,29.0,6-2,180.0,UCLA,0,452
409,Greg Smith,Minnesota Timberwolves,4.0,PF,25.0,6-10,250.0,Fresno State,0,452
273,Alex Stepheson,Memphis Grizzlies,35.0,PF,28.0,6-10,270.0,USC,0,452
