# Frequency Distributions

### Exploring the Data

The data set is about basketball players in WNBA (Women's National Basketball Association), and contains general information about players, along with their metrics for the season 2016-2017.

In [5]:
import pandas as pd
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50

In [6]:
wnba = pd.read_csv('wnba.csv')

In [7]:
wnba.shape

(143, 32)

In [9]:
wnba

Unnamed: 0,Name,Team,Pos,Height,Weight,BMI,Birth_Place,Birthdate,Age,College,Experience,Games Played,MIN,FGM,FGA,FG%,15:00,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TO,PTS,DD2,TD3
0,Aerial Powers,DAL,F,183,71.0,21.200991,US,"January 17, 1994",23,Michigan State,2,8,173,30,85,35.3,12,32,37.5,21,26,80.8,6,22,28,12,3,6,12,93,0,0
1,Alana Beard,LA,G/F,185,73.0,21.329438,US,"May 14, 1982",35,Duke,12,30,947,90,177,50.8,5,18,27.8,32,41,78.0,19,82,101,72,63,13,40,217,0,0
2,Alex Bentley,CON,G,170,69.0,23.875433,US,"October 27, 1990",26,Penn State,4,26,617,82,218,37.6,19,64,29.7,35,42,83.3,4,36,40,78,22,3,24,218,0,0
3,Alex Montgomery,SAN,G/F,185,84.0,24.543462,US,"December 11, 1988",28,Georgia Tech,6,31,721,75,195,38.5,21,68,30.9,17,21,81.0,35,134,169,65,20,10,38,188,2,0
4,Alexis Jones,MIN,G,175,78.0,25.469388,US,"August 5, 1994",23,Baylor,R,24,137,16,50,32.0,7,20,35.0,11,12,91.7,3,9,12,12,7,0,14,50,0,0
5,Alexis Peterson,SEA,G,170,63.0,21.799308,US,"June 20, 1995",22,Syracuse,R,14,90,9,34,26.5,2,9,22.2,6,6,100.0,3,13,16,11,5,0,11,26,0,0
6,Alexis Prince,PHO,G,188,81.0,22.91761,US,"February 5, 1994",23,Baylor,R,16,112,9,34,26.5,4,15,26.7,2,2,100.0,1,14,15,5,4,3,3,24,0,0
7,Allie Quigley,CHI,G,178,64.0,20.19947,US,"June 20, 1986",31,DePaul,8,26,847,166,319,52.0,70,150,46.7,40,46,87.0,9,83,92,95,20,13,59,442,0,0
8,Allisha Gray,DAL,G,185,76.0,22.20599,US,"October 20, 1992",24,South Carolina,2,30,834,131,346,37.9,29,103,28.2,104,129,80.6,52,75,127,40,47,19,37,395,0,0
9,Allison Hightower,WAS,G,178,77.0,24.302487,US,"June 4, 1988",29,LSU,5,7,103,14,38,36.8,2,11,18.2,6,6,100.0,3,7,10,10,5,0,2,36,0,0


### Frequency Distribution Tables

In [12]:
freq_distro_pos = wnba['Pos'].value_counts()
freq_distro_pos

G      60
F      33
C      25
G/F    13
F/C    12
Name: Pos, dtype: int64

In [13]:
freq_distro_height = wnba['Height'].value_counts()
freq_distro_height

188    20
193    18
175    16
185    15
191    11
183    11
173    11
196     9
178     8
180     7
170     6
198     5
201     2
168     2
206     1
165     1
Name: Height, dtype: int64

In [14]:
wnba['Height'].value_counts().sort_index()

165     1
168     2
170     6
173    11
175    16
178     8
180     7
183    11
185    15
188    20
191    11
193    18
196     9
198     5
201     2
206     1
Name: Height, dtype: int64

In [15]:
wnba['Height'].value_counts().sort_index(ascending = False)

206     1
201     2
198     5
196     9
193    18
191    11
188    20
185    15
183    11
180     7
178     8
175    16
173    11
170     6
168     2
165     1
Name: Height, dtype: int64

In [17]:
age_ascending = wnba['Age'].value_counts().sort_index()
age_ascending

21     2
22    10
23    15
24    16
25    15
26    12
27    13
28    14
29     8
30     9
31     8
32     8
33     3
34     5
35     4
36     1
Name: Age, dtype: int64

In [18]:
age_descending = wnba['Age'].value_counts().sort_index(ascending=False)
age_descending

36     1
35     4
34     5
33     3
32     8
31     8
30     9
29     8
28    14
27    13
26    12
25    15
24    16
23    15
22    10
21     2
Name: Age, dtype: int64

### Sorting tables for ordinal variables

In [20]:
def make_pts_ordinal(row):
    if row['PTS'] <= 20:
        return 'very few points'
    if (20 < row['PTS'] <=  80):
        return 'few points'
    if (80 < row['PTS'] <=  150):
        return 'many, but below average'
    if (150 < row['PTS'] <= 300):
        return 'average number of points'
    if (300 < row['PTS'] <=  450):
        return 'more than average'
    else:
        return 'much more than average'

In [21]:
wnba['PTS_ordinal_scale'] = wnba.apply(make_pts_ordinal, axis = 1)

In [24]:
wnba['PTS_ordinal_scale'].head(10)

0     many, but below average
1    average number of points
2    average number of points
3    average number of points
4                  few points
5                  few points
6                  few points
7           more than average
8           more than average
9                  few points
Name: PTS_ordinal_scale, dtype: object

In [27]:
wnba['PTS_ordinal_scale'].value_counts()

average number of points    45
few points                  27
many, but below average     25
more than average           21
much more than average      13
very few points             12
Name: PTS_ordinal_scale, dtype: int64

In [28]:
pts_ordinal_desc = wnba['PTS_ordinal_scale'].value_counts().iloc[[4, 3, 0, 2, 1, 5]]
pts_ordinal_desc

much more than average      13
more than average           21
average number of points    45
many, but below average     25
few points                  27
very few points             12
Name: PTS_ordinal_scale, dtype: int64

### Relative Frequencies

In [34]:
# Proportions

wnba['Pos'].value_counts() / len(wnba)

G      0.419580
F      0.230769
C      0.174825
G/F    0.090909
F/C    0.083916
Name: Pos, dtype: float64

In [35]:
wnba['Pos'].value_counts(normalize = True)

G      0.419580
F      0.230769
C      0.174825
G/F    0.090909
F/C    0.083916
Name: Pos, dtype: float64

In [36]:
# Percentages

wnba['Pos'].value_counts(normalize = True) * 100

G      41.958042
F      23.076923
C      17.482517
G/F     9.090909
F/C     8.391608
Name: Pos, dtype: float64

In [37]:
age_percentage = wnba['Age'].value_counts(normalize = True).sort_index() * 100
age_percentage

21     1.398601
22     6.993007
23    10.489510
24    11.188811
25    10.489510
26     8.391608
27     9.090909
28     9.790210
29     5.594406
30     6.293706
31     5.594406
32     5.594406
33     2.097902
34     3.496503
35     2.797203
36     0.699301
Name: Age, dtype: float64

In [41]:
percentage_25 = age_percentage[25].round(2)
percentage_30 = age_percentage[30].round(2)
percentage_over_30 = age_percentage.loc[30:].round(2)
percentage_below_23 = age_percentage.loc[:23].round(2)

In [42]:
print(percentage_25)
print(percentage_30)
print(percentage_over_30)
print(percentage_below_23)

10.49
6.29
30    6.29
31    5.59
32    5.59
33    2.10
34    3.50
35    2.80
36    0.70
Name: Age, dtype: float64
21     1.40
22     6.99
23    10.49
Name: Age, dtype: float64


### Percentile Rank

A percentile rank of a value x in a frequency distribution is given by the percentage of values that are equal or less than x.
If a value x is the 19th percentile, it means that 19% of all the values in the distribution are equal to or less than x.

In [43]:
from scipy.stats import percentileofscore

In [45]:
# kind = 'weak' indicates that we want to find the percentage of values thar are equal to or less than the value we specify in the score parameter

percentileofscore(a = wnba['Age'], score = 23, kind = 'weak')

18.88111888111888

In [46]:
percentile_rank_half_less = percentileofscore(a=wnba['Games Played'], score=17, kind='weak')
percentile_rank_half_less

16.083916083916083

In [47]:
percentage_half_more = 100-percentile_rank_half_less
percentage_half_more

83.91608391608392

In [48]:
wnba['Age'].describe()

count    143.000000
mean      27.076923
std        3.679170
min       21.000000
25%       24.000000
50%       27.000000
75%       30.000000
max       36.000000
Name: Age, dtype: float64

In [49]:
wnba['Age'].describe().iloc[3:]

min    21.0
25%    24.0
50%    27.0
75%    30.0
max    36.0
Name: Age, dtype: float64

The three percentiles that divide the distribution in four equal parts are also known as **quartiles**.

In [50]:
wnba['Age'].describe(percentiles = [.1, .15, .33, .5, .592, .85, .9]).iloc[3:]

min      21.0
10%      23.0
15%      23.0
33%      25.0
50%      27.0
59.2%    28.0
85%      31.0
90%      32.0
max      36.0
Name: Age, dtype: float64

In [52]:
age_percentiles = wnba['Age'].describe(percentiles=[.5,.75,.95])
age_upper_quartile = age_percentiles['75%']
age_middle_quartile = age_percentiles['50%']
age_95th_percentile = age_percentiles['95%']

In [53]:
print(age_upper_quartile)
print(age_middle_quartile)
print(age_95th_percentile)

30.0
27.0
34.0


### Grouped Frequency Distribution Tables

In [54]:
wnba['Weight'].value_counts().sort_index()

55.0      1
57.0      1
58.0      1
59.0      2
62.0      1
63.0      3
64.0      5
65.0      4
66.0      8
67.0      1
68.0      2
69.0      2
70.0      3
71.0      2
73.0      6
74.0      4
75.0      4
76.0      4
77.0     10
78.0      5
79.0      6
80.0      3
81.0      5
82.0      4
83.0      4
84.0      9
85.0      2
86.0      7
87.0      6
88.0      6
89.0      3
90.0      2
91.0      3
93.0      3
95.0      2
96.0      2
97.0      1
104.0     2
108.0     1
113.0     2
Name: Weight, dtype: int64

In [55]:
wnba['Weight'].value_counts(bins = 10).sort_index()

(54.941, 60.8]     5
(60.8, 66.6]      21
(66.6, 72.4]      10
(72.4, 78.2]      33
(78.2, 84.0]      31
(84.0, 89.8]      24
(89.8, 95.6]      10
(95.6, 101.4]      3
(101.4, 107.2]     2
(107.2, 113.0]     3
Name: Weight, dtype: int64

In [57]:
grouped_freq_table = wnba['PTS'].value_counts(bins=10,normalize=True).sort_index(ascending=False)*100
grouped_freq_table

(525.8, 584.0]     3.496503
(467.6, 525.8]     2.797203
(409.4, 467.6]     5.594406
(351.2, 409.4]     6.993007
(293.0, 351.2]     5.594406
(234.8, 293.0]    11.888112
(176.6, 234.8]    13.986014
(118.4, 176.6]    11.888112
(60.2, 118.4]     16.783217
(1.417, 60.2]     20.979021
Name: PTS, dtype: float64

In [58]:
wnba['PTS'].value_counts(bins = 5).sort_index()

(1.417, 118.4]    54
(118.4, 234.8]    37
(234.8, 351.2]    25
(351.2, 467.6]    18
(467.6, 584.0]     9
Name: PTS, dtype: int64

### Information and comprehensability

In [65]:
from IPython.display import Image
Image("Information and comprehensability.JPG")

<IPython.core.display.Image object>

As a rule of thumb, 10 is a good number of class intervals to choose because it offers a good balance between information and comprehensibility.

In [66]:
wnba['MIN'].value_counts(bins=10)

(816.8, 917.4]     19
(10.993, 112.6]    19
(213.2, 313.8]     17
(615.6, 716.2]     15
(313.8, 414.4]     15
(716.2, 816.8]     14
(414.4, 515.0]     13
(515.0, 615.6]     12
(112.6, 213.2]     10
(917.4, 1018.0]     9
Name: MIN, dtype: int64

### Readability for Grouped Frequency Tables

In [68]:
intervals = pd.interval_range(start = 0, end = 600, freq = 100)
intervals

IntervalIndex([(0, 100], (100, 200], (200, 300], (300, 400], (400, 500], (500, 600]]
              closed='right',
              dtype='interval[int64]')

In [69]:
gr_freq_table = pd.Series([0,0,0,0,0,0], index = intervals)
gr_freq_table

(0, 100]      0
(100, 200]    0
(200, 300]    0
(300, 400]    0
(400, 500]    0
(500, 600]    0
dtype: int64

In [72]:
for value in wnba['PTS']:
    for interval in intervals:
        if value in interval:
            gr_freq_table.loc[interval] += 1
            break
        
gr_freq_table

(0, 100]      147
(100, 200]     28
(200, 300]     32
(300, 400]     17
(400, 500]     10
(500, 600]      7
dtype: int64

In [75]:
intervals = pd.interval_range(start = 0, end = 600, freq = 60)
gr_freq_table_10 = pd.Series([0 for i in range(10)], index = intervals)

for value in wnba['PTS']:
    for interval in intervals:
        if value in interval:
            gr_freq_table_10.loc[interval] += 1
            break
            
gr_freq_table_10

(0, 60]       30
(60, 120]     25
(120, 180]    17
(180, 240]    22
(240, 300]    15
(300, 360]     7
(360, 420]    11
(420, 480]     7
(480, 540]     4
(540, 600]     5
dtype: int64