In [529]:
import pandas as pd
import numpy as np
%matplotlib inline

# Describing Data Drill 
by Matthew Thomas

12/30/2017

In [530]:
def UsefulStats(df):
    v = df.var()
    se = np.std(df ,ddof=1) / np.sqrt(len(df)-1)
    mode = df.mode()
    median = df.median()
    
    print('other useful stats:')
    print()
    print('\tvariance:\t', round(v, 5))
    print('\tstandard error:\t', round(se, 5))
    if len(mode) == len(df):
        mode = 'There is no mode'
        print('\tmode:\t\t', mode)
    elif len(mode) > 1 and len(mode) < len(df):
        mode = mode
        print('\tmultiple modes:')
        for i in mode.index:
            print('\t\t\t', mode.at[i])
    else:
        mode = mode
        print('\tmode:\t\t', mode.at[0])
    print('\tmedian:\t\t', median)
    print()

In [531]:
df = pd.DataFrame()

df['brady_kid'] = ['Greg',
                     'Marcia',
                     'Peter',
                     'Jan',
                     'Bobby',
                     'Cindy',
                     'Oliver']

df['age'] = [14, 12, 11, 10, 8, 6, 8]
df 

Unnamed: 0,brady_kid,age
0,Greg,14
1,Marcia,12
2,Peter,11
3,Jan,10
4,Bobby,8
5,Cindy,6
6,Oliver,8


In [532]:
df.describe()

Unnamed: 0,age
count,7.0
mean,9.857143
std,2.734262
min,6.0
25%,8.0
50%,10.0
75%,11.5
max,14.0


In [533]:
UsefulStats(df['age'])

other useful stats:

	variance:	 7.47619
	standard error:	 1.11626
	mode:		 8
	median:		 10.0



The mean and/or median seem to be a good representation of the data since they are close. 

### Cindy has a birthday

In [534]:
df.at[5, 'age'] = 7

In [535]:
df

Unnamed: 0,brady_kid,age
0,Greg,14
1,Marcia,12
2,Peter,11
3,Jan,10
4,Bobby,8
5,Cindy,7
6,Oliver,8


In [536]:
df.describe()

Unnamed: 0,age
count,7.0
mean,10.0
std,2.516611
min,7.0
25%,8.0
50%,10.0
75%,11.5
max,14.0


In [537]:
UsefulStats(df['age'])

other useful stats:

	variance:	 6.33333
	standard error:	 1.0274
	mode:		 8
	median:		 10.0



Since Cindy had a birthday, the mean and median are now the same. This also cause the variance and standard error to go down.

### Cousin Oliver is replace by Jessica who is 1

In [538]:
df.at[6, 'brady_kid'] = 'Jessica'
df.at[6, 'age'] = 1

In [539]:
df

Unnamed: 0,brady_kid,age
0,Greg,14
1,Marcia,12
2,Peter,11
3,Jan,10
4,Bobby,8
5,Cindy,7
6,Jessica,1


In [540]:
df.describe()

Unnamed: 0,age
count,7.0
mean,9.0
std,4.242641
min,1.0
25%,7.5
50%,10.0
75%,11.5
max,14.0


In [541]:
UsefulStats(df['age'])

other useful stats:

	variance:	 18.0
	standard error:	 1.73205
	mode:		 There is no mode
	median:		 10.0



Due to Jessica's age being so far from the mean, the variance and standard error are much higher. The mean is still relatively close to the median but the median is likely to be more accurate.

### 50th Anniversary

In [542]:
# magazines and the percentage of respondents who like The Brady Bunch
df2 = pd.DataFrame()
df2['magazine'] = ['tv guide', 
                   'entertainment weekly',
                   'pop culture', 
                   'sciphi phanatics']
df2['percent_fans'] = [20, 23, 17, 5]
df2

Unnamed: 0,magazine,percent_fans
0,tv guide,20
1,entertainment weekly,23
2,pop culture,17
3,sciphi phanatics,5


In [543]:
df2.describe()

Unnamed: 0,percent_fans
count,4.0
mean,16.25
std,7.889867
min,5.0
25%,14.0
50%,18.5
75%,20.75
max,23.0


In [544]:
UsefulStats(df2['percent_fans'])

other useful stats:

	variance:	 62.25
	standard error:	 4.55522
	mode:		 There is no mode
	median:		 18.5



The median is higher than the mean and sciphi phantics seems to be intended for a small subclass of the general population so we can expect this to be an outlier. 

In [545]:
# magazines and the percentage of respondents who like The Brady Bunch
df2_general = pd.DataFrame()
df2_general['magazine'] = ['tv guide', 
                   'entertainment weekly',
                   'pop culture']
df2_general['percent_fans'] = [20, 23, 17]
df2_general

Unnamed: 0,magazine,percent_fans
0,tv guide,20
1,entertainment weekly,23
2,pop culture,17


In [546]:
df2_general.describe()

Unnamed: 0,percent_fans
count,3.0
mean,20.0
std,3.0
min,17.0
25%,18.5
50%,20.0
75%,21.5
max,23.0


In [547]:
UsefulStats(df2_general['percent_fans'])

other useful stats:

	variance:	 9.0
	standard error:	 2.12132
	mode:		 There is no mode
	median:		 20.0



If we only consider the magazines that are read by a more general population, we can get a more realistic picture. Now the mean and median are equal and 20% is a better estimate. 