T1) The Cozine Corporation operates a garbage hauling business. Up to this point, the company has been
charged a flat fee for each of the garbage trucks that enter the county landfill. The flat fee is based on
the assumed truck weight of 45,000 pounds. In two weeks, the company is required to appear before
the county commissioners to discuss a rate adjustment. In preparation for this meeting, Cozine has hired
an independent company to weigh a sample of Cozine’s garbage trucks just prior to their entering the
landfill. The data file COZINE contains the data the company has collected.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the sampled truck weights. header=None tells pandas not to treat any row
# as a header, so the columns receive integer labels.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
garbage = pd.read_excel('D://Business Statistic/MANB1123-master/MANB1123-master/Data_Set/Cozine.xlsx', header = None)
garbage.head()  # preview the first rows

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,40859,37145,42928,46851,46554,48586,33702,41110,46161,37871
1,39377,35576,34982,38285,39061,33952,39842,40465,42512,40611
2,40757,40593,47102,41676,41293,40050,49494,45290,51027,39511
3,48314,35873,44048,45428,49292,41679,40010,44566,40551,44879
4,36512,38780,36218,40621,41877,42107,40774,50339,35379,39201


In [4]:
garbage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 10 columns):
0    20 non-null int64
1    20 non-null int64
2    20 non-null int64
3    20 non-null int64
4    20 non-null int64
5    20 non-null int64
6    20 non-null int64
7    20 non-null int64
8    20 non-null int64
9    20 non-null int64
dtypes: int64(10)
memory usage: 1.6 KB


In [5]:
garbage.median()

0    42494.5
1    43642.5
2    42917.5
3    40475.0
4    42413.5
5    41972.5
6    42063.5
7    44133.5
8    42335.5
9    42195.0
dtype: float64

In [6]:
import math
import functools

In [7]:
def percentile(N, percent, key=lambda x:x):
    """
    Linearly interpolated percentile of a pre-sorted sequence.

    @parameter N - sequence of values. It MUST already be sorted; the
                   function does not sort it.
    @parameter percent - fraction from 0.0 to 1.0 (0.5 is the median).
    @parameter key - optional function applied to an element of N to obtain
                     the number that is interpolated.

    @return - the percentile value, or None when N is empty
    """
    if not N:
        return None

    # Fractional position of the requested percentile inside N.
    rank = (len(N) - 1) * percent
    lower = math.floor(rank)
    upper = math.ceil(rank)

    if lower == upper:
        # The position is a whole number, so that element is the answer.
        return key(N[int(rank)])

    # Otherwise blend the two neighbouring elements, each weighted by how
    # close the fractional position lies to it.
    below = key(N[int(lower)]) * (upper - rank)
    above = key(N[int(upper)]) * (rank - lower)
    return below + above


# The median is the 50th percentile.
median = functools.partial(percentile, percent=0.5)

In [8]:
# percentile() requires sorted input, so sort the column before calling it;
# the result then agrees with garbage.median() for column 0.
percentile(sorted(garbage[0]), 0.5)

42494.5

In [9]:
import plotly.plotly as py
import plotly.graph_objs as go

In [41]:
# One box per spreadsheet column. The sheet has ten columns (0-9); building the
# traces in a loop ensures columns 8 and 9 are not left out of the comparison.
# Trace names follow the 'column N' labels used by the histogram traces.
data = [
    go.Box(y=garbage[col], name='column {}'.format(col))
    for col in garbage.columns
]
py.iplot(data, sharing='public')

# Compute appropriate measures of central location for the data.

In [11]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (20, 10)
plt.rcParams['font.size'] = 14

In [12]:
# Columns 0 and 1 are the two samples compared in the overlaid histograms.
a, b = garbage[0], garbage[1]

# Probability-normalised histogram of column 0.
trace1 = go.Histogram(x=a, opacity=0.75, histnorm='probability', name='column 0')

# Same for column 1, assigned to the second y-axis ('y2').
trace2 = go.Histogram(x=b, opacity=0.75, histnorm='probability', name='column 1', yaxis='y2')

In [13]:
data = [trace1, trace2]

# Column means, computed once and reused by the marker lines and their labels.
mean_a = garbage[0].mean()
mean_b = garbage[1].mean()

layout = go.Layout(
    title='item',
    barmode='overlay',
    xaxis=dict(
        title=''
    ),
    yaxis=dict(
        title='Normalized Frequency 1'
    ),
    yaxis2=dict(
        title='Normalized Frequency 2',
        anchor='free',
        overlaying='y',
        side='right',
        position=1
    ),

    # Mean lines. Each marker is vertical, so x0 and x1 must both be the mean
    # of the SAME column (the first line previously ran from mean a to mean b).
    shapes=[
        {
            'line': {'color': '#0099FF', 'dash': 'solid', 'width': 1},
            'type': 'line',
            'x0': mean_a,
            'x1': mean_a,
            'xref': 'x',
            'y0': -0.1,
            'y1': 1,
            'yref': 'paper',
        },
        {
            'line': {'color': '#FDAB5A', 'dash': 'solid', 'width': 1},
            'type': 'line',
            'x0': mean_b,
            'x1': mean_b,
            'xref': 'x',
            'y0': -0.1,
            'y1': 1,
            'yref': 'paper',
        },
    ],

    # Annotations labelling each mean line with its value.
    annotations=[
        dict(
            x=mean_a,
            y=1,
            xref='x',
            yref='paper',
            text="Mean a = {:,.0f}".format(mean_a),
            showarrow=True,
            arrowhead=7,
            ax=1,
            ay=1,
            axref='paper',
            ayref='paper'
        ),
        dict(
            x=mean_b,
            y=0.95,
            xref='x',
            yref='paper',
            text="Mean b = {:,.0f}".format(mean_b),
            showarrow=True,
            arrowhead=7,
            ax=1,
            ay=1,
            axref='paper',
            ayref='paper'
        )
    ]

)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [14]:
# Columns 0 and 1 are the two samples compared in the overlaid histograms.
a, b = garbage[0], garbage[1]

# Histogram of column 0 (no histnorm given).
trace1 = go.Histogram(x=a, opacity=0.75, name='column 0')

# Same for column 1, assigned to the second y-axis ('y2').
trace2 = go.Histogram(x=b, opacity=0.75, name='column 1', yaxis='y2')

In [15]:
data = [trace1, trace2]

# Column means, computed once and reused by the marker lines and their labels.
mean_a = garbage[0].mean()
mean_b = garbage[1].mean()

layout = go.Layout(
    title='item',
    barmode='overlay',
    xaxis=dict(
        title=''
    ),
    yaxis=dict(
        title='Frequency 1'
    ),
    yaxis2=dict(
        title='Frequency 2',
        anchor='free',
        overlaying='y',
        side='right',
        position=1
    ),

    # Mean lines. Each marker is vertical, so x0 and x1 must both be the mean
    # of the SAME column (the first line previously ran from mean a to mean b).
    shapes=[
        {
            'line': {'color': '#0099FF', 'dash': 'solid', 'width': 1},
            'type': 'line',
            'x0': mean_a,
            'x1': mean_a,
            'xref': 'x',
            'y0': -0.1,
            'y1': 1,
            'yref': 'paper',
        },
        {
            'line': {'color': '#FDAB5A', 'dash': 'solid', 'width': 1},
            'type': 'line',
            'x0': mean_b,
            'x1': mean_b,
            'xref': 'x',
            'y0': -0.1,
            'y1': 1,
            'yref': 'paper',
        },
    ],

    # Annotations labelling each mean line with its value.
    annotations=[
        dict(
            x=mean_a,
            y=1,
            xref='x',
            yref='paper',
            text="Mean a = {:,.0f}".format(mean_a),
            showarrow=True,
            arrowhead=7,
            ax=1,
            ay=1,
            axref='paper',
            ayref='paper'
        ),
        dict(
            x=mean_b,
            y=0.95,
            xref='x',
            yref='paper',
            text="Mean b = {:,.0f}".format(mean_b),
            showarrow=True,
            arrowhead=7,
            ax=1,
            ay=1,
            axref='paper',
            ayref='paper'
        )
    ]

)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [16]:
# Pool the sheet into one flat list, column by column (columns 0 through 9).
data = [weight for col in range(10) for weight in garbage[col]]
data

[40859,
 39377,
 40757,
 48314,
 36512,
 32205,
 44637,
 42275,
 50382,
 43688,
 40010,
 39912,
 45945,
 40325,
 42714,
 43535,
 47145,
 43652,
 42981,
 41176,
 37145,
 35576,
 40593,
 35873,
 38780,
 47501,
 43226,
 45153,
 47486,
 44348,
 45228,
 45227,
 42545,
 36809,
 44059,
 39537,
 48415,
 47523,
 47609,
 40694,
 42928,
 34982,
 47102,
 44048,
 36218,
 37137,
 38429,
 45276,
 46955,
 42811,
 43950,
 45052,
 46336,
 49573,
 41051,
 44748,
 42907,
 41539,
 42324,
 41379,
 46851,
 38285,
 41676,
 45428,
 40621,
 39516,
 41084,
 39581,
 42429,
 38098,
 39688,
 43734,
 41436,
 39847,
 37360,
 43996,
 42551,
 38345,
 37756,
 40329,
 46554,
 39061,
 41293,
 49292,
 41877,
 44879,
 42500,
 38492,
 42007,
 46705,
 46959,
 44627,
 39037,
 42327,
 46813,
 46085,
 49047,
 36153,
 36718,
 38950,
 48586,
 33952,
 40050,
 41679,
 42107,
 43774,
 44120,
 46223,
 43724,
 40817,
 35308,
 48196,
 46089,
 41110,
 40902,
 31476,
 41838,
 32253,
 42728,
 44450,
 33702,
 39842,
 49494,
 40010,
 40774,


In [17]:
df = pd.DataFrame(data)

In [18]:
df.head()

Unnamed: 0,0
0,40859
1,39377
2,40757
3,48314
4,36512


In [19]:
df.shape[0]

200

In [20]:
df.describe()

Unnamed: 0,0
count,200.0
mean,42260.64
std,3934.55888
min,31476.0
25%,39686.5
50%,42325.5
75%,44923.0
max,52774.0


In [21]:
df.median()

0    42325.5
dtype: float64

# a) Based on the sample data, what percentile does the 45,000-pound weight fall closest to?

In [22]:
np.percentile(data, 50)

42325.5

In [23]:
np.percentile(data, 75)

44923.0

In [24]:
# Box-and-whisker plot of the pooled weights held in df's single column.
y0 = df[0]

column0 = go.Box(y=y0)
data1 = [column0]  # iplot is given a list of traces, here just one
py.iplot(data1)


# b) Compute appropriate measures of central location for the data.

In [25]:
df.median()

0    42325.5
dtype: float64

# c) Construct a frequency histogram based on the sample data. Use the $2^k \ge n$ guideline to determine the number of classes. Also, construct a box-and-whisker plot for these data. Discuss the relative advantages of histograms and box-and-whisker plots for presenting these data.

In [26]:
# Histogram of the pooled weights; no xbins are specified in this cell.
x = df[0]
data2 = [go.Histogram(x=x)]

py.iplot(data2, filename='basic histogram')

In [40]:
# Number of classes from the 2^k >= n guideline: the smallest k whose power of
# two is at least the sample size (n = 200 gives k = 8).
n_obs = df.shape[0]
k = 1
while 2 ** k < n_obs:
    k += 1

highest_num = df[0].max()
lowest_num = df[0].min()

# Class width: range / k rounded up to the next whole pound, so that exactly k
# classes starting at the minimum cover every observation and the largest
# weight falls strictly inside the last class.
bins_size = int((highest_num - lowest_num) // k) + 1

trace1 = go.Histogram(
    x=df[0],
    opacity=0.75,
    name='column 0',
    xbins=dict(
        start=int(lowest_num),
        end=int(lowest_num) + k * bins_size,
        size=bins_size),

)
data = [trace1]

layout = go.Layout(
    title='Truck Weight',
    barmode='overlay',
    xaxis=dict(
        title='Weight'
    ),
    yaxis=dict(
        title='Frequency'
    )

)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

# d) Use the information determined in parts (a–c) to develop a presentation to the county commissioners. Make sure the presentation attempts to answer the question of whether Cozine deserves a rate reduction.

In [28]:
more = df[0] > 45000
more.sum()  #get total number of weight more than 45000

49

In [29]:
less = df[0] < 45000
less.sum()  #get total number of weight less than 45000

151

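Cozine deserves a rate reduction because most of its trucks weigh less than the assumed 45,000 pounds: 151 of the 200 sampled trucks weighed less than 45,000 pounds and only 49 weighed more. Furthermore, the mean weight is 42,261 pounds and the median weight is 42,325.5 pounds. The 45,000-pound weight falls closest to the 75th percentile (44,923 pounds).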