In [1]:
import math
import pandas as pd
import scipy.stats as st

## RANDOMIZED BLOCK

In [2]:
# 1 - wants to know if music has effect on amount of time customers spend at their tables for dinner
# BLOCKED ANOVA
# measured average time spent over 5 months of the year. 

var_setup = {
    'summary': ['no music', 'slow music', 'fast music'],
    'avg': [97.4, 106.8, 100.4],
    'variance': [998.8, 749.2, 1076.3],
    'n': [5, 5, 5]
}

vals = pd.DataFrame(var_setup)
vals.set_index('summary', inplace=True)
vals

Unnamed: 0_level_0,avg,variance,n
summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no music,97.4,998.8,5
slow music,106.8,749.2,5
fast music,100.4,1076.3,5


In [3]:
month_setup = {
    'month': ['january', 'march', 'may', 'august', 'october'],
    'avg': [66.67, 121.0, 106.33, 75.0, 138.67],
    'variance': [69.33, 43.0, 52.33, 39.0, 9.33]
}

months = pd.DataFrame(month_setup)
months.set_index('month', inplace=True)
months

Unnamed: 0_level_0,avg,variance
month,Unnamed: 1_level_1,Unnamed: 2_level_1
january,66.67,69.33
march,121.0,43.0
may,106.33,52.33
august,75.0,39.0
october,138.67,9.33


In [5]:
x_bar = round(sum(vals.avg.values * vals.n.values) / sum(vals.n.values), 4)
print(f"x_bar = {x_bar}")

s_x2 = 166163  # given

t = len(vals)
print(f"t: {t}")

b = max(vals.n)
print(f"b: {b}")

N =  b * t
print(f"N: {N}\n")

df_b = b - 1
df_t = t - 1
df_e = (t - 1) * (b - 1)
df_N = N - 1

print(f"DFT: {df_t}\nDFB: {df_b}\nDFE: {df_e}\nDFN: {df_N}\n")

tss = s_x2 - ((sum(vals.avg * vals.n)**2) / N)

sst = round(b * sum((vals.avg - x_bar)**2), 4)
print(f"SST: {sst}")

ssb = round(t * sum((months.avg - x_bar)**2), 4)
print(f"SSB: {ssb}")

sse = round(tss - sst - ssb, 4)
print(f"SSE: {sse}")
print(f"TSS: {tss}\n")

mst = round(sst / df_t, 4)
print(f"MST: {mst}")

msb = round(ssb / df_b, 4)
print(f"MSB: {msb}")

mse = round(sse / df_e, 4)
print(f"MSE: {mse}\n")

F_t = round(mst / mse, 4)
F_b = round(msb / mse, 4)

print(f"FT: {F_t}")
print(f"FB: {F_b}")

alpha = 0.05

crit_f = round(st.f.ppf(1-alpha, df_t, df_e), 4)
print(f"Critical value: {crit_f}")
print()
if F_t > crit_f:
    print(f'There is sufficient evidence to conclude not all averages of music type and existence are similar')
else:
    print(f'There is NOT sufficient evidence to conclude not all averages of music type and existence are similar')

x_bar = 101.5333
t: 3
b: 5
N: 15

DFT: 2
DFB: 4
DFE: 8
DFN: 14

SST: 230.5333
SSB: 11101.6828
SSE: 195.5172
TSS: 11527.733333333337

MST: 115.2666
MSB: 2775.4207
MSE: 24.4397

FT: 4.7164
FB: 113.562
Critical value: 4.459

There is sufficient evidence to conclude not all averages of music type and existence are similar


F2,8,05 = 4.459 

Since 4.716 >4.459, sufficient evidence to conclude  not all 
average times are the same.

In [6]:
# b - find the relative efficiency

mse_cr = ((b-1)*msb) + (b * (t - 1) * mse)
mse_rb = ((b * t) - 1) * mse

rel_eff = round(mse_cr / mse_rb, 4)
print(f"Relative efficiency: {rel_eff}")

# We are [RE] fold better than having done a randomized selection

Relative efficiency: 33.1606


In [7]:
# c - find paires of means that are significantly different

diffs = []
diffs.append(round(abs(vals.loc["no music", "avg"] - vals.loc["slow music", "avg"]), 4))
diffs.append(round(abs(vals.loc["no music", "avg"] - vals.loc["fast music", "avg"]), 4))
diffs.append(round(abs(vals.loc["slow music", "avg"] - vals.loc["fast music", "avg"]), 4))

print(f"Differences: {diffs}")

alpha = 0.05
q_val = round(st.studentized_range.ppf(1 - alpha, t, df_e), 4)
print(f"q-value: {q_val}")

cr = round(q_val * math.sqrt(mse / b), 4)
print(f"Critical Range: {cr}")

# CR > Q --> so mean times customers stay at their tables for dinner is different when no music 
# is played relative to when slow music is played 

Differences: [9.4, 3.0, 6.4]
q-value: 4.041
Critical Range: 8.9341


## ONE WAY ANOVA

In [10]:
# 2
# Test hypothesis that avg num of hurs worked per day differs between US, China, Sweden
# Random samp 4 from each country chosen. 

# ONE WAY ANOVA

# Is there sufficient evidence to conclude that the average num of hours worked per week is different between countries?

# Ho: mui = muj
# Ha: mui <> muj

val_setup = {
    'Countries': ['United States', 'Sweden', 'China'],
    'avg': [36, 30, 42],
    'sd2': [19.3333, 18, 50],
    'n': [4, 4, 4]
}

vals = pd.DataFrame(val_setup)
vals.set_index('Countries', inplace=True)
vals

Unnamed: 0_level_0,avg,sd2,n
Countries,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
United States,36,19.3333,4
Sweden,30,18.0,4
China,42,50.0,4


In [11]:
# a. Conduct an analysis of variance, and summarize your results in an AOV table

t = len(vals)
print(f"t: {t}")

N = sum(vals.n)
print(f"N: {N}")
print()

df_t = t - 1
df_w = N - t
df_N = N - 1

print(f"DFT: {df_t}\nDFW: {df_w}\nDFN: {df_N}\n")

x_bar = round(sum(vals.avg.values * vals.n.values) / sum(vals.n.values), 4)
print(f"x_bar = {x_bar}")
print()
ssb = round(sum(vals.n * (vals.avg - x_bar)**2), 4)
print(f"SSB: {ssb}")

ssw = round(sum((vals.n - 1) * vals.sd2), 4)
print(f"SSW: {ssw}")

sst = ssb + ssw
print(f"SST: {sst}")
print()
msb = round(ssb / (t - 1), 4)
print(f"MSB: {msb}")

msw = round(ssw / (N - t), 4)
print(f"MSW: {msw}")

f = round(msb / msw, 4)
print(f"F-value: {f}")

t: 3
N: 12

DFT: 2
DFW: 9
DFN: 11

x_bar = 36.0

SSB: 288.0
SSW: 261.9999
SST: 549.9999

MSB: 144.0
MSW: 29.1111
F-value: 4.9466


In [12]:
# HYPOTHESIS TEST
alpha = 0.05

# Critical value
crit_val = round(st.f.ppf(1 - alpha, df_b, df_w),4)
print(f"Critical value: {crit_val}")

# Get p-value
p_val = round(1 - st.f.cdf(f, df_b, df_w), 4)
print(f"P value: {p_val}")

if f > crit_val:
    print(f"ACCEPT ALTERNATIVE, THERE IS EVIDENCE TO STATE NOT ALL N CATEGORIES HAVE SAME AVG")
else:
    print(f"FAIL TO REJECT NULL, THERE IS NOT ENOUGH EVIDENCE TO STATE NOT ALL N CATEGORIES HAVE SAME AVG")

Critical value: 3.6331
P value: 0.0219
ACCEPT ALTERNATIVE, THERE IS EVIDENCE TO STATE NOT ALL N CATEGORIES HAVE SAME AVG


In [26]:
# find which pairs of means are significantly different

diffs = []
diffs.append(round(abs(vals.loc["United States", "avg"] - vals.loc["Sweden", "avg"]), 4))
diffs.append(round(abs(vals.loc["United States", "avg"] - vals.loc["China", "avg"]), 4))
diffs.append(round(abs(vals.loc["Sweden", "avg"] - vals.loc["China", "avg"]), 4))

print(f"Differences: {diffs}")

alpha = 0.05
q_val = round(st.studentized_range.ppf(1 - alpha, t, df_w), 4)
print(f"q-value: {q_val}")

ranges = []
ranges.append(round(q_val * math.sqrt((msw / 2) * ((1 / vals.loc["United States", "n"]) + (1 / vals.loc["Sweden", "n"]))), 4))
ranges.append(round(q_val * math.sqrt((msw / 2) * ((1 / vals.loc["United States", "n"]) + (1 / vals.loc["China", "n"]))), 4))
ranges.append(round(q_val * math.sqrt((msw / 2) * ((1 / vals.loc["Sweden", "n"]) + (1 / vals.loc["China", "n"]))), 4))

print(f"Critical ranges: {ranges}")

for i in diffs:
    for j in ranges:
        if i > j:
            print(f"{i} is different")

Differences: [6, 6, 12]
q-value: 3.9485
Critical ranges: [10.652, 10.652, 10.652]
12 is different
12 is different
12 is different


## ONE WAY LINEAR REGRESSION

In [31]:
# customer service department wants to predict rating on phone service by number of minutes person was on hold

# a - Determine least-squares prediction equation

sm_x = 50
sm_xx = 84
sm_y = 147
sm_yy = 70.1
sm_xy = -41

n = 10
df = n - 2
y_bar = sm_y / n
x_bar = sm_x / n

b1 = round(sm_xy / sm_xx, 4)
print(f"coefficient: {b1}")

b0 = round(y_bar - (b1 * x_bar), 4)
print(f"intercept: {b0}")

sol = b0 - (b1 * x_bar)

print(f"Linear equation: prediction = {b0} + ({b1} * (variable))")

coefficient: -0.4881
intercept: 17.1405
Linear equation: prediction = 17.1405 + (-0.4881 * (variable))


In [29]:
# Predict rating if customer was on hold for 5 mins

x_bar = 5
pred = round(b0 + (b1 * x_bar), 4)
print(f"Prediction when x = {x_bar}: {pred}")

Prediction when x = 5: 14.7


In [38]:
# c - is there suff. ev. to conclude that pop slope of line is less than 0? Find p-value

# Ho: b1 = 0
# Ha: b1 < 0

alpha = 0.05
sm_yx = 2.5022

T = round((b1 - 0) / (sm_yx / math.sqrt(sm_xx)), 4)

print(f"T: {T}")

crit_val = round(st.t.ppf(alpha, df), 4)
print(f"Crit val: {crit_val}")

if abs(T) > abs(crit_val):
    print(f"There is sufficient evidence to conclude: ")
else:
    print(f"There is not sufficient evidence to claim: ")

print(f"P-value: {round(st.t.cdf(T, df), 4)}")

T: -1.7878
Crit val: -1.8595
There is not sufficient evidence to claim: 
P-value: 0.0558


In [40]:
# d - find correlation coefficient

r = round(sm_xy / (math.sqrt(sm_xx) * math.sqrt(sm_yy)), 4)
print(f"r: {r}")

r: -0.5343


In [49]:
# owner wants to predict daily demand in order to improve inventory
# 3 vars - price, high temp during day, day on weekend (1 = weekday, 0=weekend)

n = 12
df_reg = vari = 3
df_tot = n - 1
df_err = df_tot - df_reg


ssr = 10220.4191
sst = 12321
sse = sst - ssr

print(f"SSReg: {ssr}\nSSRes: {sse}\nSSTot: {sst}\n")

print(f"DFReg: {df_reg}\nDFErr: {df_err}\nDFTot: {df_tot}\n")

msr = round(ssr / df_reg, 4)
mse = round(sse / df_e, 4)

print(f"MSReg: {msr}\nMSRes: {mse}\n")

f = round(msr / mse, 4)
print(f"F val: {f}\n")

SSReg: 10220.4191
SSRes: 2100.580900000001
SSTot: 12321

DFReg: 3
DFErr: 8
DFTot: 11

MSReg: 3406.8064
MSRes: 262.5726

F val: 12.9747


In [51]:
# b - what do you conclude?

alpha = 0.05

crit_val = round(st.f.ppf(1 - alpha, df_reg, df_err), 4)
print(f"Crit val: {crit_val}")

if f > crit_val:
    print(f"Can conclude there is enough evidence")
else:
    print(f"Cannot conclude there is enough evidence")

Crit val: 4.0662
Can conclude there is enough evidence


In [52]:
# c - calculate coefficient of determination

r2 = round(ssr / sst, 4)
print(f"R2: {r2}")

R2: 0.8295


In [53]:
# d - based on plot, is equal variance violated? - no, approx "horizontal sleeve"
# e - based on plol, is it normal? yes, approx

## CHI-SQUARE GOODNESS OF FIT

In [89]:
val_setup = {
    'front': {
        'A': 18,
        'B': 55,
        'C': 30,
        'D': 3,
        'F': 0
    },
    'middle': {
        'A': 7,
        'B': 42,
        'C': 95,
        'D': 11,
        'F': 1
    },    
    'back': {
        'A': 3,
        'B': 15,
        'C': 104,
        'D': 14,
        'F': 2
    },
}

vals = pd.DataFrame(val_setup).transpose()
vals

Unnamed: 0,A,B,C,D,F
front,18,55,30,3,0
middle,7,42,95,11,1
back,3,15,104,14,2


In [90]:
# is there sufficient evidence to indicate rel. between seating grade at alpha = 0.01? what is p-value?

# since sum of F is < 5, roll into D

vals["D"] = vals["D"] + vals["F"]
vals.drop("F", axis=1, inplace=True)
type(vals)

pandas.core.frame.DataFrame

In [92]:
# establish expected variables

col_sums = vals.sum()
# col_sums
# get row sums
row_sums = vals.sum()
# row_sums
total_sum = sum(vals.values)
# total_sum

for row in row_sums:
    n_row = []
    for col in col_sums:
        n_row.append((col * row) / total_sum)
    vals.append(n_row)

# vals

# prep expected df for mathematics 
df_new = pd.DataFrame(vals)
df_new.columns = vals.columns[1:]
df_new.index = vals.index.values

# prep observered df for mathematics
# df.set_index("University", inplace=True)

# show expected values
df_new

ValueError: ValueError: Length mismatch: Expected axis has 4 elements, new values have 3 elements