# Confidence intervals

Based on https://www.statology.org/confidence-intervals-python/

In [49]:
# Libraries
import numpy as np
import pandas as pd
from datetime import datetime as dt
from scipy import stats

## Data preparation

In [50]:
# Load the registrations table into a DataFrame.
# Source: DGT (Direccion General de Trafico) statistics and indicators webpage
# https://www.dgt.es/es/seguridad-vial/estadisticas-e-indicadores/matriculaciones-definitivas/tablas-estadisticas/
# The file is read as tab-separated text with Latin-1 encoding.
df_cars_brand = pd.read_csv(
    'data/matriculacions_turismes_2020.txt',
    sep='\t',
    encoding='latin-1',
)

In [51]:
# Preview the first rows: one row per province (PROVINCIAS), one count column per car brand
df_cars_brand.head()

Unnamed: 0,PROVINCIAS,A_BRUNS_LINDER,A.M.C.,AC_CARS,ACURA,ADRIA,ALFA_ROMEO,ALLIED_VEHICLES_LTD,ALPINA,ALPINE,...,VOLKNER,VOLKSWAGEN,VOLKSWAGEN_AG,VOLKSWAGEN_V_W,VOLVO,VW-PORSCHE,WESTFIELD,WIESMANN,WILLYS_OVERLAND,WILLYS_VIASA
0,Araba/Álava,0,0,0,0,0,10,0,0,0,...,0,289,0,18,157,0,0,0,0,0
1,Albacete,0,0,0,0,0,142,0,0,0,...,0,442,0,9,98,0,0,0,0,0
2,Alicante/Alacant,0,0,0,1,1,58,0,0,0,...,0,2439,0,9,298,0,0,0,0,0
3,Almería,0,0,0,0,0,6,0,0,0,...,1,733,2,35,150,0,0,0,0,0
4,Ávila,0,0,0,0,0,24,0,0,0,...,0,10,0,1,77,0,0,0,0,0


### Question: What range of values plausibly contains the mean number of Volkswagen registrations per province?

### Small sample (n <30)

If we’re working with a small sample (n <30), we can use the t.interval() function from the scipy.stats library to calculate a confidence interval for a population mean.

In [55]:
# Small-sample example: keep only the rows whose PROVINCIAS value compares
# <= 'Lleida' as a string (28 provinces) and take their VOLKSWAGEN counts.
data = df_cars_brand.loc[
    df_cars_brand['PROVINCIAS'] <= 'Lleida',
    'VOLKSWAGEN',
]

In [56]:
# Summary statistics of the small sample (count, mean, std, min, quartiles, max)
data.describe()

count      28.000000
mean      997.500000
std      1574.999718
min        56.000000
25%       344.250000
50%       603.500000
75%      1014.750000
max      8531.000000
Name: VOLKSWAGEN, dtype: float64

In [57]:
# 95% confidence interval for the population mean, using Student's t distribution
# with n-1 degrees of freedom, centred on the sample mean and scaled by the
# standard error of the mean.
# The confidence level is passed positionally: the `alpha` keyword was renamed to
# `confidence` in SciPy 1.9 and later removed, so `alpha=` fails on current SciPy,
# while the positional form works on both old and new releases.
stats.t.interval(0.95, df=len(data)-1, loc=np.mean(data), scale=stats.sem(data))

(386.77886552643054, 1608.2211344735695)

In [58]:
# 99% confidence interval for the population mean, using Student's t distribution
# with n-1 degrees of freedom, centred on the sample mean and scaled by the
# standard error of the mean.
# The confidence level is passed positionally: the `alpha` keyword was renamed to
# `confidence` in SciPy 1.9 and later removed, so `alpha=` fails on current SciPy,
# while the positional form works on both old and new releases.
stats.t.interval(0.99, df=len(data)-1, loc=np.mean(data), scale=stats.sem(data))

(172.81461529162118, 1822.185384708379)

### Larger samples (n≥30)

If we’re working with larger samples (n≥30), we can assume that the sampling distribution of the sample mean is normally distributed (thanks to the Central Limit Theorem) and can instead use the norm.interval() function from the scipy.stats library.

In [59]:
# Large-sample example: take the VOLKSWAGEN counts for every row (all 52 provinces)
data = df_cars_brand.loc[:, 'VOLKSWAGEN']

In [60]:
# Summary statistics of the full sample (count, mean, std, min, quartiles, max)
data.describe()

count       52.000000
mean      1503.980769
std       4019.365141
min         10.000000
25%        286.750000
50%        603.500000
75%       1160.250000
max      28353.000000
Name: VOLKSWAGEN, dtype: float64

In [61]:
# 95% confidence interval for the population mean, using the normal distribution
# centred on the sample mean and scaled by the standard error of the mean.
# The confidence level is passed positionally: the `alpha` keyword was renamed to
# `confidence` in SciPy 1.9 and later removed, so `alpha=` fails on current SciPy,
# while the positional form works on both old and new releases.
stats.norm.interval(0.95, loc=np.mean(data), scale=stats.sem(data))

(411.52495384248823, 2596.4365846190503)

In [48]:
# 99% confidence interval for the population mean, using the normal distribution
# centred on the sample mean and scaled by the standard error of the mean.
# The confidence level is passed positionally: the `alpha` keyword was renamed to
# `confidence` in SciPy 1.9 and later removed, so `alpha=` fails on current SciPy,
# while the positional form works on both old and new releases.
stats.norm.interval(0.99, loc=np.mean(data), scale=stats.sem(data))

(68.25045770165184, 2939.7110807598865)