In [1]:
import numpy as np
import pandas as pd

import scipy.stats as stats
from statsmodels.stats.proportion import proportions_ztest

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

### Data Decsription:
**MPG stands for miles per gallon and is used to show how far your car can travel for every gallon (or 4.55 litres) of fuel it uses.**
<br/>
Origin Values:
* 1: USA
* 2: Europ
* 3: Asia


In [2]:
colnames = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
            'acceleration', 'model year', 'origin', 'car name']

In [3]:
df = pd.read_csv('../../Datasets/auto_mpg/auto-mpg.data', names=colnames, delim_whitespace=True,
                 skipinitialspace=True)

In [4]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [5]:
df['origin'].value_counts()

1    249
3     79
2     70
Name: origin, dtype: int64

In [6]:
print('American cars mpg mean:', df.query('origin == 1')['mpg'].mean())
print('European cars mpg mean:', df.query('origin == 2')['mpg'].mean())
print('Asian    cars mpg mean:', df.query('origin == 3')['mpg'].mean())

American cars mpg mean: 20.083534136546184
European cars mpg mean: 27.89142857142857
Asian    cars mpg mean: 30.450632911392404


----------------------
**Its possible to make HT test with probability distribution tests instead of coding. there is a flowchart to choose propriate test based on available data and conditions**

## T Distribution Test
#### NOTE:
* 1- T distribution test needs normal or semi normal data
* 2- It is assumed that the experiments were performed under the same conditions for both groups.

### 1) One Samlple t test - A One sample t-test tests the mean of a single group against a known mean.


**While a sample and reference (total) mean is available**

In [7]:
df_euro = df.query('origin == 2').copy()
# or may use: df[df['origin'] == 2]

In [8]:
df_others = df.query('origin==1 | origin==3').copy()
df_others['origin'].value_counts()

1    249
3     79
Name: origin, dtype: int64

In [9]:
df_euro.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
19,26.0,4,97.0,46.0,1835.0,20.5,70,2,volkswagen 1131 deluxe sedan
20,25.0,4,110.0,87.0,2672.0,17.5,70,2,peugeot 504
21,24.0,4,107.0,90.0,2430.0,14.5,70,2,audi 100 ls
22,25.0,4,104.0,95.0,2375.0,17.5,70,2,saab 99e
23,26.0,4,121.0,113.0,2234.0,12.5,70,2,bmw 2002


In [10]:
population_mean=df_others['mpg'].mean()
euro_mpg_mean = df_euro['mpg'].mean()

print(f'population mpg mean: {population_mean:.2f}')
print(f'European cars mpg mean: {euro_mpg_mean:.2f}')
print(f'mean diffrentiation: {population_mean - euro_mpg_mean:.2f}')


population mpg mean: 22.58
European cars mpg mean: 27.89
mean diffrentiation: -5.31


In [11]:
# df =  degree of reedom 
t_satistic, pvalue = stats.ttest_1samp(a=df_euro['mpg'], popmean=population_mean)
print(f'pvalue: {pvalue:.10f}')
print(f'mean diff is {pvalue * 100:.2f} % based on chance. or {(1 - pvalue) * 100:.2f} % is significant.')

pvalue: 0.0000000068
mean diff is 0.00 % based on chance. or 100.00 % is significant.


------------------
### 2) Two Samlples (Independent) T test - An Independent Samples t-test compares the means for two groups.

In [12]:
df_asia = df.query('origin == 3').copy()

In [13]:
asia_mpg_mean = df_asia['mpg'].mean()

print(f'Asian mpg mean: {asia_mpg_mean:.2f}')
print(f'European cars mpg mean: {euro_mpg_mean:.2f}')
print(f'mean diffrentiation: {asia_mpg_mean - euro_mpg_mean:.2f}')

Asian mpg mean: 30.45
European cars mpg mean: 27.89
mean diffrentiation: 2.56


In [14]:
t_satistic, pvalue = stats.ttest_ind(a=df_euro['mpg'], b=df_asia['mpg'])
print(f'pvalue: {pvalue}')
print(f'mean diff is {pvalue * 100:.2f} % based on chance. or {(1 - pvalue) * 100:.2f} % is significant.')

pvalue: 0.015968231401682974
mean diff is 1.60 % based on chance. or 98.40 % is significant.


------------------
### 3) Releative T test
#### A Paired sample t-test compares means from the same group at different times (say, one year apart).

In [15]:
df_usa = df.query('origin == 1').copy()

In [16]:
# check ford company model years
df_usa[(df_usa['car name'].str.contains("ford", na=False))]['model year'].value_counts()

71    5
73    5
75    5
76    5
70    4
72    4
74    4
78    4
82    4
77    3
79    3
81    3
80    2
Name: model year, dtype: int64

**Now divide ford models to 70th and 80th decade**
<br/>
**To make a semi-balanced data set, only select model years between 70-71 and 80-82**

In [17]:
ford_70th = df_usa[(df_usa['car name'].str.contains("ford", na=False))
                   & (df_usa['model year'] >= 70 ) & (df_usa['model year'] < 72 )][['mpg', 'car name']]
ford_70th.head()

Unnamed: 0,mpg,car name
4,17.0,ford torino
5,15.0,ford galaxie 500
17,21.0,ford maverick
25,10.0,ford f250
32,25.0,ford pinto


In [18]:
len(ford_70th)

9

In [19]:
ford_80th = df_usa[(df_usa['car name'].str.contains("ford", na=False)) & (df_usa['model year'] >= 80 )][['mpg', 'car name']]
ford_80th.head()

Unnamed: 0,mpg,car name
314,26.4,ford fairmont
336,23.6,ford mustang cobra
351,34.4,ford escort 4w
352,29.9,ford escort 2h
365,20.2,ford granada gl


In [20]:
len(ford_80th)

9

In [21]:
ford_70th_mean = ford_70th['mpg'].mean()
ford_80th_mean = ford_80th['mpg'].mean()

print(f'ford 70th mpg mean: {ford_70th_mean:.2f}')
print(f'ford 80th mpg mean: {ford_80th_mean:.2f}')
print(f'mean diffrentiation: {ford_80th_mean - ford_70th_mean:.2f}')

ford 70th mpg mean: 16.89
ford 80th mpg mean: 26.17
mean diffrentiation: 9.28


In [22]:
t_satistic, pvalue = stats.ttest_rel(a=ford_70th['mpg'], b=ford_80th['mpg'])
print(f'pvalue: {pvalue}')
print(f'mean diff is {pvalue * 100:.2f} % based on chance. or {(1 - pvalue) * 100:.2f} % is significant.')

pvalue: 0.0035055160785659035
mean diff is 0.35 % based on chance. or 99.65 % is significant.
