In [69]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
from scipy.stats import ttest_1samp
from scipy.stats import ttest_ind
from scipy.stats import t
from scipy.stats import f
from scipy.stats import norm
from statsmodels.stats.proportion import proportions_ztest

In [2]:
df = pd.read_csv(
    'cars_sampled.csv',
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50001 entries, 0 to 50000
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dateCrawled          50001 non-null  object
 1   name                 50001 non-null  object
 2   seller               50001 non-null  object
 3   offerType            50001 non-null  object
 4   price                50001 non-null  int64 
 5   abtest               50001 non-null  object
 6   vehicleType          44813 non-null  object
 7   yearOfRegistration   50001 non-null  int64 
 8   gearbox              47177 non-null  object
 9   powerPS              50001 non-null  int64 
 10  model                47243 non-null  object
 11  kilometer            50001 non-null  int64 
 12  monthOfRegistration  50001 non-null  int64 
 13  fuelType             45498 non-null  object
 14  brand                50001 non-null  object
 15  notRepairedDamage    40285 non-null  object
 16  date

In [3]:
cars = df.copy()
cars.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,postalCode,lastSeen
0,30/03/2016 13:51,Zu_verkaufen,private,offer,4450,test,limousine,2003,manual,150,3er,150000,3,diesel,bmw,,30/03/2016 00:00,20257,07/04/2016 04:44
1,07/03/2016 09:54,Volvo_XC90_2.4D_Summum,private,offer,13299,control,suv,2005,manual,163,xc_reihe,150000,6,diesel,volvo,no,07/03/2016 00:00,88045,26/03/2016 13:17
2,01/04/2016 00:57,Volkswagen_Touran,private,offer,3200,test,bus,2003,manual,101,touran,150000,11,diesel,volkswagen,,31/03/2016 00:00,27449,01/04/2016 08:40
3,19/03/2016 17:50,Seat_Ibiza_1.4_16V_Reference,private,offer,4500,control,small car,2006,manual,86,ibiza,60000,12,petrol,seat,no,19/03/2016 00:00,34537,07/04/2016 04:44
4,16/03/2016 14:51,Volvo_XC90_D5_Aut._RDesign_R_Design_AWD_GSHD_S...,private,offer,18750,test,suv,2008,automatic,185,xc_reihe,150000,11,diesel,volvo,no,16/03/2016 00:00,55270,01/04/2016 23:18


In [6]:
"""Working range of data"""
cars=cars[
    (cars.yearOfRegistration <= 2020) &
    (cars.yearOfRegistration >= 1950) &
    (cars.powerPS >= 10) &
    (cars.powerPS <= 500) &
    (cars.price <= 150000) &
    (cars.price >= 1000)
    ]
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35731 entries, 0 to 50000
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dateCrawled          35731 non-null  object
 1   name                 35731 non-null  object
 2   seller               35731 non-null  object
 3   offerType            35731 non-null  object
 4   price                35731 non-null  int64 
 5   abtest               35731 non-null  object
 6   vehicleType          33812 non-null  object
 7   yearOfRegistration   35731 non-null  int64 
 8   gearbox              35192 non-null  object
 9   powerPS              35731 non-null  int64 
 10  model                34501 non-null  object
 11  kilometer            35731 non-null  int64 
 12  monthOfRegistration  35731 non-null  int64 
 13  fuelType             34004 non-null  object
 14  brand                35731 non-null  object
 15  notRepairedDamage    31301 non-null  object
 16  date

# Hypothesis testing steps
* Hypotheses
* Sample Statistics
* test Statistics
* Critical Values
* Max Uncertainty
* Computed Uncertainty
* Decision on HO

#
* Ho = main hypothesis testing.
* Ha = alternative hypothesis testing.



In [8]:
# Arriving at a sub sample from 'cars' data
sample_size = 1000

sample1 = cars.sample(
    sample_size,
    random_state=0 # seed for reproducibility
)

In [10]:
"""finding the mean of the sample"""
sample1.mean()

  sample1.mean()


price                    7043.476
yearOfRegistration       2004.424
powerPS                   137.069
kilometer              124285.000
monthOfRegistration         6.124
postalCode              52090.702
dtype: float64

In [17]:
pos_mean = 7000
print(sample1['price'].mean())

7043.476


In [18]:
statistic, pvalue = ttest_1samp(sample1['price'], pos_mean)
statistic, pvalue
# test statistic , pvalue == uncertainity

(0.16601088070510697, 0.8681819780607943)

In [20]:
"""Calculating the degrees of freedom"""
n = len(sample1['price'])
# degrees of freedom n-1
df = n - 1
n, df

(1000, 999)

In [21]:
"""Find the significance level"""
alpha = 0.05

In [25]:
"""Find the critical value"""
critical_value = t.ppf([alpha/2, 1-alpha], df)
critical_value


array([-1.96234146,  1.64638035])

## One Sample test for proportion
* Three years back, The precentage of ued car with automatic transmission were 23% Has it changed now?


In [None]:
"""Hypothesis test"""
# Hypothesis HO = 0.23 
            # HA != 0.23

"""
Test statistic = "z" value=? 
Critical Values = ?
Max Uncertainity = ?
Computed p-value = ?  uncertainity
Decision on HO = ?
""" 

In [31]:
"""Count of gars where the gearbox type is automatic"""
count = sample1['gearbox'].value_counts()
total_number_of_observations = len(sample1['gearbox'])

count, total_number_of_observations

# given percentage is 23% ie 0.23 aka pi value

(manual       739
 automatic    246
 Name: gearbox, dtype: int64,
 1000)

In [32]:
sample1['gearbox'].value_counts() / total_number_of_observations

manual       0.739
automatic    0.246
Name: gearbox, dtype: float64

In [34]:
"""finding the z statistic"""
statistic_oneprop, pvalue_oneprop = proportions_ztest(
    count=count[1],
    nobs=total_number_of_observations,
    value=0.23,
    alternative='two-sided',
    prop_var=False
)
statistic_oneprop, pvalue_oneprop

(1.1748076237934282, 0.24007168936127266)

In [35]:
"""Using scipy's normal distribution and finding the critical value"""
critical_value_norm = norm.ppf(
    [alpha/2, 1-alpha/2],
)
critical_value_norm


array([-1.95996398,  1.95996398])

In [37]:
"""Two sample test for means"""
"""
Is the mean price of cars that have
 run 30000-60000 KM the same
  as that for cars that have run 70k - 90 k KMS
"""

km_70k_90k = cars[(cars.kilometer <= 90000)& (cars.kilometer >= 70000)]
km_30k_60k = cars[(cars.kilometer <= 60000)& (cars.kilometer >= 30000)]

sample_70k_90k = km_70k_90k.sample(500, random_state=0)
sample_30k_60k = km_30k_60k.sample(500, random_state=0)

In [39]:
"""Sample Variance"""
print(sample_70k_90k.price.var())
print(sample_30k_60k.price.var())

85441991.495996
151420847.25779158


In [40]:
"""Sample Mean"""
print(sample_70k_90k.price.mean())
print(sample_30k_60k.price.mean())

10182.886
14628.582


In [42]:
"""To find the samples are equal or unequal mean"""
"""This is going to be calculated by the f statistic"""
F_statistic = sample_70k_90k.price.var()/sample_30k_60k.price.var()
F_statistic


0.5642683490638007

In [43]:
"""Calculate the degree of freedom for both the samples"""
n1 = len(sample_70k_90k)
n2 = len(sample_30k_60k)

# degrees of freedom n-1


df1 = n1 - 1
df2 = n2 - 1

# cumulative distribution function.
f.cdf(
    F_statistic,
    df1,
    df2
)

1.1099583615573025e-10

In [44]:
f.ppf(
    [alpha/2, 1-alpha/2],
    df1,df2
)

array([0.83888578, 1.1920574 ])

In [46]:
"""Do a weltch t test for unequal variance"""
statistic_two_mean, pvalue_two_mean = ttest_ind(
    sample_30k_60k.price,
    sample_70k_90k.price,
    equal_var=False
)
statistic_two_mean, pvalue_two_mean

(6.459163816617437, 1.6980789910248335e-10)

In [49]:
"""To get critical values we need degrees of freedom"""
n1 = len(sample_30k_60k)
n2 = len(sample_70k_90k)

s1 = sample_30k_60k.price.var()
s2 = sample_70k_90k.price.var()


"""Using the formula shown in the video @19:48"""

df = (
    (
        s1/n1 + s2/n2
    )**2 / (
        (s1/n1)**2 / (n1-1) + (s2/n2)**2 / (n2-1)
    )
)
df


926.1392125693823

In [51]:
critical_value_t = t.ppf(
    [alpha/2, 1-alpha/2],
    df
)
critical_value

array([-1.96234146,  1.64638035])

In [53]:
"""Two sample test for proportion"""
"""
Are the proportion petrol cars in two different time periods 2009-2013 and 2014-2018 different?
"""


"""Subsettings records based on year and drawing samples of 1000 records from each"""

year_2014_2018 = cars[(cars.yearOfRegistration >= 2014) & (cars.yearOfRegistration <= 2018)]
year_2009_2013 = cars[(cars.yearOfRegistration >= 2009) & (cars.yearOfRegistration <= 2013)]

sample_year_2014_2018 = year_2014_2018.sample(1000, random_state=0)
sample_year_2009_2013 = year_2009_2013.sample(1000, random_state=0)


In [64]:
count_1 = sample_year_2014_2018.fuelType.value_counts()
count_2 = sample_year_2009_2013.fuelType.value_counts()

number_of_observations  = [len(sample_year_2014_2018), len(sample_year_2009_2013)]
# number of petrol cars / total number of cars in the sample.
print(count_1[0]/number_of_observations[0])
print(count_2[0]/number_of_observations[1])


0.463
0.487


In [66]:
"""Find the p value and critical value"""
statistic, p_value = proportions_ztest(
    count = count_1[0],
    nobs = number_of_observations[0],
    value = 0,
    alternative = 'two-sided',
    prop_var = False
)

print(statistic, p_value)

critical_value = norm.ppf(
    [alpha/2, 1-alpha/2]
)
critical_value

29.363197934210948 1.621247895904571e-189


array([-1.95996398,  1.95996398])

In [67]:
"""Chi-square test of independence"""
""" Is Vehicle Type (Petrol/Diesel) independent of the year of fuelType?"""


# Cross table between fuelType and vehicleType
cross_table=pd.crosstab(cars['fuelType'], cars['vehicleType'])
cross_table

vehicleType,bus,cabrio,coupe,limousine,others,small car,station wagon,suv
fuelType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
cng,31,1,1,6,2,11,16,0
diesel,2191,194,320,3346,142,779,4067,1115
electro,0,0,0,1,0,9,0,0
hybrid,0,0,2,19,1,5,5,3
lpg,70,32,46,198,3,50,117,90
other,0,1,1,3,0,0,1,0
petrol,1050,2398,1610,6218,98,5470,2539,573


In [70]:
scipy.stats.chi2_contingency(cross_table)
# the first output is the chi squared statistics 
# and the second output is the p value, which is 0.0

(6863.792414483225,
 0.0,
 42,
 array([[6.92115121e+00, 5.43834323e+00, 4.10050251e+00, 2.02767778e+01,
         5.09456373e-01, 1.30967565e+01, 1.39686310e+01, 3.68838130e+00],
        [1.23705400e+03, 9.72023877e+02, 7.32904523e+02, 3.62417585e+03,
         9.10578346e+01, 2.34085263e+03, 2.49668738e+03, 6.59243917e+02],
        [1.01781635e+00, 7.99756358e-01, 6.03015075e-01, 2.98187909e+00,
         7.49200548e-02, 1.92599360e+00, 2.05421045e+00, 5.42409015e-01],
        [3.56235724e+00, 2.79914725e+00, 2.11055276e+00, 1.04365768e+01,
         2.62220192e-01, 6.74097762e+00, 7.18973656e+00, 1.89843155e+00],
        [6.16796711e+01, 4.84652353e+01, 3.65427136e+01, 1.80701873e+02,
         4.54015532e+00, 1.16715212e+02, 1.24485153e+02, 3.28699863e+01],
        [6.10689813e-01, 4.79853815e-01, 3.61809045e-01, 1.78912746e+00,
         4.49520329e-02, 1.15559616e+00, 1.23252627e+00, 3.25445409e-01],
        [2.03115432e+03, 1.59599379e+03, 1.20337688e+03, 5.95063792e+03,
         1.495

In [71]:
pd.crosstab(
    cars['fuelType'],
    cars['vehicleType'],
    margins=True
)

vehicleType,bus,cabrio,coupe,limousine,others,small car,station wagon,suv,All
fuelType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
cng,31,1,1,6,2,11,16,0,68
diesel,2191,194,320,3346,142,779,4067,1115,12154
electro,0,0,0,1,0,9,0,0,10
hybrid,0,0,2,19,1,5,5,3,35
lpg,70,32,46,198,3,50,117,90,606
other,0,1,1,3,0,0,1,0,6
petrol,1050,2398,1610,6218,98,5470,2539,573,19956
All,3342,2626,1980,9791,246,6324,6745,1781,32835


In [73]:
6863.7924 / 32835


0.20903890360895386