# Unpaired two-sample t-test

## Libraries and settings

In [None]:
# Libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

# Seaborn theme: "ticks" axes style combined with the "pastel" colour palette
sns.set_theme(palette="pastel", style="ticks")

# Suppress every warning for the remainder of the session
import warnings
warnings.filterwarnings(action="ignore")

# Report the current working directory of the kernel
working_dir = os.getcwd()
print(working_dir)

## Read example data (used car data)

In [None]:
# Load the 't-test_data' sheet of the Excel workbook into a data frame
df = pd.read_excel(io='t-tests_and_ANOVA.xlsx',
                   sheet_name='t-test_data')

# Frequency of each category in the 'Make' column
make_counts = df['Make'].value_counts()
print(make_counts)

# Preview the first rows of the data frame
df.head()

## Create grouped boxplot (groups = make, values = car prices)

In [None]:
# Horizontal boxplot of 'Price', one box per category of 'Make'
fig, ax = plt.subplots(figsize=(8, 2))
sns.boxplot(data=df,
            x="Price",
            y="Make",
            orient='h',
            palette=['r', 'g'],
            ax=ax)
ax.grid()
plt.show()

## Unpaired two-sample t-test

In [None]:
# Split the data frame into one subset per make
vw = df[df['Make'].eq('VW')]
bmw = df[df['Make'].eq('BMW')]

# Independent two-sample t-test on the prices of both subsets
result = stats.ttest_ind(vw['Price'], bmw['Price'])

# Report test statistic and p-value, each rounded to five decimals
print('Test-statistic:', result.statistic.round(5), 'p-value',  result.pvalue.round(5))

## Interpretation of result

<p> Because the p-value is lower than 0.05, the null hypothesis can be rejected, i.e. there is evidence that the used car prices of VW and BMW differ. Note that the p-value alone does not tell which brand shows higher or lower prices (for that, compare the group means or look at the sign of the test statistic). It only shows that there is a statistically significant difference (at the 5% significance level) between the prices.</p>

## Importing apartment data

In [None]:
# Columns kept from the apartment data, in the order they should appear
selected_columns = [
    'web-scraper-order',
    'address_raw',
    'lat',
    'lon',
    'bfs_number',
    'bfs_name',
    'rooms',
    'area',
    'luxurious',
    'price',
    'price_per_m2',
    'pop_dens',
    'frg_pct',
    'mean_taxable_income',
    'dist_supermarket',
]

# Read the semicolon-separated file, then reduce it to the selected columns
apartments_all = pd.read_csv('apartments_data_enriched_cleaned.csv',
                             sep=';',
                             encoding='utf-8')
df = apartments_all[selected_columns]

# Number of rows and columns
print(df.shape)

# Preview the first five records
df.head(5)

## Create new binary variable separating densely populated municipalities (pop_dens >= 1000) from all other municipalities

In [None]:
# Flag rows with pop_dens of at least 1000 as 1 and all other rows as 0
is_dense = df['pop_dens'].ge(1000)
df['pop_dens_binary'] = is_dense.astype(int)
df.head(5)

## Create pivot table with mean price_per_m2

In [None]:
# Mean and number of observations of price_per_m2 per pop_dens_binary group.
# The aggregations are passed as strings: handing the NumPy callable np.mean
# to pandas is deprecated in recent pandas versions, whereas the string
# 'mean' selects the same aggregation and yields the same column labels.
pd.pivot_table(df[['pop_dens_binary', 'price_per_m2']],
               index=['pop_dens_binary'],
               values=['price_per_m2'],
               aggfunc=['mean', 'count'])

## Create grouped boxplot (groups = pop_dens_binary, values = price_per_m2)

In [None]:
# Horizontal boxplot of 'price_per_m2', one box per value of 'pop_dens_binary'
fig, ax = plt.subplots(figsize=(8, 2))
sns.boxplot(data=df,
            x="price_per_m2",
            y="pop_dens_binary",
            orient='h',
            palette=['r', 'g'],
            ax=ax)
ax.grid()
plt.show()

## Unpaired two-sample t-test

In [None]:
# Create subsets (groups).
# pop_dens_binary is 1 where pop_dens >= 1000 (the densely populated
# municipalities) and 0 for all other municipalities, so the subset names
# must follow that coding.
densely_populated = df.loc[df['pop_dens_binary'] == 1]
not_densely_populated = df.loc[df['pop_dens_binary'] == 0]

# Unpaired two-sample t-test on price_per_m2.
# With this argument order a positive test statistic means that the densely
# populated group has the higher mean price per m2.
result = stats.ttest_ind(densely_populated['price_per_m2'],
                         not_densely_populated['price_per_m2'])

# Print result (test statistic and p-value, rounded to five decimals)
print('Test-statistic:', result[0].round(5), 'p-value',  result[1].round(5))

## Interpretation of result

<p> Because the p-value is lower than 0.05, the null hypothesis can be rejected, i.e. there is evidence that the prices per m2 differ between densely populated and other municipalities. Note that the p-value alone does not tell which group of municipalities shows lower or higher prices (for that, compare the group means in the pivot table above). It only shows that there is a statistically significant difference (at the 5% significance level) between the prices.</p>

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Horizontal rule that frames the footer block
rule = '-----------------------------------'
# Current local time, formatted as year-month-day hour:minute:second
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(rule)
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', timestamp)
print('Python Version:', python_version())
print(rule)