# COURSE: Master Python for scientific programming by solving projects
## PROJECT: Statistics
#### TEACHER: Mike X Cohen, sincxpress.com
##### COURSE URL: udemy.com/course/maspy_x/?couponCode=202201

In [None]:
# import all necessary modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.stats as stats

# Download and inspect the data

In [None]:
# UCI red-wine quality dataset; the file is semicolon-delimited
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"

# read straight from the web into a DataFrame and display it
data = pd.read_csv(filepath_or_buffer=url, sep=';')
data

In [None]:
# describe the data: per-column summary statistics (count, mean, std, min, quartiles, max)
data.describe()


In [None]:
# report how many distinct values each column contains

for column in data.columns:
  n_unique = len(np.unique(data[column]))
  print(f"{column} has {n_unique} unique values")

In [None]:
# box plot of every column, drawn on one wide axis
fig,ax = plt.subplots(1,figsize=(17,4))
ax = sns.boxplot(data=data,ax=ax)

# tilt the column names so they don't overlap
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels,rotation=45)
plt.show()

In [None]:
# remove rows with outliers (total sulfur dioxide of 200 or more)
# .copy() makes the filtered frame independent of the original, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning
data = data[data['total sulfur dioxide']<200].copy()

In [None]:
# some exploration: residual sugar against alcohol content
sns.scatterplot(data=data,x='residual sugar',y='alcohol')
plt.show()

# pairwise plots of the acidity measures, with regression lines, colored by quality
cols2plot = ['fixed acidity','volatile acidity','citric acid','quality']
acidity_subset = data[cols2plot]
sns.pairplot(acidity_subset,kind='reg',hue='quality')
plt.show()

# T-test for acidity on wine quality




In [None]:
# t-test of volatile acidity on quality=3 vs. 8

# volatile acidity of the wines rated 3 and of the wines rated 8
acidity = data['volatile acidity']
x = acidity[data['quality']==3]
y = acidity[data['quality']==8]
ttest = stats.ttest_ind(x,y)

# show the data in a plot; small random horizontal jitter keeps points from overlapping
jitter_x = np.random.randn(len(x))/30
jitter_y = 1+np.random.randn(len(y))/30
plt.plot(jitter_x,x,'o')
plt.plot(jitter_y,y,'o')
plt.xlim([-1,2])
plt.xticks([0,1],labels=['Qual 3','Qual 8'])
plt.title(f't={ttest.statistic:.2f}, p={ttest.pvalue:.5f}')
plt.ylabel('volatile acidity')
plt.show()


In [None]:
# quality ratings to tally (3 through 8)
ratings = range(3,9)
qualcounts = np.zeros(len(ratings))

# gather counts: number of rows carrying each rating
for idx,rating in enumerate(ratings):
  qualcounts[idx] = (data['quality']==rating).sum()

# show in a bar plot
plt.bar(ratings, qualcounts)
plt.xlabel('Quality rating')
plt.ylabel('Count')
plt.show()

In [None]:
# alternative method: let pandas tally the ratings
counts = data['quality'].value_counts()
rating_values = counts.index.tolist()
plt.bar(rating_values,counts.values)
plt.show()

In [None]:
# t-test of volatile acidity on quality=3/4 vs. 7/8

# pool the two lowest and the two highest ratings into one group each
is_low  = data['quality'].isin([3,4])
is_high = data['quality'].isin([7,8])
x = data.loc[is_low,'volatile acidity']
y = data.loc[is_high,'volatile acidity']
ttest = stats.ttest_ind(x,y)

# jittered scatter of both groups, markers outlined in black
jitter_x = np.random.randn(len(x))/30
jitter_y = 1+np.random.randn(len(y))/30
plt.plot(jitter_x,x,'o',markeredgecolor='k')
plt.plot(jitter_y,y,'o',markeredgecolor='k')
plt.xlim([-1,2])
plt.xticks([0,1],labels=['Qual 3+4','Qual 7+8'])
plt.title(f't={ttest.statistic:.2f}, p={ttest.pvalue:.5f}')
plt.ylabel('volatile acidity')
plt.show()


# Multiple regression

In [None]:
import statsmodels.api as sm

In [None]:
# predict quality from all other columns

# outcome and predictors
dep_var  = data['quality']
predictors = data.drop(columns='quality')

# setup model: prepend an intercept column, then fit ordinary least squares
ind_vars = sm.add_constant(predictors)
ols_model = sm.OLS(dep_var,ind_vars)
model = ols_model.fit()

print(model.summary())

In [None]:
# names of the OLS terms with p<.05 (this can include the intercept term 'const')
significant_columns = list(model.pvalues[model.pvalues<.05].keys())
for name in significant_columns:
  print(name)

# 'quality' is added so it can be used as the hue variable in the plot below
significant_columns.append('quality')

# pairwise plots just for significant effects
# the intercept column lives only in the design matrix, not in `data`,
# so restrict the plot to names that are actual columns of `data`
plot_columns = [name for name in significant_columns if name in data.columns]
sns.pairplot(data[plot_columns],kind='reg',hue='quality')
plt.show()


In [None]:
# 


# Logistic regression

In [None]:
# binarize wine quality

# threshold: the average quality rating
binthresh = data['quality'].mean()
print(binthresh)

# True for wines rated above the average, False otherwise
data['binquality'] = data['quality'].gt(binthresh)
data

In [None]:
# list of all relevant columns to use in regression
# (everything except the outcome and its binarized version)

excluded = ['quality','binquality']
Xcols = [key for key in data.keys() if key not in excluded]

In [None]:
# logistic regression of the binarized quality on the predictor columns
# NOTE(review): no intercept column is added here, unlike the multiple-regression
#               model above, which uses sm.add_constant - confirm this is intended
outcome = data['binquality']
predictors = data[Xcols]
model = sm.Logit(outcome,predictors)
results = model.fit(method='newton')
results.summary()

In [None]:
# NOTE: There was a typo in the lecture whereby the significant columns from the 
#       standard and logistic regressions were swapped. The code below is correct.

significant_columnsL = list(results.pvalues[results.pvalues<.05].keys())

# 'quality' was appended to significant_columns for the pairplot hue; it is the
# outcome variable, not a predictor, so it is skipped when printing
print("Significant predictors from standard regression:")
for name in significant_columns:
  if name != 'quality':
    print("  "+name)

# the logistic list never had 'quality' appended, so print it in full
print(' ')
print("Significant predictors from logistic regression:")
for name in significant_columnsL:
  print("  "+name)

# Bonus: Transform to Gaussian

In [None]:
# number of samples
n = 500

# non-Gaussian test signal: a random walk (cumulative sum of normal noise)
x = np.cumsum(np.random.randn(n))

# rank the samples, scale the ranks into the open interval (-1,1),
# then stretch that interval onto the whole real line with arctanh
ranks = stats.rankdata(x)
y = 2*( ranks/(n+1) - .5 )
y = np.arctanh(y)
print(np.min(y),np.max(y))

In [None]:
# 2x2 grid: one column per signal; top row is the trace, bottom row its histogram
fig,ax = plt.subplots(2,2,figsize=(9,7))

panels = [ (x,'Original data'), (y,'Transformed data') ]
for col,(signal,title) in enumerate(panels):
  ax[0,col].plot(signal)
  ax[0,col].set_title(title)

  ax[1,col].hist(signal,bins=40)
  ax[1,col].set_title(title)

plt.show()

In [None]:
# transformed values against the original values, one square marker per sample
plt.plot(x,y,'s')
plt.xlabel('Original')
plt.ylabel('Transformed')
plt.show()