# <center> CLASSIFICATION, REGRESSION AND MODEL SELECTION USING PANDAS<br/><br/> CSCAR WORKSHOP <br/><br/> 06/15/2018
## <center> Marcio Duarte Albasini Mourao

# <center> Setup for Anaconda / Jupyter Notebook

<ul>
    <li>Go to the page https://marcio-mourao.github.io/</li>
    <li>Download the materials under "Scikit-Learn: Classification, Regression and Model Selection" to your "username/Documents"</li><br/>
    
    <li>Click the Windows button (Bottom Left Corner)</li>
    <li>Click "All apps"</li>
    <li>Click "Anaconda3"</li>
    <li>Click "Anaconda Prompt" </li>
    <ul>
        <li>Enter "conda update pandas"</li>
        <li>Enter "conda update scikit-learn"</li>
    </ul><br/>
    
    <li>Click the Windows button (Bottom Left Corner)</li>
    <li>Click "All apps"</li>
    <li>Click "Anaconda3"</li>
    <li>Click "Jupyter Notebook" </li>
    <li>Click "Workshop.ipynb" (this should open a new tab in the browser)</li>
</ul>

# <center> Introduction

<ul>
  <li>Please sign the sign-up sheet!</li>
  <li>Don't forget to visit http://cscar.research.umich.edu/ to see what we're offering!</li>
  <li>For any questions or feedback, you can send an email to <a href="mailto:mdam@umich.edu" target="_top">Marcio</a>.</li>
</ul>

# <center> Summary of this workshop

<ul>
  <li>Summary of Python Data Types</li>
  <li>Regression using Pandas DataFrames</li>
  <li>Classification using Pandas DataFrames</li>
</ul>



# <center> References

<ul>
  <li>https://www.continuum.io/anaconda-overview</li>
  <li>http://www.numpy.org/</li>
  <li>http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html</li>
  <li>http://pandas.pydata.org/pandas-docs/stable/10min.html</li>
  <li>http://matplotlib.org/</li>
  <li>http://www.statsmodels.org/stable/</li>
</ul>

## Imports relevant packages for this session

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Import statsmodels (formula interface) and the SciPy statistics helpers.
# zscore and pearsonr are imported from the public scipy.stats namespace:
# the scipy.stats.stats submodule is a deprecated private module.
import statsmodels.formula.api as smf
from scipy.stats import zscore, pearsonr

# Import scikit-learn and patsy modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from patsy import dmatrices

In [None]:
# Report the versions of the interpreter and of the core libraries in use
import sys

print(sys.version)

for library in (np, pd):
    print(library.__version__)

# <center> Summary of Python Data Types

## Python Simple Data Types
##### Integers
##### Floats
##### Booleans

## Python Data Structures

### Lists

In [None]:
# A list is an ordered, mutable container whose elements may have mixed types
example_list = [2, 4, 'fg', 8, [3, 4]]

print(type(example_list))
print(example_list)
print(example_list[0])       # first element
print(example_list[2:4])     # slice: indices 2 and 3 (the stop index is excluded)
print(example_list[-2])      # negative indices count from the end
print(example_list[4][0])    # first element of the nested list

# Lists are mutable: replace the element at index 1 in place
example_list[1] = 100
print(example_list)

### Tuples

In [None]:
# A tuple is an ordered sequence that, unlike a list, cannot be modified
example_tuple = (2, 'df', 6, 8, 10)

print(example_tuple)
print(example_tuple[3])      # fourth element
print(example_tuple[1])      # second element
# example_tuple[2] = 20      # This should produce an error (tuples are immutable)

### Dictionary

In [None]:
# A dictionary maps keys to values; elements are accessed by key, not by position
example_dictionary = {'A': 20, 'B': 40, 'C': 60}

print(example_dictionary)
print(example_dictionary['B'])

# Overwrite the value stored under an existing key
example_dictionary.update({'C': 100})
print(example_dictionary)
# print(example_dictionary[0])  # This should produce an error (0 is not a key)

### Numpy arrays

In [None]:
# A numpy array holds elements of a single type: because one of the inputs is
# the string '4', every element of this array is stored as a string
example_array = np.array([2, 4, '4', 8, 10])

print(example_array)
print(example_array[0])
print(example_array[2:4])
print(example_array[-2])

# Modifies one element of the numpy array (the value is stored as a string too)
example_array[2] = 20
print(example_array)

### Pandas Series
#### A one dimensional labeled array

In [None]:
# An example of a pandas series built from a dictionary: the keys become the
# index labels and the values become the data
example_dictionary = {'A': 20, 'B': 40, 'C': 60, 'D': 55}
example_series = pd.Series(example_dictionary)

print(example_series)
# Positional access goes through .iloc; using a bare integer key such as
# example_series[0] on a label-indexed series is deprecated in recent pandas
print(example_series.iloc[0])
print(example_series['A'])     # label-based access
print(example_series['B':])    # label-based slice, from 'B' to the end

# <center> Pandas dataframes
### <center> A two-dimensional labeled data structure with columns of potentially different types

In [None]:
# Creation with a list of rows: each inner list becomes one row of the dataframe
aux_list = [
    ['ds', 1.0],
    ['as', 3],
    ['bq', 5],
]

example_DF = pd.DataFrame(
    data=aux_list,
    index=['Row1', 'Row2', 'Row3'],
    columns=['Col1', 'Col2'],
)
example_DF

In [None]:
# Creation with a numpy array: a 3x2 array of random integers between 0 and 9
example_DF = pd.DataFrame(
    data=np.random.randint(0, 10, size=(3, 2)),
    index=['Row1', 'Row2', 'Row3'],
    columns=['Col1', 'Col2'],
)
example_DF

In [None]:
# Creation with a dictionary: each key becomes a column label.
# 'Col2' is a series carrying its own index (1, 2, 3), which the dataframe adopts.
example_DF = pd.DataFrame(
    data={
        'Col1': range(3),
        'Col2': pd.Series([4, 5, 6], index=[1, 2, 3]),
    }
)
example_DF

## <center> Regression using Pandas Dataframes

In [None]:
#Displays signature of the function
?pd.read_csv

In [None]:
# Loads the advertising data from 'advertising.csv' into a dataframe named 'advs'
advs = pd.read_csv('advertising.csv')
advs

In [None]:
# Displays the type of the object we are working with (the 'advs' object loaded above)
type(advs)

In [None]:
# Obtains the number of rows and columns of the dataframe, as a (rows, columns) tuple
advs.shape

In [None]:
# Lists the data type (dtype) of each column of the dataframe
advs.dtypes

In [None]:
# Provides a statistical summary of the data (by default, only numeric columns are summarized)
advs.describe()

In [None]:
# Optional step, left disabled: standardize each column of the dataframe (z-scores)
#advs = advs.apply(zscore, result_type = 'broadcast')
#advs.describe()

In [None]:
# Fit a simple linear regression (ordinary least squares) of Sales on TV only
lrmfit = smf.ols(formula='Sales ~ TV', data=advs).fit()

In [None]:
# Print the summary table of the fitted single-covariate model
print(lrmfit.summary())

In [None]:
# Creates a scatter plot of 'Sales' as a function of 'TV' and overlays the fitted line.
# Each artist carries its own label so the legend cannot mislabel them: passing a
# bare list of labels to plt.legend() pairs labels with artists by position, and
# that position depends on the matplotlib version.
plt.figure()
plt.scatter(advs['TV'], advs['Sales'], color='blue', label='Raw Data')
plt.plot(advs['TV'], lrmfit.predict(), color='red', linewidth=3, label='Fit')
plt.xlabel('TV')
plt.ylabel('Sales')
plt.legend();

In [None]:
# Compare two ways of obtaining R2 for the one-covariate model:
# the squared pearson correlation and the value reported by the fitted model
r2 = pearsonr(advs['TV'], advs['Sales'])[0] ** 2

print(r2)
print(lrmfit.rsquared)

In [None]:
# Obtain the pearson correlation between TV and Sales as well as the associated
# p-value; pr[0] is the correlation and pr[1] the p-value
pr = pearsonr(advs.TV,advs.Sales)

In [None]:
# Obtain the regression coefficient (slope of Sales on TV) from the pearson
# correlation: slope = r * std(Sales) / std(TV)
pr[0]*(np.std(advs.Sales)/np.std(advs.TV))

In [None]:
# Fit a linear regression model of Sales on all three covariates (TV, Radio, Newspaper)
mlrmfit = smf.ols('Sales ~ TV + Radio + Newspaper', data=advs).fit()

In [None]:
# Prints the summary table of the fitted multiple regression model
print(mlrmfit.summary())

In [None]:
# Obtain predictions for two new observations; the column names must match
# the covariate names used in the model formula
new_obs_DT = pd.DataFrame({
    'TV': [145, 170],
    'Radio': [20, 25],
    'Newspaper': [25, 40],
})

print(new_obs_DT)
print(mlrmfit.predict(new_obs_DT))

## <center> Classification using Pandas Dataframes

In [None]:
# Loads the wine data from 'wine.csv' into a dataframe named 'wines'
wines = pd.read_csv('wine.csv')
wines

In [None]:
# Displays the type of the object we are working with (the 'wines' object loaded above)
type(wines)

In [None]:
# Obtains the number of rows and columns of the dataframe, as a (rows, columns) tuple
wines.shape

In [None]:
# Lists the data type (dtype) of each column of the dataframe
wines.dtypes

In [None]:
# Provides a statistical summary of the data (by default, only numeric columns are summarized)
wines.describe()

In [None]:
# Provides a statistical summary of every column (include='all' adds the non-numeric variables as well)
wines.describe(include='all')

In [None]:
# Removes the columns that will not be used in the classification model
wines = wines.drop(
    columns=['white', 'color', 'free.sulfur.dioxide', 'density', 'quality']
)
wines.head()

In [None]:
# Another way of getting all columns listed: the column labels of the dataframe
wines.columns

In [None]:
# Rename the columns by stripping the dots from their names, so that they can
# be used as plain variable names in a model formula
wines.columns = wines.columns.str.replace('.', '', regex=False)

In [None]:
# Converts column 'good' into integer codes; pd.factorize numbers the distinct
# values in order of first appearance, so the first value seen becomes 0
wines['good'] = pd.factorize(wines['good'])[0]

In [None]:
# Set formula for classification: 'good' is the response, '-1' removes the
# intercept column, and the remaining terms are the predictors
formula = (
    'good ~ -1 + fixedacidity + volatileacidity + citricacid + residualsugar + '
    'chlorides + totalsulfurdioxide + pH + sulphates + alcohol'
)

In [None]:
# Obtain the design matrices from the formula: y holds the response ('good')
# and X the predictors; both are returned as dataframes
y, X = dmatrices(formula, wines, return_type = 'dataframe')

In [None]:
# The dependent variable needs to be a unidimensional vector rather than a
# dataframe, so keep only the values of the 'good' column
y = y['good'].values

In [None]:
# Split the data into training (80%) and test (20%) sets; the split is seeded
# for reproducibility and stratified so both sets keep the class proportions of y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    stratify=y,
    random_state=1234,
)

In [None]:
# Scale the features using statistics learned from the training set only.
# The scaler is fitted on X_train and that same fitted scaler is then applied
# to X_test: standardizing the test set with its own mean and standard
# deviation would put the two sets on different scales, so the thresholds
# learned by the model on X_train would not line up with the values in X_test.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Initialize Random Forest with the gini impurity as split criterion.
# The seed is fixed (same value as the train/test split) so that the grid
# search below gives the same results every time the notebook is run.
RF = RandomForestClassifier(criterion='gini', random_state=1234)

In [None]:
# Set the grid of hyper-parameter values to explore:
# max_features in 2..6 and n_estimators in 10, 20, ..., 50
RFOpts = dict(
    max_features=np.arange(2, 7),
    n_estimators=np.arange(10, 60, 10),
)

In [None]:
# Performs an exhaustive search over the grid of options, scoring every
# combination with 10-fold cross-validation on the training data
gridCV = GridSearchCV(
    estimator=RF,
    param_grid=RFOpts,
    cv=10,
    return_train_score=True,
)
modelGrid = gridCV.fit(X_train, y_train)

In [None]:
# Displays the best cross-validation score of all grid points, rounded to two decimals
print(np.round(modelGrid.best_score_,2))

In [None]:
# Obtain the model predictions for the test set and gather the class
# probabilities and the predicted classes in one single dataframe
y_pred_prob = modelGrid.predict_proba(X_test)    # probability predictions
y_pred_class = modelGrid.predict(X_test)         # class predictions

y_pred = pd.DataFrame(y_pred_prob, columns=['Prob Class A', 'Prob Class B'])
y_pred = y_pred.assign(**{'Predicted Class': y_pred_class})
y_pred.head(10)

In [None]:
# Display the accuracy score of the predictions on the test set, rounded to two decimals
print('Model Accuracy Score: ', np.round(metrics.accuracy_score(y_test, y_pred['Predicted Class']),2))

In [None]:
# Obtains the confusion matrix (rows: real classes, columns: predicted classes)
# and displays it as proportions of the total number of test observations
cm = metrics.confusion_matrix(y_test, y_pred['Predicted Class'].values)
cm_df = pd.DataFrame(
    cm,
    index=['Real Class A', 'Real Class B'],
    columns=['Predicted Class A', 'Predicted Class B'],
)
cm_df = cm_df.div(cm_df.sum().sum())
cm_df