# Basics

In [None]:
# Load CSV Using Python Standard Library
import csv
import numpy
filename = 'pima-indians-diabetes.data.csv'
# csv.reader needs a text-mode file in Python 3; newline='' leaves line-ending
# handling to the csv module, and the with-block closes the file afterwards.
with open(filename, 'r', newline='') as raw_data:
    reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
    x = list(reader)
# Convert the list of string rows into a 2-D float array.
data = numpy.array(x).astype('float')
print(data.shape)

In [None]:
# Load CSV using NumPy
from numpy import loadtxt
filename = 'pima-indians-diabetes.data.csv'
# The with-block guarantees the file handle is closed once loadtxt has read it.
with open(filename, 'rb') as raw_data:
    data = loadtxt(raw_data, delimiter=",")
print(data.shape)

In [None]:
# Load CSV from URL using NumPy
from numpy import loadtxt
# In Python 3 urlopen lives in urllib.request (it was urllib.urlopen in Python 2).
from urllib.request import urlopen
# NOTE(review): this is a goo.gl short link -- confirm it still resolves to the CSV.
url = 'https://goo.gl/vhm1eU'
# The response object is a context manager, so the connection is closed for us.
with urlopen(url) as raw_data:
    dataset = loadtxt(raw_data, delimiter=",")
print(dataset.shape)

In [None]:
# Load CSV using Pandas
from pandas import read_csv
filename = 'pima-indians-diabetes.data.csv'
# The file has no header row, so the column names are supplied explicitly.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
print(data.shape)

In [None]:
# Data Types for Each Attribute
from pandas import read_csv
filename = "pima-indians-diabetes.data.csv"
# Column names without padding spaces, so columns can be looked up as 'class' etc.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# dtypes lists the inferred type of every column.
types = data.dtypes
print(types)

In [None]:
# Statistical Summary
from pandas import read_csv
from pandas import set_option
filename = "pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Option keys must be exact: padded keys match nothing, and the bare key
# 'precision' is ambiguous in recent pandas, so the full name is used.
set_option('display.width', 100)
set_option('display.precision', 3)
# count / mean / std / min / quartiles / max for every column.
description = data.describe()
print(description)

In [None]:
# Output for above:
#           preg     plas     pres     skin     test     mass     pedi      age    class
# count  768.000  768.000  768.000  768.000  768.000  768.000  768.000  768.000  768.000
# mean     3.845  120.895   69.105   20.536   79.799   31.993    0.472   33.241    0.349
# std      3.370   31.973   19.356   15.952  115.244    7.884    0.331   11.760    0.477
# min      0.000    0.000    0.000    0.000    0.000    0.000    0.078   21.000    0.000
# 25%      1.000   99.000   62.000    0.000    0.000   27.300    0.244   24.000    0.000
# 50%      3.000  117.000   72.000   23.000   30.500   32.000    0.372   29.000    0.000
# 75%      6.000  140.250   80.000   32.000  127.250   36.600    0.626   41.000    1.000
# max     17.000  199.000  122.000   99.000  846.000   67.100    2.420   81.000    1.000

In [None]:
# Class Distribution
# Reload the data with clean column names so this cell runs on its own and the
# 'class' column can be addressed without padding spaces.
from pandas import read_csv
filename = "pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Number of rows belonging to each class label.
class_counts = data.groupby('class').size()

In [None]:
Correlations Between Attributes

Correlation refers to the relationship between two variables and how they may or may not
change together. 

The most common method for calculating correlation is Pearson’s Correlation
Coefficient, which assumes a normal distribution of the attributes involved. A correlation of -1
or 1 shows a full negative or positive correlation respectively. Whereas a value of 0 shows no
correlation at all. Some machine learning algorithms like linear and logistic regression can suffer
poor performance if there are highly correlated attributes in your dataset. As such, it is a good
idea to review all of the pairwise correlations of the attributes in your dataset. You can use the
corr() function on the Pandas DataFrame to calculate a correlation matrix.

# Pairwise Pearson correlation matrix of all columns; the method name must be
# exactly 'pearson' (no surrounding spaces).
correlations = data.corr(method='pearson')

In [None]:
Skew of Univariate Distributions

Skew refers to a distribution that is assumed Gaussian (normal or bell curve) that is shifted or
squashed in one direction or another. 

Many machine learning algorithms assume a Gaussian
distribution. Knowing that an attribute has a skew may allow you to perform data preparation
to correct the skew and later improve the accuracy of your models. You can calculate the skew
of each attribute using the skew() function on the Pandas DataFrame.

# Skew of each attribute (column) of the DataFrame.
skew = data.skew()

In [None]:
#Histogram:
# Import pyplot here so this cell does not rely on an import made elsewhere.
from matplotlib import pyplot
# One histogram per column of the DataFrame.
data.hist()
pyplot.show()


In [None]:
#Density plots:
# Import pyplot here so this cell does not rely on an import made elsewhere.
from matplotlib import pyplot
# kind must be exactly 'density'; one subplot per column on a 3x3 grid,
# each with its own x-axis.
data.plot(kind='density', subplots=True, layout=(3,3), sharex=False)
pyplot.show()

In [None]:
# Box and Whisker Plots
Boxplots summarize the distribution of each attribute, drawing a line for
the median (middle value) and a box around the 25th and 75th percentiles (the middle 50% of
the data). 

The whiskers give an idea of the spread of the data and dots outside of the whiskers
show candidate outlier values (values that are 1.5 times greater than the size of spread of the
middle 50% of the data).

# Box and Whisker Plots
from matplotlib import pyplot
from pandas import read_csv
filename = "pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# kind must be exactly 'box'; axes are not shared because the attributes
# are on different scales.
data.plot(kind='box', subplots=True, layout=(3,3), sharex=False, sharey=False)
pyplot.show()

In [None]:
# Correlation Matrix Plot (generic)
from matplotlib import pyplot
from pandas import read_csv
import numpy
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
correlations = data.corr()
# plot correlation matrix
fig = pyplot.figure()
ax = fig.add_subplot(111)
# Pin the colour scale to the full correlation range [-1, 1].
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
pyplot.show()

In [None]:
Bias - Variance

High Bias - under fitting - not sensitive to changes in data at all
High Variance - over fitting

# Scaling of data
> Rescale data.
> Standardize data.
> Normalize data.
> Binarize data.

In [None]:
Rescale data:
    
When your data is comprised of attributes with varying scales, many machine learning algorithms
can benefit from rescaling the attributes to all have the same scale. Often this is referred to
as normalization and attributes are often rescaled into the range between 0 and 1.

You can rescale your data
with scikit-learn using the MinMaxScaler class.

Ex:
# NOTE(review): assumes MinMaxScaler has been imported (sklearn.preprocessing)
# and X holds the input attributes -- neither is defined in this snippet.
# Rescale into the range 0..1 given by feature_range.
scaler = MinMaxScaler(feature_range=(0, 1))
# Fit the scaler on X and return the rescaled data in one call.
rescaledX = scaler.fit_transform(X)

In [None]:
Standardization

Standardization is a useful technique to transform attributes with a Gaussian distribution and
differing means and standard deviations to a standard Gaussian distribution with a mean of
0 and a standard deviation of 1. 

It is most suitable for techniques that assume a Gaussian
distribution in the input variables and work better with rescaled data, such as linear regression,
logistic regression and linear discriminant analysis. 

You can standardize data using scikit-learn with the StandardScaler class.

# Build the scaler, fit it on X, then apply the fitted transform to X.
scaler = StandardScaler()
scaler.fit(X)
rescaledX = scaler.transform(X)

In [None]:
Normalizing

Normalizing in scikit-learn refers to rescaling each observation (row) to have a length of 1 (called
a unit norm or a vector with the length of 1 in linear algebra). This pre-processing method
can be useful for sparse datasets (lots of zeros) with attributes of varying scales when using
algorithms that weight input values such as neural networks and algorithms that use distance
measures such as k-Nearest Neighbors. 

You can normalize data in Python with scikit-learn
using the Normalizer class.

# NOTE(review): assumes Normalizer has been imported (sklearn.preprocessing)
# and X holds the input attributes -- neither is defined in this snippet.
scaler = Normalizer()
# Fit on X and return the normalized rows in one call.
rescaledX = scaler.fit_transform(X)

>>> X = [[ 1., -1.,  2.],
...      [ 2.,  0.,  0.],
...      [ 0.,  1., -1.]]
>>> X_normalized = preprocessing.normalize(X, norm='l2')

>>> X_normalized                                      
array([[ 0.40..., -0.40...,  0.81...],
       [ 1.  ...,  0.  ...,  0.  ...],
       [ 0.  ...,  0.70..., -0.70...]])


In [None]:
Binarize Data (Make Binary)

You can transform your data using a binary threshold. All values above the threshold are
marked 1 and all equal to or below are marked as 0.

# Threshold at 0.0: build the binarizer, fit it on X, then transform X.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(X)
binaryX = binarizer.transform(X)

# Feature Selection For Machine Learning

4 different automatic feature selection techniques:
> Univariate Selection.
> Recursive Feature Elimination.
> Principal Component Analysis.
> Feature Importance.

In [None]:
Statistical tests can be used to select those features that have the strongest relationship with
the output variable. 

The scikit-learn library provides the SelectKBest class that can be used
with a suite of different statistical tests to select a specific number of features.

The example below uses the chi-squared (χ²) statistical test for non-negative features to select 4 of the best
features from the Pima Indians onset of diabetes dataset.

In [None]:
# feature extraction
# NOTE(review): assumes SelectKBest and chi2 (sklearn.feature_selection) are
# imported and X, Y are defined -- none of these appear in this snippet.
from numpy import set_printoptions  # needed for the score summary below
test = SelectKBest(score_func=chi2, k=4) # 4 features
fit = test.fit(X, Y)

# summarize scores
set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)

# summarize selected features
print(features[0:5,:]) # will contain only 4 columns (k=4)

# or, equivalently, as a single chained expression:
features = SelectKBest(score_func=chi2,k=4).fit(X,Y).transform(X)
print(features[0:5,:])

In [None]:
Recursive Feature Elimination:
    
The Recursive Feature Elimination (or RFE) works by recursively removing attributes and
building a model on those attributes that remain. 

It uses the model accuracy to identify which
attributes (and combination of attributes) contribute the most to predicting the target attribute.

The example below uses RFE with the logistic regression algorithm to select the top 3 features.

In [None]:
#The Recursive Feature Elimination
# NOTE(review): assumes LogisticRegression, RFE, X and Y are defined elsewhere.
model = LogisticRegression()
# Pass the feature count by keyword: current scikit-learn rejects it positionally.
rfe = RFE(model, n_features_to_select=3) #number of features = 3

fit = rfe.fit(X,Y)
# The % formatting must happen inside print(); in Python 3 print() returns
# None, so `print(...) % value` raises TypeError.
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Output:
Num Features: 3
Selected Features: [ True False False False False True True False]
Feature Ranking: [1 2 3 5 6 1 1 4]
    

In [None]:
Principal Component Analysis (or PCA):
Uses linear algebra to transform the dataset into a compressed form. Generally this is called a data reduction technique. A property of PCA is that
you can choose the number of dimensions or principal components in the transformed result.

In [None]:
# PCA feature extraction
# NOTE(review): assumes PCA (sklearn.decomposition) and X are defined elsewhere.
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components
# Format inside print(): `print(...) % value` raises TypeError in Python 3.
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)

In [None]:
Feature Importance

Bagged decision trees like Random Forest and Extra Trees can be used to estimate the importance
of features. 

In the example below we construct an ExtraTreesClassifier for the Pima
Indians onset of diabetes dataset. 

In [None]:
# feature extraction
# NOTE(review): assumes ExtraTreesClassifier (sklearn.ensemble), X and Y are
# defined elsewhere -- they are not shown in this snippet.
model = ExtraTreesClassifier()
model.fit(X, Y)
# One importance score per input attribute, taken from the fitted model.
print(model.feature_importances_)

#Output:
    [ 0.11070069 0.2213717 0.08824115 0.08068703 0.07281761 0.14548537 0.12654214 0.15415431]

### Evaluation techniques:
> Train and Test Sets.

> k-fold Cross Validation.

> Leave One Out Cross Validation.

> Repeated Random Test-Train Splits.

In [None]:

> Train and Test Sets.

# Hold out 33% of the rows as a test set (test_size=0.33).
# NOTE(review): `seed`, X, Y, train_test_split and LogisticRegression are
# assumed to be defined/imported elsewhere -- not shown in this snippet.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33,
random_state=seed)

# Fit on the training portion only; X_test / Y_test stay unseen.
model = LogisticRegression()
model.fit(X_train, Y_train)

> K-fold:
# NOTE(review): `num_folds`, `seed`, X and Y are assumed to be defined elsewhere.
# random_state only takes effect with shuffle=True; recent scikit-learn raises
# ValueError when a seed is given without shuffling.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

model = LogisticRegression()
# One accuracy score per fold.
results = cross_val_score(model, X, Y, cv=kfold)
# Format inside print(): `print(...) % value` raises TypeError in Python 3.
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))


> Leave one out:
# Leave-one-out: every sample is used once as a single-item test set.
loocv = LeaveOneOut()

model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=loocv)
# Format inside print(): `print(...) % value` raises TypeError in Python 3.
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

> Repeated Random Test-Train Splits:
# 10 independent random 67% / 33% train/test splits.
kfold = ShuffleSplit(n_splits=10, test_size=0.33, random_state=seed)

model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)
# Format inside print(): `print(...) % value` raises TypeError in Python 3.
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))


### Classification Metrics:

> Classification Accuracy.

> Logarithmic Loss.

> Area Under ROC Curve.

> Confusion Matrix.

> Classification Report.

In [None]:

> Classification Accuracy.
# random_state only takes effect with shuffle=True; recent scikit-learn raises
# ValueError when a seed is given without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = LogisticRegression()

# Scorer names must match exactly (no surrounding spaces).
scoring = 'accuracy'

results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Format inside print(): `print(...) % value` raises TypeError in Python 3.
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))

> Logarithmic Loss.
# Scorer name must match exactly; the padded form is not a valid scorer.
scoring = 'neg_log_loss'

> Area Under ROC Curve:
# Scorer name must match exactly; the padded form is not a valid scorer.
scoring = 'roc_auc'

Sensitivity is the true positive rate, also called the recall. It is the number of instances
from the positive (first) class that were actually predicted correctly.

Specificity is also called the true negative rate. It is the number of instances from the
negative (second) class that were actually predicted correctly.

ex: If AUC is 0.8 - relatively close to 1 and greater than 0.5, suggesting some skill in the predictions

> Confusion Matrix.

The table presents predictions on the x-axis and accuracy outcomes on the y-axis.

# NOTE(review): `test_size`, `seed`, X, Y and the scikit-learn names used here
# are assumed to be defined/imported elsewhere -- not shown in this snippet.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size,
random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
# Predict on the held-out rows only.
predicted = model.predict(X_test)

# Compare the true test labels against the predictions.
matrix = confusion_matrix(Y_test, predicted)


> Classification Report.

Gives a quick idea of the accuracy of a model using a number of measures such as precision, recall, F1-score and support for each
class.

# NOTE(review): `test_size`, `seed`, X, Y and the scikit-learn names used here
# are assumed to be defined/imported elsewhere -- not shown in this snippet.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size,
random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
# Predict on the held-out rows only.
predicted = model.predict(X_test)

# Build the report from the true test labels and the predictions.
report = classification_report(Y_test, predicted)



### Regression Metrics:
    
> Mean Absolute Error.

> Mean Squared Error.

> R squared/coefficient of determination.

In [None]:

> Mean Absolute Error.
# random_state only takes effect with shuffle=True; recent scikit-learn raises
# ValueError when a seed is given without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = LinearRegression()
# Scorer name must match exactly (no surrounding spaces).
scoring = 'neg_mean_absolute_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

A value of 0 indicates no error or perfect predictions

> Mean Squared Error.
# Scorer name must match exactly; the padded form is not a valid scorer.
scoring = 'neg_mean_squared_error'
results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

> R squared/coefficient of determination.
# Scorer name must match exactly; the padded form is not a valid scorer.
scoring = 'r2'

This is typically a value between 0 and 1 for no fit and perfect fit respectively (it can be negative when the model fits worse than always predicting the mean).



# Six classification algorithms

In [None]:
Linear learning algorithms:
    
1. Logistic Regression.
2. Linear Discriminant Analysis.

Nonlinear learning algorithms:
3. k-Nearest Neighbors.
4. Naive Bayes.
5. Classification and Regression Trees.
6. Support Vector Machines

## Linear regression vs Logistic regression

In linear regression, the outcome (dependent variable) is continuous. It can have any one of an infinite number of possible values. 

In logistic regression, the outcome (dependent variable) has only a limited number of possible values.

For instance, if X contains the area in square feet of houses, and Y contains the corresponding sale price of those houses, you could use linear regression to predict selling price as a function of house size. While the selling price cannot actually take just any value, there are so many possible values that a linear regression model would be chosen.

If, instead, you wanted to predict, based on size, whether a house would sell for more than 200K, you would use logistic regression. The possible outputs are either Yes, the house will sell for more than 200K, or No, the house will not.



#### What does linear mean?

Linear regression requires a linear model. No surprise, right? But what does that really mean?

A model is linear when each term is either a constant or the product of a parameter and a predictor variable. A linear equation is constructed by adding the results for each term. This constrains the equation to just one basic form:

Response = constant + parameter * predictor + ... + parameter * predictor

$Y = b_0 + b_1 X_1 + b_2 X_2 + \dots + b_k X_k$

While a linear equation has one basic form, nonlinear equations can take many different forms.
Literally, it’s not linear. If the equation doesn’t meet the criteria above for a linear equation, it’s nonlinear.

## Regression Algorithms:
#### Linear machine learning algorithms:
1. Linear Regression.
2. Ridge Regression.
3. LASSO Linear Regression.
4. Elastic Net Regression.

#### Nonlinear machine learning algorithms:
5. k-Nearest Neighbors.
6. Classification and Regression Trees.
7. Support Vector Machines.

## Classification algorithms:

#### Linear algorithms
1. Logistic Regression 
2. Linear Discriminant Analysis
   
#### Nonlinear algorithms:
3. k-Nearest Neighbors
4. Naive Bayes
5. Classification and Regression Trees

In [None]:
# Spot-check several classifiers with the same cross-validation splitter.
# NOTE(review): assumes `kfold`, X, Y and the scikit-learn estimator classes
# are defined/imported elsewhere. Pasted results are kept as comments so the
# cell remains valid Python.
model_svm=SVC()
results_svm=cross_val_score(model_svm,X,Y,cv=kfold)
print ("SVM: %s" % results_svm.mean())
# SVM: 0.651025290499

model_knn = KNeighborsClassifier()
results_knn = cross_val_score(model_knn,X,Y,cv=kfold)
print ("KNN: %s" % results_knn.mean())
# KNN: 0.726555023923


model_decision = DecisionTreeClassifier()
results_decision = cross_val_score(model_decision,X,Y,cv=kfold)
print ("Decision Tree: %s" % results_decision.mean())
# Decision Tree: 0.691285030759


model_LDA = LinearDiscriminantAnalysis()
results_model_LDA = cross_val_score(model_LDA,X,Y,cv=kfold)
print ("LDA: %s" % results_model_LDA.mean())
# LDA: 0.773462064252

model_LogisticRegression = LogisticRegression()
results_LogisticRegression = cross_val_score(model_LogisticRegression,X,Y,cv=kfold)
print ("Logistic Regression: %s" % results_LogisticRegression.mean())
# Logistic Regression: 0.76951469583


# Second decision-tree run; the pasted score differs from the first run.
model_decision = DecisionTreeClassifier()
results_decision = cross_val_score(model_decision,X,Y,cv=kfold)
print ("Decision Tree: %s" % results_decision.mean())
# Decision Tree: 0.699213943951