# Cross Validation

## Decision Tree Regression

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_iris

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the California housing data as plain arrays (features X, target y).
X, y = fetch_california_housing(return_X_y=True)
print("X has %d rows and %d columns"  %(X.shape[0],X.shape[1]))
print("y has %d rows"  %(y.shape[0]))

# Hold out 20% of the rows for testing. Both the split and the tree are seeded
# so that Restart & Run All reproduces the same scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
model = DecisionTreeRegressor(max_depth=10, random_state=42)
model.fit(X_train, y_train)

print("X_train has %d rows and %d columns"  %(X_train.shape[0],X_train.shape[1]))
print("-----------------------------------")
# Report R2 on both splits; a large train/test gap indicates overfitting.
print("The coefficient of determination for the test data is R2=%.2f"
      %(model.score(X_test, y_test)))
print("The coefficient of determination for the train data is R2=%.2f"
      %(model.score(X_train, y_train)))


X has 20640 rows and 8 columns
y has 20640 rows
X_train has 16512 rows and 8 columns
-----------------------------------
The coefficient of determination for the test data is R2=0.71
The coefficient of determination for the train data is R2=0.85


In [3]:
# Five-fold cross-validation on the training split only.
# For a regressor the default score is the coefficient of determination (R2),
# the same metric model.score() reports, so the summary is labelled R2 rather than accuracy.
scores=cross_val_score(model, X_train, y_train, cv=5)
print("Cross validation scores: ", scores)
print("Score stats: mean R2 of %0.2f with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Cross validation scores:  [0.659441   0.6804379  0.69820968 0.72812757 0.67955477]
Score stats: 0.69 accuracy with a standard deviation of 0.02


## Decision Tree Classification

In [None]:
# Load the breast cancer data as plain arrays (features X, class labels y).
X, y = load_breast_cancer(return_X_y=True)
print("X has %d rows and %d columns"  %(X.shape[0],X.shape[1]))
print("y has %d rows"  %(y.shape[0]))

# Seeded 80/20 split; the tree is seeded as well, because an unseeded tree
# can differ from run to run even on an identical split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)
model = DecisionTreeClassifier(max_depth = 20, random_state = 42)
model.fit(X_train, y_train)
# Mean accuracy on the held-out and on the training rows.
print("The (mean) accuracy on the test set is %.2f" %(model.score(X_test, y_test)))
print("The (mean) accuracy on the train data is %.2f" %(model.score(X_train, y_train)))

In [None]:
# Five-fold cross-validation on the training split; yields one score per fold.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
scores

In [None]:
# Summarise the fold scores by their mean and spread.
cv_mean, cv_std = scores.mean(), scores.std()
print("Five-fold cv results: \n %0.2f mean accuracy with a standard deviation of %0.2f" % (cv_mean, cv_std))

In [None]:
# Display the held-out test labels.
# NOTE(review): the following cell predicts on X_train, so y_train may have been
# the intended array to compare against — confirm.
y_test

In [None]:
# One cross-validated prediction per training row (5 folds).
y_pred = cross_val_predict(estimator=model, X=X_train, y=y_train, cv=5)
y_pred

### Model Comparison (3 classification estimators)

In [None]:
# Three candidate classifiers with default hyper-parameters.
# The tree is seeded so its cross-validation score is reproducible.
# NOTE(review): the import cell silences all warnings, so a LogisticRegression
# convergence warning would not be visible here — consider scaling the features
# or raising max_iter.
model1 = DecisionTreeClassifier(random_state=42)
model2 = LogisticRegression()
model3 = KNeighborsClassifier()

model_pipeline = [model1, model2, model3]
model_names = ['Classification Tree', 'Logistic Regression', 'KNN']

# Mean 5-fold CV accuracy per estimator, keyed by display name.
# zip pairs each name with its estimator instead of tracking a manual index, and
# the comprehension keeps the fitted `model` from the earlier cell from being overwritten.
scores = {
    name: np.mean(cross_val_score(estimator, X_train, y_train, cv=5))
    for name, estimator in zip(model_names, model_pipeline)
}
print(scores)

In [None]:
# Show the mean cross-validation accuracies as a one-row table.
# The compared estimators are classifiers, so the caption says "classification".
print("Comparing the 3 classification scores we find \n")

pd.DataFrame([scores], index=["score"])