# Overfitting

- What overfitting and underfitting are
- The bias-variance trade-off


In [62]:
from sklearn.datasets import make_regression

# Synthetic regression problem: 100 samples, 80 features, of which only 5
# are informative. The seed is fixed so the data is the same on every run.
X, y = make_regression(
    n_samples=100,
    n_features=80,
    n_informative=5,
    noise=10,
    random_state=0,
)

In [63]:
from sklearn.linear_model import LinearRegression

# Fit on the whole dataset and score on that very same data, so the value
# displayed is a training-set score, not an estimate of generalization.
reg = LinearRegression()
reg.fit(X, y)
reg.score(X, y)

0.9994701710456361

In [64]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. `random_state` is pinned so the
# split -- and every score computed from it -- is identical on each run;
# without it, re-running the notebook yields different numbers every time.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(75, 80) (75,)
(25, 80) (25,)


In [65]:
# Refit using only the training split, then score on the held-out rows that
# played no part in fitting.
reg = LinearRegression()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

0.881026897881778

In [71]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Synthetic classification problem: 200 samples, 50 features, 30 informative.
# Note that this rebinds X and y, replacing the regression data.
X, y = make_classification(
    n_samples=200,
    n_features=50,
    n_informative=30,
    random_state=0,
)

# Fit and score on the same data: the value displayed is training accuracy.
clf = LogisticRegression()
clf.fit(X, y)
clf.score(X, y)

0.92

In [73]:
from sklearn.model_selection import train_test_split

# Split the classification data into train and test parts. `random_state` is
# pinned so the split and the accuracies derived from it are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(150, 50) (150,)
(50, 50) (50,)


In [74]:
# Train on the training split only; the displayed value is accuracy on the
# held-out test split.
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8

In [81]:
# Same experiment with C=2. In scikit-learn, C is the inverse of the
# regularization strength, so a larger C means a weaker penalty.
clf = LogisticRegression(C=2)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8

# Regularization

- What regularization is
- The difference between L1 and L2 regularization
- How to regularize models in scikit-learn using different methods and values

In [66]:
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# Build the regression data inside this cell. At this point in the notebook,
# X_train / y_train hold the *classification* split, so fitting Ridge on them
# would regress on binary class labels instead of the continuous targets this
# example is meant to use. Distinct names avoid clobbering that split.
X_reg, y_reg = make_regression(
    n_samples=100, n_features=80, n_informative=5, noise=10, random_state=0
)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, random_state=0
)

# Ridge is linear regression with an L2 penalty; alpha sets its strength.
# Print the train score first, then the test score, to expose the gap.
ridge = Ridge(alpha=3).fit(X_reg_train, y_reg_train)
print(ridge.score(X_reg_train, y_reg_train))
print(ridge.score(X_reg_test, y_reg_test))

0.9971081709131027
0.7237243057950131


# Cross-validation

- Why use cross-validation
- Types of cross-validation
- Examples: how to implement Stratified K-Fold and Leave-One-Out

In [127]:
from sklearn.datasets import make_classification

# Larger synthetic classification problem for the cross-validation demo:
# 500 samples, 300 features, 100 of them informative. Rebinds X and y.
X, y = make_classification(
    n_samples=500,
    n_features=300,
    n_informative=100,
    random_state=0,
)

In [129]:
from sklearn.model_selection import StratifiedKFold, cross_validate

# Estimator to evaluate; max_iter=1000 sets the solver's iteration cap.
clf = LogisticRegression(max_iter=1000)

# Three stratified folds (no shuffling requested).
skf = StratifiedKFold(n_splits=3)

# Fit and score the estimator once per fold, recording accuracy.
cv_results = cross_validate(clf, X, y, cv=skf, scoring='accuracy')

In [130]:
# Display what cross_validate returned: per-fold fit times, score times and
# test scores.
cv_results

{'fit_time': array([0.1697731, 0.1414752, 0.082623 ]),
 'score_time': array([0.00051904, 0.00041699, 0.00033307]),
 'test_score': array([0.79640719, 0.69461078, 0.68072289])}

# Hyper-parameter tuning

- Why tune hyper-parameters
- How to do it with GridSearchCV