# Core Concepts in Machine Learning

In [2]:
from sklearn.metrics import (
    mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score,
    accuracy_score, precision_score, recall_score, 
    roc_auc_score, roc_curve, auc, confusion_matrix
)
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score 
from sklearn.tree import DecisionTreeClassifier


import pandas as pd

## Metrics

In [3]:
# Load the movie review data (fetched over the network).
df_reviews = pd.read_csv('https://tinyurl.com/moviereviewsdata')

# Raw (unscaled) predictors shared by both demonstration models.
X = df_reviews[
    [
        'word_count',
        'age',
        'review_year',
        'release_year',
        'length_minutes',
        'children_in_home',
        'total_reviews',
    ]
]

y = df_reviews['rating']             # target for the linear regression
y_class = df_reviews['rating_good']  # target for the logistic regression

model_lin_reg = LinearRegression().fit(X, y)

# note that sklearn uses regularization by default for logistic regression
# The predictors here are not standardized, and with the default max_iter=100
# the solver stopped at the iteration limit (ConvergenceWarning), so the
# classification metrics below came from a model that had not converged.
# Allow more iterations; if the warning still appears, raise the limit
# further or standardize the features first.
model_log_reg = LogisticRegression(max_iter=10_000).fit(X, y_class)

# In-sample predictions: both models are evaluated on the data they were fit on.
y_pred_linreg = model_lin_reg.predict(X)
y_pred_logreg = model_log_reg.predict(X)

# regression metrics
rmse = root_mean_squared_error(y, y_pred_linreg)
mae = mean_absolute_error(y, y_pred_linreg)
r2 = r2_score(y, y_pred_linreg)

# classification metrics
accuracy = accuracy_score(y_class, y_pred_logreg)
precision = precision_score(y_class, y_pred_logreg)
recall = recall_score(y_class, y_pred_logreg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Generalization

In [4]:
# Predictors and target for the rating regression
X = df_reviews[
    [
        'word_count',
        'age',
        'review_year',
        'release_year',
        'length_minutes',
        'children_in_home',
        'total_reviews',
    ]
]
y = df_reviews['rating']

# Hold out a quarter of the rows as a test set; the seed keeps the split fixed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123
)

# Fit on the training portion only
model_linreg_train = LinearRegression()
model_linreg_train.fit(X_train, y_train)

# Predict on the data the model saw (train) and the data it did not (test)
y_pred_train = model_linreg_train.predict(X_train)
y_pred_test = model_linreg_train.predict(X_test)

# RMSE for each portion
rmse_train = root_mean_squared_error(y_train, y_pred_train)
rmse_test = root_mean_squared_error(y_test, y_pred_test)

# Side-by-side comparison of training and test error
pd.DataFrame(
    {
        'prediction': ['Train', 'Test'],
        'rmse': [rmse_train, rmse_test],
    }
).round(3)

Unnamed: 0,prediction,rmse
0,Train,0.515
1,Test,0.53


## Cross-validation

In [5]:
# Keep only the columns whose names end in '_sc' (the standardized features)
X = df_reviews.filter(regex='_sc$')
y = df_reviews['rating_good']

# L2-penalized logistic regression evaluated with 5-fold cross-validation.
# Cs lists the candidate (inverse) penalty values; only one is supplied here,
# so the folds are used for scoring rather than for choosing a penalty.
model_logistic_l2 = LogisticRegressionCV(
    Cs=[1],
    penalty='l2',
    cv=5,
    max_iter=1000,
    verbose=False,
)
model_logistic_l2.fit(X, y)

# model_logistic_l2.scores_  # show the accuracy score for each fold

# Average the per-fold accuracy scores stored under key 1
model_logistic_l2.scores_[1].mean()

0.671

## Tuning

In [6]:
# Reuse X and y from the previous example and carve out a 25% test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Candidate values of C for the grid search
param_grid = {'C': [0.1, 1, 2, 5, 10, 20]}

# 5-fold cross-validation over the grid, scored by accuracy.
# Note that LogisticRegression by default is ridge regression for scikit-learn
model_logistic_grid = GridSearchCV(
    LogisticRegression(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
model_logistic_grid.fit(X_train, y_train)

# Winning estimator and its C value, kept for inspection
best_model = model_logistic_grid.best_estimator_
best_param = model_logistic_grid.best_params_['C']

# Accuracy of the selected model on the training and test sets
acc_train = model_logistic_grid.score(X_train, y_train)
acc_test = model_logistic_grid.score(X_test, y_test)

## Pipelines

In [7]:
# Chain the steps: mean imputation -> standardization -> cross-validated
# L2 logistic regression
logistic_cv_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    LogisticRegressionCV(
        Cs=[1],
        penalty='l2',
        cv=5,
        max_iter=1000,
    ),
)

# Every step is fit on the training data only
logistic_cv_pipeline.fit(X_train, y_train)

# Test-set accuracy of the fitted pipeline
y_pred = logistic_cv_pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

# To persist the fitted pipeline:
# from joblib import dump, load
# dump(logistic_cv_pipeline, 'logistic_cv_pipeline.joblib')

0.692

## Guided Exploration

In [8]:
# # import the metrics and model you want
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import accuracy_score, roc_auc_score, recall_score 
# from sklearn.tree import DecisionTreeClassifier

# pipeline = make_pipeline(
#     SimpleImputer(strategy='mean'),
#     StandardScaler(),
#     RandomizedSearchCV(
#         DecisionTreeClassifier(), 
#         param_distributions={'max_depth': [2, 5, 7]}, 
#         cv=5, 
#         scoring='???',  # change to some other metric
#     ),
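# )

# # fit the pipeline on the training data, then predict on the test set
# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)

# # extract the best model from the pipeline
# best_model = pipeline.named_steps['randomizedsearchcv'].best_estimator_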

# # extract the best parameter from the pipeline
# best_model.max_depth

# # ???(y_test, y_pred) # use your chosen metric on the test set