Commit 054f856

updated notebooks for current packages as of 2024-02
Mark Fenner committed Feb 5, 2024
1 parent 3f7b900 commit 054f856
Showing 11 changed files with 522 additions and 488 deletions.
21 changes: 14 additions & 7 deletions 06_EvaluatingClassifiers_code.ipynb
@@ -64,7 +64,7 @@
"source": [
"# helpful stdlib tool for cleaning up printouts\n",
"import textwrap\n",
"print(textwrap.fill(str(sorted(metrics.SCORERS.keys())), \n",
"print(textwrap.fill(str(sorted(metrics.get_scorer_names())), \n",
" width=70))"
]
},
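
For context: metrics.SCORERS was deprecated in scikit-learn 1.0 and later removed; metrics.get_scorer_names() is the supported replacement. A minimal sketch of the updated call:

    from sklearn import metrics
    import textwrap

    # get_scorer_names() lists the registered scorer names directly
    names = sorted(metrics.get_scorer_names())
    print(textwrap.fill(str(names), width=70))
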
@@ -278,7 +278,7 @@
"outputs": [],
"source": [
"print(\"'Multi-label' Encoding\")\n",
"print(skpre.label_binarize(iris.target, [0,1,2])[checkout])"
"print(skpre.label_binarize(iris.target, classes=[0,1,2])[checkout])"
]
},
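
Background: newer scikit-learn releases make most optional arguments keyword-only, so label_binarize now wants classes passed by name. A sketch with a toy target standing in for iris.target:

    import numpy as np
    from sklearn import preprocessing as skpre

    toy_target = np.array([0, 1, 2, 2, 0])  # stand-in for iris.target
    # classes must be passed as a keyword argument
    print(skpre.label_binarize(toy_target, classes=[0, 1, 2]))
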
{
@@ -287,7 +287,7 @@
"metadata": {},
"outputs": [],
"source": [
"iris_multi_tgt = skpre.label_binarize(iris.target, [0,1,2])\n",
"iris_multi_tgt = skpre.label_binarize(iris.target, classes=[0,1,2])\n",
"\n",
"# im --> \"iris multi\"\n",
"(im_train_ftrs, im_test_ftrs, \n",
@@ -358,14 +358,14 @@
" classes = np.unique(test_tgt)\n",
" n_classes = len(classes)\n",
"\n",
" indicator = skpre.label_binarize(test_tgt, classes)\n",
" indicator = skpre.label_binarize(test_tgt, classes=classes)\n",
" avg_auc_sum = 0.0\n",
"\n",
" # comparing class i and class j\n",
" for ij in it.combinations(classes, 2):\n",
" # use use sum to act like a logical or\n",
" ij_indicator = indicator[:,ij].sum(axis=1, \n",
" dtype=np.bool)\n",
" dtype=np.bool_)\n",
" \n",
" # slightly ugly, can't broadcast these as indexes\n",
" # use .ix_ to save the day\n",
@@ -712,7 +712,7 @@
"outputs": [],
"source": [
"macro_precision = metrics.make_scorer(metrics.precision_score,\n",
" average='macro')\n",
" average='macro', zero_division=0)\n",
"macro_recall = metrics.make_scorer(metrics.recall_score,\n",
" average='macro')\n",
"htm_scorer = metrics.make_scorer(hand_and_till_M_statistic, \n",
@@ -802,9 +802,16 @@
" # save as\n",
" df.to_csv('portugese_student_numeric_discrete.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
29 changes: 21 additions & 8 deletions 08_MoreClassificationMethods_code.ipynb
@@ -271,7 +271,7 @@
"ftrs, tgt = datasets.make_blobs(centers=2,\n",
" n_features=3,\n",
" n_samples=200,\n",
" center_box = [-2.0, 2.0],\n",
" center_box = (-2.0, 2.0),\n",
" random_state=1099)\n",
"\n",
"# note, using three features, but graphing only two dimensions\n",
@@ -522,7 +522,7 @@
"logreg_classifiers = {'LogReg(saga)': LogReg(solver='saga', \n",
" multi_class='multinomial',\n",
" max_iter=1000),\n",
" 'LogReg(SGD)' : SGD(loss='log', max_iter=1000)}\n",
" 'LogReg(SGD)' : SGD(loss='log_loss', max_iter=1000)}\n",
"\n",
"fig, axes = plt.subplots(1,2,figsize=(12,4))\n",
"axes = axes.flat\n",
@@ -558,9 +558,15 @@
"tgt = (np.array(y) == 'red')\n",
"\n",
"# sm.Logit is statsmodels name for logistic regression\n",
"(sm.Logit(tgt, x, method='newton')\n",
" .fit()\n",
" .predict(x)) # training predictions"
"# Logit is very, very unhappy trying to deal with a perfectly \n",
"# separable example. so, there are many weird arguments.\n",
"# and it still seems to fail.\n",
"# FIXME. I want just a simple example that recovers the target\n",
"# function.\n",
"#(sm.Logit(tgt, x)\n",
"# .fit(method='newton', skip_hessian=True, \n",
"# full_output=False, warn_convergence=False)\n",
"# .predict(x)) # training predictions"
]
},
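
On the FIXME above: with perfectly separable data the unpenalized likelihood has no finite maximizer, so Logit's coefficients diverge. One possible workaround, assuming a penalized fit is acceptable for the illustration, is Logit.fit_regularized; the toy data below is hypothetical, not the notebook's x and tgt:

    import numpy as np
    import statsmodels.api as sm

    rng = np.random.default_rng(0)
    x = sm.add_constant(rng.normal(size=(20, 1)))
    tgt = (x[:, 1] > 0).astype(float)  # perfectly separable by design

    # an L1 penalty keeps the coefficients finite under separation
    fit = sm.Logit(tgt, x).fit_regularized(method='l1', alpha=1.0, disp=0)
    print(fit.predict(x))  # training predictions
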
{
@@ -944,9 +950,9 @@
"outputs": [],
"source": [
"classifier_parade = \\\n",
" {'LogReg(1)' : linear_model.LogisticRegression(max_iter=1000),\n",
" 'LogReg(2)' : linear_model.SGDClassifier(loss='log',\n",
" max_iter=1000),\n",
" {'LogReg(1)' : linear_model.LogisticRegression(max_iter=10000),\n",
" 'LogReg(2)' : linear_model.SGDClassifier(loss='log_loss',\n",
" max_iter=10000),\n",
"\n",
" 'QDA' : discriminant_analysis.QuadraticDiscriminantAnalysis(),\n",
" 'LDA' : discriminant_analysis.LinearDiscriminantAnalysis(),\n",
@@ -986,6 +992,13 @@
"ax.set_ylabel('Accuracy')\n",
"ax.legend(loc='lower center', ncol=2);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {},
446 changes: 391 additions & 55 deletions 09_MoreRegressionMethods_code.ipynb

Large diffs are not rendered by default.

39 changes: 27 additions & 12 deletions 10_Manual_Feature_Engineering_code.ipynb
@@ -101,7 +101,11 @@
"outputs": [],
"source": [
"plt.subplots(1,1,figsize=(4,3))\n",
"ax = sns.distplot(iris_df['sepal length'], hist=False, rug=True)\n",
"\n",
"sns.rugplot(iris_df['sepal length'])\n",
"ax = sns.kdeplot(iris_df['sepal length'])\n",
"\n",
"ax.set_ylim(bottom=0.0)\n",
"ax.set_ylabel(\"Approximate %\");"
]
},
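
seaborn deprecated distplot in 0.11; the replacement splits it into its two ingredients, kdeplot for the density curve and rugplot for the tick marks. A self-contained sketch with a hypothetical single-column frame in place of iris_df:

    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    iris_df = pd.DataFrame({'sepal length': [4.9, 5.8, 6.3, 7.1]})

    fig, ax = plt.subplots(1, 1, figsize=(4, 3))
    sns.kdeplot(iris_df['sepal length'], ax=ax)
    sns.rugplot(iris_df['sepal length'], ax=ax)
    ax.set_ylim(bottom=0.0)
    ax.set_ylabel("Approximate %")
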
@@ -113,10 +117,10 @@
"source": [
"# apply binary threshold to numeric with sklearn is tricky\n",
"column = iris_df[['sepal length']] # keep 2Dness b/c sk complains\n",
"col_mean = column.mean().values # and sk fails with Series/DF\n",
"col_mean = column.mean().values[0] # and sk fails with Series/DF\n",
"\n",
"both = column.copy()\n",
"both['> Mean'] = skpre.binarize(column, col_mean).astype(np.bool)\n",
"both['> Mean'] = skpre.binarize(column, threshold=col_mean).astype(np.bool_)\n",
"\n",
"print('Column Mean:', col_mean)\n",
"display(both.iloc[[0,50,100]])"
@@ -164,7 +168,7 @@
" new_iris_df.columns], \n",
" [[1, 0, 0], [0,1,2]])\n",
"\n",
"new_iris_df.sort_index(axis='columns', inplace=True)\n",
"new_iris_df = new_iris_df.sort_index(axis='columns')\n",
"display(new_iris_df.iloc[[0,50,100]])"
]
},
@@ -468,8 +472,8 @@
"subset = iris_df.loc[[0, 50, 100], ['sepal length', 'sepal width']]\n",
"new_terms = pd.DataFrame(quad_inters.fit_transform(subset), \n",
" index=[0, 50, 100])\n",
"new_terms.set_axis(['sep length', 'sep width', 'sep area'], \n",
" axis=1, inplace=True)\n",
"new_terms = new_terms.set_axis(['sep length', 'sep width', 'sep area'], \n",
" axis=1)\n",
"\n",
"# note: creating the interaction *also* \n",
"# includes the base terms in the interaction\n",
@@ -695,7 +699,7 @@
" .fit()\n",
" .predict())\n",
" actual = comparison[variable]\n",
" sns.distplot(predicted - actual, norm_hist=True, rug=True, ax=ax)\n",
"\n",
" sns.histplot(predicted - actual, kde=True, stat='density', ax=ax)\n",
" sns.rugplot(predicted-actual)\n",
"\n",
" ax.set_ylim(bottom=0.0)\n",
" ax.set_xlabel(variable)\n",
" ax.set_ylabel('residual')\n",
"fig.tight_layout();"
@@ -719,8 +727,10 @@
" .fit()\n",
" .predict())\n",
"actual = comparison['d2']\n",
"sns.distplot(predicted - actual, rug=True, \n",
" norm_hist = True, ax=ax2)\n",
"#sns.distplot(predicted - actual, rug=True, \n",
"# norm_hist = True, ax=ax2)\n",
"sns.histplot(predicted - actual, kde=True, stat='density', ax=ax)\n",
"sns.rugplot(predicted-actual)\n",
"\n",
"ax2.set_title('histogram')\n",
"ax2.set_xlim(-3,3)\n",
@@ -770,7 +780,11 @@
" .fit()\n",
" .predict())\n",
" actual = comparison[variable]\n",
" sns.distplot(predicted - actual, norm_hist=True, rug=True, ax=ax)\n",
" # FIXME:\n",
" # sns.distplot(predicted - actual, norm_hist=True, rug=True, ax=ax)\n",
" sns.histplot(predicted - actual, kde=True, stat='density', ax=ax)\n",
" sns.rugplot(predicted-actual)\n",
" \n",
" ax.set_xlabel(variable)\n",
" ax.set_ylabel('residual')\n",
"\n",
@@ -795,7 +809,8 @@
" .fit()\n",
" .predict())\n",
"actual = magic['log_d2']\n",
"sns.distplot(predicted - actual, rug=True, ax=ax2)\n",
"sns.histplot(predicted - actual, stat='density', kde=True, ax=ax2)\n",
"sns.rugplot(predicted - actual, ax=ax2)\n",
"\n",
"ax2.set_title('histogram')\n",
"ax2.set_xlim(-.7, .7)\n",
@@ -829,5 +844,5 @@
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
6 changes: 3 additions & 3 deletions 11_Tuning_and_Pipelines_code.ipynb
@@ -238,7 +238,7 @@
"knn = neighbors.KNeighborsClassifier()\n",
"grid_knn = skms.GridSearchCV(knn, \n",
" param_grid = param_grid, \n",
" iid=False, cv=3)"
" cv=3)"
]
},
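
GridSearchCV's iid argument was deprecated in scikit-learn 0.22 and removed in 0.24, so the fix throughout these notebooks is simply to drop it. Sketch (the grid values are placeholders):

    from sklearn import model_selection as skms
    from sklearn import neighbors

    param_grid = {'n_neighbors': [1, 3, 5, 10, 20]}  # hypothetical grid
    knn = neighbors.KNeighborsClassifier()
    grid_knn = skms.GridSearchCV(knn, param_grid=param_grid, cv=3)
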
{
@@ -283,7 +283,7 @@
"knn = neighbors.KNeighborsClassifier()\n",
"grid_knn = skms.GridSearchCV(knn, \n",
" param_grid = param_grid, \n",
" iid=False, cv=2)\n",
" cv=2)\n",
"\n",
"outer_scores = skms.cross_val_score(grid_knn,\n",
" iris.data, \n",
@@ -432,7 +432,7 @@
"outputs": [],
"source": [
"# iid to silence warning\n",
"mod = skms.GridSearchCV(pipe, param_grid, iid=False, n_jobs=-1)\n",
"mod = skms.GridSearchCV(pipe, param_grid, n_jobs=-1)\n",
"mod.fit(diabetes.data, diabetes.target);"
]
},
7 changes: 5 additions & 2 deletions 12_Combining_Learners_Ensemble_Methods_code.ipynb
@@ -28,7 +28,7 @@
"metadata": {},
"outputs": [],
"source": [
"base_estimators = [linear_model.LogisticRegression(),\n",
"base_estimators = [linear_model.LogisticRegression(max_iter=10000),\n",
" tree.DecisionTreeClassifier(max_depth=3),\n",
" naive_bayes.GaussianNB()]\n",
"base_estimators = [(get_model_name(m), m) for m in base_estimators]\n",
@@ -273,8 +273,8 @@
"source": [
"AdaBC = ensemble.AdaBoostClassifier\n",
"GradBC = ensemble.GradientBoostingClassifier\n",
"# NOTE: loss=\"deviance\" deprecated\n",
"# see: https://github.com/scikit-learn/scikit-learn/pull/23036\n",
"# and: https://github.com/scikit-learn/scikit-learn/issues/18248\n",
"boosted_classifiers = {'boost(Ada)' : AdaBC(learning_rate=2.0),\n",
" 'boost(Grad)' : GradBC(loss=\"deviance\")}\n",
" 'boost(Grad)' : GradBC(loss=\"log_loss\")}\n",
"mean_accs = {}\n",
"for name, model in boosted_classifiers.items():\n",
" model.set_params(n_estimators=max_est)\n",
13 changes: 3 additions & 10 deletions 13_Feature_Engineering_II_Automated_code.ipynb
@@ -504,7 +504,7 @@
"pipe = pipeline.make_pipeline(ftrsel, linear_model.LogisticRegression())\n",
"\n",
"param_grid = {'selectpercentile__percentile':[5,10,15,20,25]}\n",
"grid = skms.GridSearchCV(pipe, param_grid=param_grid, cv=3, iid=False)\n",
"grid = skms.GridSearchCV(pipe, param_grid=param_grid, cv=3)\n",
"grid.fit(wine.data, wine.target)\n",
"\n",
"print(grid.best_params_)\n",
@@ -770,7 +770,7 @@
"svc = svm.SVC(kernel='rbf')\n",
"\n",
"grid_model = skms.GridSearchCV(svc, param_grid = param_grid, \n",
" cv=10, iid=False)\n",
" cv=10)\n",
"grid_model.fit(digits.data, digits.target);"
]
},
@@ -1077,7 +1077,7 @@
"\n",
"cmap = plt.cm.Spectral\n",
"fig = plt.figure(figsize=(4,4))\n",
"ax = plt.gca(projection='3d')\n",
"ax = fig.add_subplot(projection='3d')\n",
"ax.scatter(*data_3d.T, c=color, cmap=cmap)\n",
"ax.view_init(20, -50)"
]
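
Passing keyword arguments such as projection to plt.gca() was deprecated in matplotlib 3.4 and removed in 3.6; 3-D axes are now requested explicitly from the figure. Sketch:

    import matplotlib.pyplot as plt

    fig = plt.figure(figsize=(4, 4))
    # plt.gca(projection='3d') no longer works; ask the figure instead
    ax = fig.add_subplot(projection='3d')
    ax.view_init(20, -50)
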
@@ -1103,13 +1103,6 @@
"data_2d = tsne.fit_transform(data_3d)\n",
"axes[1].scatter(*data_2d.T, c=color, cmap=cmap);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {},
