In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
heart_disease = pd.read_csv("../data/heart-disease.csv")
heart_disease.head()

In [None]:
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
model = RandomForestClassifier(n_jobs=-1).fit(X_train, y_train)
y_preds = model.predict(X_test)

In [None]:
model.score(X_train, y_train)

In [None]:
model.score(X_test, y_test)

In [None]:
# Import LinearSVC from sklearn's svm module
from sklearn.svm import LinearSVC

linear_scv = LinearSVC(dual="auto").fit(X_train, y_train)
linear_scv.score(X_test, y_test)

In [None]:
# Import KNeighborsClassifier from sklearn's neighbors module
from sklearn.neighbors import KNeighborsClassifier

nbrs = KNeighborsClassifier().fit(X_train, y_train)
nbrs.score(X_test, y_test)

In [None]:
# Import SVC from sklearn's svm module
from sklearn.svm import SVC

svc = SVC().fit(X_train, y_train)
svc.score(X_test, y_test)

In [None]:
# Import LogisticRegression from sklearn's linear_model module
from sklearn.linear_model import LogisticRegression

logistic_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
logistic_reg.score(X_test, y_test)

In [None]:
score_dict = {
    "RandomForestClassifier": RandomForestClassifier(),
    "LinearSVC": LinearSVC(dual="auto"),
    "KneighborsClassifier": KNeighborsClassifier(),
    "SVC": SVC(),
    "LogisticRegression": LogisticRegression(max_iter=1000),
}

results_dict = {}

for name, model in score_dict.items():
    model.fit(X_train, y_train)
    results_dict[name] = model.score(X_test, y_test)
    
    
print(results_dict)

In [None]:
results_df = pd.DataFrame(data=results_dict.values(), index=results_dict.keys(), columns=["Accuracy"])
results_df

In [None]:
results_df.plot.bar(figsize=(10, 6));

In [None]:
LogisticRegression().get_params()

In [None]:
log_reg_grid = {
    "C": np.logspace(-4, 4, 20),
    "max_iter": [6000, 7000, 6500],
    "random_state": [25, 30, 35, 40,],
}

In [None]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
log_reg.score(X_test, y_test)

In [None]:
rs_log_reg = RandomizedSearchCV(log_reg, log_reg_grid, cv=5, n_iter=20, verbose=1).fit(X_train, y_train)

In [None]:
rs_log_reg.score(X_test, y_test)

In [None]:
rs_log_reg.best_params_

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, precision_score, f1_score, recall_score, roc_curve, auc

In [None]:
y_preds = rs_log_reg.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_preds)

In [None]:
# Plot the confusion matrix
import seaborn as sn

plt.figure(figsize=(10, 7))
sn.heatmap(cm, annot=True, fmt="g", xticklabels=["0", "1"], yticklabels=["0", "1"]);

In [None]:
print(classification_report(y_test, y_preds))

In [None]:
precision_score(y_test, y_preds)

In [None]:
recall_score(y_test, y_preds)

In [None]:
f1_score(y_test, y_preds)

In [None]:
y_proba = rs_log_reg.predict_proba(X_test)

In [None]:
fpr, tpr, threshold = roc_curve(y_test, y_proba[:, 1])

In [None]:
roc_auc = auc(fpr, tpr)

In [None]:
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot();

In [None]:
from sklearn.model_selection import cross_val_score
cvs = cross_val_score(rs_log_reg, X, y, cv=5, scoring="accuracy")
cvs

In [None]:
cvs.mean()

In [None]:
cvs_precision = cross_val_score(rs_log_reg, X, y, cv=5, scoring="precision")
cvs_precision.mean()

In [None]:
cvs_recall = cross_val_score(rs_log_reg, X, y, cv=5, scoring="recall")
cvs_recall.mean()

In [None]:
cvs_f1 = cross_val_score(rs_log_reg, X, y, cv=5, scoring="f1")
cvs_f1.mean()

# Regression practice

In [2]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
car_sales = pd.read_csv("../data/car-sales-extended-missing-data.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [4]:
car_sales.info

<bound method DataFrame.info of        Make Colour  Odometer (KM)  Doors    Price
0     Honda  White        35431.0    4.0  15323.0
1       BMW   Blue       192714.0    5.0  19943.0
2     Honda  White        84714.0    4.0  28343.0
3    Toyota  White       154365.0    4.0  13434.0
4    Nissan   Blue       181577.0    3.0  14043.0
..      ...    ...            ...    ...      ...
995  Toyota  Black        35820.0    4.0  32042.0
996     NaN  White       155144.0    3.0   5716.0
997  Nissan   Blue        66604.0    4.0  31570.0
998   Honda  White       215883.0    4.0   4001.0
999  Toyota   Blue       248360.0    4.0  12732.0

[1000 rows x 5 columns]>

In [5]:
car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [6]:
car_sales.dropna(subset="Price", inplace=True)

In [7]:
car_sales.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [8]:
cat_imputer = make_pipeline((SimpleImputer(strategy="constant", fill_value="missing")),
                       (OneHotEncoder()))
door_imputer = make_pipeline((SimpleImputer(strategy="constant", fill_value=4)))
odometer_imputer = make_pipeline((SimpleImputer(strategy="mean")))

preprocessor = make_column_transformer((cat_imputer, ["Make", "Colour"]),
                                      (door_imputer, ["Doors"]),
                                      (odometer_imputer, ["Odometer (KM)"]), remainder="passthrough")

In [9]:
regression_models = {
    "Ridge": Ridge(),
    "SVR_Linear": SVR(kernel="linear"),
    "SVR_kbf": SVR(),
    "RandomForest": RandomForestRegressor()
}

regression_results = {}

In [10]:
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((760, 4), (190, 4), (760,), (190,))

In [12]:
for model_name, model in regression_models.items():
    model_pipeline = make_pipeline((preprocessor),
                                  (model))
    model_pipeline.fit(X_train, y_train)
    regression_results[model_name] = model_pipeline.score(X_test, y_test)

In [13]:
regression_results

{'Ridge': 0.4597349015905994,
 'SVR_Linear': 0.11459181968202481,
 'SVR_kbf': -0.040280948783683046,
 'RandomForest': 0.3619099486424797}