## Supervised Learning with scikit-learn

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from datetime import datetime

import warnings
warnings.filterwarnings(action='ignore')

### 1) Classification 

In [5]:
# k-nearest neighbors: classify churn from the day and evening charges.
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): churn_df is not created in any cell visible here; it must be
# loaded by an earlier cell before this one can run.
X = churn_df[['total_day_charge','total_eve_charge']].values
y = churn_df['churn'].values
print(X.shape, y.shape)

knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X, y)

# Every new observation needs exactly two values, in the same column order as
# X. Note the decimal point in 56.8: a comma there would add a third value to
# the first row and make the array ragged.
X_new = np.array([[56.8, 17.5],
                  [24.4, 24.1],
                  [50.1, 10.9]])
print(X_new.shape)

# The name assigned here must match the name formatted into the message below.
predictions = knn.predict(X_new)
print('Predictions: {}'.format(predictions))

- Measuring model performance

In [None]:
# Train/test split, a baseline KNN score, then accuracy for k = 1..25.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
# stratify=y keeps the class proportions of y the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21, stratify=y)

knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))

# Model complexity and over/underfitting: record train and test accuracy for
# each neighbor count so the two curves can be compared afterwards.
train_accuracies = {}
test_accuracies = {}
neighbors = np.arange(1, 26)
for neighbor in neighbors:
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, y_train)
    train_accuracies[neighbor] = knn.score(X_train, y_train)
    test_accuracies[neighbor] = knn.score(X_test, y_test)

In [None]:
# Training vs. testing accuracy as a function of the number of neighbors.
plt.figure(figsize=(8,6))
plt.title("KNN: Varying Number of Neighbors")
# Look each accuracy up by its key so the y-values are plain lists aligned
# with `neighbors`, rather than dict views whose handling by plt.plot is not
# guaranteed.
plt.plot(neighbors, [train_accuracies[k] for k in neighbors], label='Training Accuracy')
plt.plot(neighbors, [test_accuracies[k] for k in neighbors], label='Testing Accuracy')
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

### 2) Regression

In [None]:
# Read the diabetes data and print its first rows as a quick sanity check.
diabetes_path = 'diabetes.csv'
diabetes_df = pd.read_csv(diabetes_path)
print(diabetes_df.head())

In [None]:
# Target vector: the glucose column. Feature matrix: every other column.
target_col = 'glucose'
y = diabetes_df[target_col].values
X = diabetes_df.drop(target_col, axis=1).values
print(type(X), type(y))

In [None]:
# Keep a single feature column for a one-variable regression.
# NOTE(review): assumes column index 3 of X holds BMI — TODO confirm against
# the column order of diabetes_df once 'glucose' has been dropped.
X_bmi = X[:, 3]
print(y.shape, X_bmi.shape)

In [None]:
# scikit-learn expects the features as a 2-D array, so NumPy's reshape is used
# to turn the 1-D column into a single-column matrix of shape (n_samples, 1).
# The column is sliced from X here so the cell does not depend on an earlier
# X_bmi and gives the same result when re-run.
# NOTE(review): assumes column index 3 of X holds BMI — TODO confirm.
X_bmi = X[:, 3].reshape(-1, 1)
print(X_bmi.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a one-feature linear model and predict on the same points it was fit on.
reg = LinearRegression().fit(X_bmi, y)
predictions = reg.predict(X_bmi)

# Observed values as a scatter, fitted values as a line on top.
plt.scatter(X_bmi, y)
plt.plot(X_bmi, predictions)
plt.show()

#### linear regression
- OLS(Ordinary Least Squares) : RSS(잔차제곱합)을 최소화하는 것을 목표로함

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
# Unpacking it in any other order silently pairs features with the wrong
# arrays and breaks the fit below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Linear regression on all features, then predictions for the held-out rows.
reg_all = LinearRegression()
reg_all.fit(X_train, y_train)
y_pred = reg_all.predict(X_test)

In [None]:
# Linear regression metrics: R-squared and MSE (mean squared error).
from sklearn.metrics import mean_squared_error

# Root mean squared error, in the units of the target. Taking the square root
# explicitly works across scikit-learn versions; the `squared=False` shortcut
# was deprecated in scikit-learn 1.4 and removed in later releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

#### cross-validation

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# Six shuffled folds with a fixed seed so the fold assignment is repeatable.
kf = KFold(n_splits=6, shuffle=True, random_state=42)
reg = LinearRegression()

# One score per fold, then the mean and spread across the folds.
cv_results = cross_val_score(estimator=reg, X=X, y=y, cv=kf)
print(cv_results)
fold_mean = np.mean(cv_results)
fold_std = np.std(cv_results)
print(fold_mean, fold_std)

#### Regularized regression

In [None]:
# Ridge regression: test-set score for a range of penalty strengths.
from sklearn.linear_model import Ridge

ridge_alphas = [0.1, 1.0, 10.0, 100.0, 1000.0]
scores = []
for alpha in ridge_alphas:
    # fit() returns the estimator itself, so construction and fitting chain.
    ridge = Ridge(alpha=alpha).fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    scores += [ridge.score(X_test, y_test)]
print(scores)

In [None]:
# Lasso regression: test-set score for a range of penalty strengths.
from sklearn.linear_model import Lasso

lasso_alphas = [0.01, 1.0, 10.0, 20.0, 50.0]
scores = []
for alpha in lasso_alphas:
    # fit() returns the estimator itself, so construction and fitting chain.
    lasso = Lasso(alpha=alpha).fit(X_train, y_train)
    y_pred = lasso.predict(X_test)
    scores += [lasso.score(X_test, y_test)]
print(scores)

In [None]:
# Lasso for feature selection: fit on all rows and plot one coefficient per
# feature; coefficients shrunk to zero mark features the model ignores.
from sklearn.linear_model import Lasso

# Build X and the bar labels from the same frame so they cannot get out of step.
features_df = diabetes_df.drop('glucose', axis=1)
X = features_df.values
y = diabetes_df['glucose'].values
names = features_df.columns

lasso = Lasso(alpha=0.1)
lasso_coef = lasso.fit(X, y).coef_
plt.bar(names, lasso_coef)
plt.xticks(rotation=45)
plt.show()

### 3) Fine-Tuning your model

#### classification_report, confusion_matrix

In [1]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): X and y are whatever the most recently run cell left behind.
# The preceding cell sets y to the glucose column, which is not a class label —
# confirm that X/y hold the classification data before running this cell.
knn = KNeighborsClassifier(n_neighbors=7)
# train_test_split returns (X_train, X_test, y_train, y_test); the test
# features must be bound to X_test because that is the name used below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=21)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

#### Logistic regression & ROC curve

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fit a logistic regression on a 70/30 split and predict the held-out labels.
logreg = LogisticRegression()
# The test features must be bound to X_test (capital X): that is the name the
# predict call below, and the later probability cell, read.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [None]:
# Predicted probabilities: predict_proba gives one column per class, and the
# column at index 1 is kept as the score for the ROC computations.
class_probs = logreg.predict_proba(X_test)
y_pred_probs = class_probs[:, 1]

In [None]:
from sklearn.metrics import roc_curve

# roc_curve yields three arrays: false positive rates, true positive rates,
# and the decision thresholds they were computed at.
roc_points = roc_curve(y_test, y_pred_probs)
fpr, tpr, thresholds = roc_points

In [None]:
# Area under the ROC curve. scikit-learn's function for this is roc_auc_score;
# it returns a single float, so there is nothing to unpack.
from sklearn.metrics import roc_auc_score

print(roc_auc_score(y_test, y_pred_probs))

#### Hyperparameter tuning

In [None]:
# Grid search: try every alpha/solver combination with 5-fold cross-validation.
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
# linspace(start, stop, num) gives 10 evenly spaced alphas in [0.0001, 1].
# arange(start, stop, step) with a step of 10 would produce only the start
# value, leaving nothing to search over.
param_grid = {'alpha': np.linspace(0.0001, 1, 10), 'solver': ['sag', 'lsqr']}
ridge = Ridge()
ridge_cv = GridSearchCV(ridge, param_grid, cv=kf)
ridge_cv.fit(X_train, y_train)
print(ridge_cv.best_params_, ridge_cv.best_score_)

In [None]:
# Randomized search: sample n_iter parameter combinations instead of trying all.
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, RandomizedSearchCV

kf = KFold(n_splits=5, shuffle=True, random_state=42)
# linspace(start, stop, num) gives 10 evenly spaced alphas in [0.0001, 1].
# arange(start, stop, step) with a step of 10 would produce only the start
# value, so the "random" search would have a single alpha to draw from.
param_grid = {'alpha': np.linspace(0.0001, 1, 10), 'solver': ['sag', 'lsqr']}
ridge = Ridge()
# random_state fixes which combinations are sampled so re-runs are repeatable.
ridge_cv = RandomizedSearchCV(ridge, param_grid, cv=kf, n_iter=2, random_state=42)
ridge_cv.fit(X_train, y_train)
print(ridge_cv.best_params_, ridge_cv.best_score_)

In [3]:
from sklearn.model_selection import RandomizedSearchCV

### 4) Preprocessing data

##### Dealing with categorical features in Python
- scikit-learn: OneHotEncoder()
- pandas: get_dummies()

In [None]:
# Load the music data and one-hot encode the genre column. drop_first=True
# leaves out the first category's indicator column.
music_df = pd.read_csv('music.csv')
genre_col = music_df['genre']
music_dummies = pd.get_dummies(genre_col, drop_first=True)

In [None]:
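# Negative MSE: with scoring="neg_mean_squared_error", cross_val_score returns
# negated MSE values; flip the sign before taking the square root to get RMSE:
#print(np.sqrt(-linreg_cv))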

#### missing data

In [None]:
# --- Dropping missing data ---
# Drop rows only for columns where a small share of the values is missing.
# The result goes into a new frame so music_df stays intact for the imputation
# example below.
# NOTE(review): the 5% cut-off is a rule of thumb chosen here — adjust it, or
# replace the computed list with explicit column names.
missing_share = music_df.isna().mean()
sparse_missing_cols = missing_share[(missing_share > 0) & (missing_share < 0.05)].index.tolist()
music_dropped = music_df.dropna(subset=sparse_missing_cols)
print(music_dropped.isna().sum().sort_values())

# --- Imputing values ---
# Available strategies include mean, median and most_frequent.
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

X_cat = music_df['genre'].values.reshape(-1,1)
X_num = music_df.drop(['genre','popularity'], axis=1).values
y = music_df['popularity'].values

# train_test_split returns (X_train, X_test, y_train, y_test). Both calls use
# the same test_size and random_state, so they pick the same rows and the
# categorical and numeric blocks stay aligned row for row.
X_train_cat, X_test_cat, y_train, y_test = train_test_split(
    X_cat, y, test_size=0.2, random_state=12)
X_train_num, X_test_num, y_train, y_test = train_test_split(
    X_num, y, test_size=0.2, random_state=12)

# Categorical column: fill gaps with the most frequent value. The imputer is
# fitted on the training rows only and then applied to the test rows.
imp_cat = SimpleImputer(strategy='most_frequent')
X_train_cat = imp_cat.fit_transform(X_train_cat)
X_test_cat = imp_cat.transform(X_test_cat)

# Numeric columns: SimpleImputer's default strategy, fitted on the numeric
# training block (not the categorical one).
imp_num = SimpleImputer()
X_train_num = imp_num.fit_transform(X_train_num)
X_test_num = imp_num.transform(X_test_num)

# Put the imputed numeric and categorical blocks back together column-wise.
X_train = np.append(X_train_num, X_train_cat, axis=1)
X_test = np.append(X_test_num, X_test_cat, axis=1)

In [None]:
# Pipeline: impute missing feature values, then fit a logistic regression.
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Rows with no genre cannot be labelled, so they are dropped; missing feature
# values are left for the imputer step.
# NOTE(review): extend `subset` if rows should be dropped for other columns too.
music_df = music_df.dropna(subset=['genre'])
# Recode the target column as 1 (Rock) or 0 (any other genre).
# NOTE: this replaces the original genre labels, so re-running this cell
# without reloading music_df would turn every label into 0.
music_df = music_df.assign(genre=np.where(music_df['genre']=='Rock',1,0))

X = music_df.drop('genre', axis=1).values
y = music_df['genre'].values

steps = [('imputation', SimpleImputer()),
         ('Logistic_regression', LogisticRegression())]
pipeline = Pipeline(steps)
# The test features must be bound to X_test, the name scored below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

#### centering and scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = music_df.drop('genre', axis=1).values
y = music_df['genre'].values
# The test features must be bound to X_test, the name transformed below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so no information from the test set leaks into the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Mean and standard deviation before scaling, then after scaling. Both values
# on the second line come from the same (training) array so they are comparable.
print(np.mean(X), np.std(X))
print(np.mean(X_train_scaled), np.std(X_train_scaled))