In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


# Seed NumPy's global RNG so the synthetic data below is identical on every run.
np.random.seed(0)
n = 15  # number of synthetic samples
# Inputs: n evenly spaced points on [0, 10], each jittered by standard-normal noise scaled by 1/5.
x = np.linspace(0,10,n) + np.random.randn(n)/5
# Targets: sin(x) + x/6 plus standard-normal noise scaled by 1/10.
# x has to be generated before y: both draw from the same seeded random stream.
y = np.sin(x)+x/6 + np.random.randn(n)/10


# Hold out part of the data for testing. No test_size is passed, so the library default applies;
# the functions below reshape the results to 11 training and 4 test samples.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

def part1_scatter():
    import matplotlib.pyplot as plt
    %matplotlib notebook
    plt.figure()
    plt.scatter(X_train, y_train, label='training data')
    plt.scatter(X_test, y_test, label='test data')
    plt.legend(loc=4);

In [65]:
def answer_one():
    """Fit polynomial regressions of degree 1, 3, 6 and 9 and predict on a grid.

    Each model is a LinearRegression trained on polynomial features of
    X_train / y_train (taken from the notebook namespace).

    Returns
    -------
    numpy.ndarray, shape (4, 100)
        Row i holds the predictions of the i-th degree in [1, 3, 6, 9] at
        100 evenly spaced points on [0, 10].
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    degrees = [1, 3, 6, 9]
    # reshape(-1, 1) turns a 1-D vector into a single-feature column
    # without hard-coding how many samples it contains.
    train_col = X_train.reshape(-1, 1)
    grid = np.linspace(0, 10, 100).reshape(-1, 1)

    predictions = np.zeros((len(degrees), grid.shape[0]))
    for row, degree in enumerate(degrees):
        poly = PolynomialFeatures(degree=degree)
        linreg = LinearRegression().fit(poly.fit_transform(train_col), y_train)
        # The expansion was already fitted on the training column; only
        # transform the grid instead of re-fitting on it.
        predictions[row, :] = linreg.predict(poly.transform(grid))

    return predictions

In [66]:
def answer_two():
    """Compute train and test R^2 for polynomial regressions of degree 0..9.

    For every degree a LinearRegression is fitted on polynomial features of
    X_train / y_train and scored on both the training and the test split
    (all taken from the notebook namespace).

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        (r2_train, r2_test), each of shape (10,), indexed by degree.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures
    # The former import of r2_score from the private module
    # sklearn.metrics.regression was unused and fails on scikit-learn
    # releases that no longer ship that module, so it is dropped.

    n_degrees = 10
    # reshape(-1, 1) builds single-feature columns for any split size.
    train_col = X_train.reshape(-1, 1)
    test_col = X_test.reshape(-1, 1)

    r2_train = np.zeros(n_degrees)
    r2_test = np.zeros(n_degrees)
    for degree in range(n_degrees):
        poly = PolynomialFeatures(degree=degree)
        train_poly = poly.fit_transform(train_col)
        linreg = LinearRegression().fit(train_poly, y_train)
        r2_train[degree] = linreg.score(train_poly, y_train)
        # Reuse the expansion fitted on the training data for the test data.
        r2_test[degree] = linreg.score(poly.transform(test_col), y_test)

    return (r2_train, r2_test)

In [67]:
def answer_three():
    """Pick underfitting, overfitting and best degrees from answer_two's scores.

    Uses the (r2_train, r2_test) arrays returned by answer_two() and inspects
    degrees 0..9.

    Returns
    -------
    tuple of (int, int, int)
        (underfitting, overfitting, best) where
        - underfitting is the degree with the lowest training score,
        - overfitting is the degree with the largest train-minus-test gap,
        - best is the degree with the smallest train-minus-test gap.
        On ties the lowest degree wins.
    """
    train_r2, test_r2 = answer_two()
    gap = train_r2 - test_r2
    degrees = range(10)

    # min()/max() keep the first extreme they meet, which matches a manual
    # scan that only updates on a strict improvement.
    lowest_train = min(degrees, key=lambda d: train_r2[d])
    widest_gap = max(degrees, key=lambda d: gap[d])
    # NOTE(review): the smallest gap can also belong to a model that scores
    # poorly on both splits - confirm this is the intended notion of "best".
    narrowest_gap = min(degrees, key=lambda d: gap[d])

    return (lowest_train, widest_gap, narrowest_gap)

In [68]:
def answer_four():
    """Compare plain and Lasso-regularised degree-12 polynomial regression.

    Both models are trained on degree-12 polynomial features of
    X_train / y_train and scored on the test split (all taken from the
    notebook namespace). The Lasso uses alpha=0.01 and max_iter=10000.

    Returns
    -------
    tuple of (float, float)
        (LinearRegression_R2_test_score, Lasso_R2_test_score).
    """
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import Lasso, LinearRegression
    # The former import of r2_score from the private module
    # sklearn.metrics.regression was unused and fails on scikit-learn
    # releases that no longer ship that module, so it is dropped.

    poly = PolynomialFeatures(degree=12)
    # reshape(-1, 1) builds single-feature columns for any split size.
    train_poly = poly.fit_transform(X_train.reshape(-1, 1))
    # Reuse the expansion fitted on the training data for the test data.
    test_poly = poly.transform(X_test.reshape(-1, 1))

    linreg = LinearRegression().fit(train_poly, y_train)
    LinearRegression_R2_test_score = linreg.score(test_poly, y_test)

    linlasso = Lasso(alpha=0.01, max_iter=10000).fit(train_poly, y_train)
    Lasso_R2_test_score = linlasso.score(test_poly, y_test)

    return (LinearRegression_R2_test_score, Lasso_R2_test_score)

In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


# Load the mushroom table from the notebook's working directory
# (the file must exist there, otherwise read_csv raises FileNotFoundError).
mush_df = pd.read_csv('mushrooms.csv')
# Expand the columns into indicator (dummy) columns.
mush_df2 = pd.get_dummies(mush_df)

# Features are every dummy column from position 2 onwards; the target is the column at position 1.
# NOTE(review): presumably positions 0 and 1 are the two indicator columns of the class label - verify against the CSV.
X_mush = mush_df2.iloc[:,2:]
y_mush = mush_df2.iloc[:,1]

# Split into train/test; no test_size is passed, so the library default applies.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_mush, y_mush, random_state=0)

# The "subset" names are aliases for the test split (no copy is made); answer_six runs on them.
X_subset = X_test2
y_subset = y_test2

FileNotFoundError: [Errno 2] File b'mushrooms.csv' does not exist: b'mushrooms.csv'

In [70]:
def answer_five():
    """Return the names of the five most important decision-tree features.

    A DecisionTreeClassifier is fitted on X_train2 / y_train2 (taken from
    the notebook namespace) and its feature_importances_ are ranked.

    Returns
    -------
    list
        Up to five column names of X_train2, most important first. Equal
        importances are ordered by column name, descending.
    """
    from sklearn.tree import DecisionTreeClassifier

    # Fix the seed, as the other estimators and splits in this notebook do,
    # so the fitted tree (and therefore the ranking) is reproducible.
    tree_clf = DecisionTreeClassifier(random_state=0).fit(X_train2, y_train2)

    # Rank (importance, name) pairs directly; this avoids coercing the mixed
    # pairs into a string array and also works when there are no features.
    ranked = sorted(
        zip(tree_clf.feature_importances_, X_train2.columns),
        reverse=True,
    )
    return [name for _, name in ranked[:5]]

In [71]:
def answer_six():
    """Compute an accuracy validation curve over gamma for an RBF SVC.

    Runs validation_curve with 3-fold cross-validation on X_subset /
    y_subset (taken from the notebook namespace) for the six gamma values
    np.logspace(-4, 1, 6), using SVC(kernel='rbf', C=1, random_state=0).

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        (mean training accuracy, mean test accuracy), each averaged over
        the folds and holding one entry per gamma value.
    """
    from sklearn.svm import SVC
    from sklearn.model_selection import validation_curve

    gammas = np.logspace(-4, 1, 6)
    classifier = SVC(kernel = 'rbf', C=1, random_state=0)

    fold_train, fold_test = validation_curve(
        classifier,
        X_subset,
        y_subset,
        param_name='gamma',
        param_range=gammas,
        cv=3,
        scoring='accuracy',
    )

    # Scores are laid out as (gamma, fold); average across the folds.
    mean_train = fold_train.mean(axis=1)
    mean_test = fold_test.mean(axis=1)
    return (mean_train, mean_test)

In [72]:
def answer_seven():
    """Pick underfitting, overfitting and best gamma from answer_six's scores.

    Uses the (mean train, mean test) score arrays returned by answer_six(),
    one entry per gamma, and inspects the first six entries.

    Returns
    -------
    tuple
        (underfitting, overfitting, best) gamma values, where
        - underfitting has the lowest training score,
        - overfitting has the largest train-minus-test gap,
        - best has the smallest train-minus-test gap.
        On ties the smallest gamma wins.
    """
    train_scores, test_scores = answer_six()
    gap = train_scores - test_scores
    positions = range(6)

    # Track positions, not gamma values: an extreme located at position 0
    # must still map to its gamma (10**-4) rather than to a placeholder 0.
    under_idx = min(positions, key=lambda i: train_scores[i])
    over_idx = max(positions, key=lambda i: gap[i])
    # NOTE(review): the smallest gap can also belong to a model that scores
    # poorly on both splits - confirm this is the intended notion of "best".
    best_idx = min(positions, key=lambda i: gap[i])

    def gamma_at(i):
        # Position i corresponds to gamma 10**(i-4), mirroring the
        # np.logspace(-4, 1, 6) grid used in answer_six.
        return 10 ** (i - 4)

    return (gamma_at(under_idx), gamma_at(over_idx), gamma_at(best_idx))