# Dataset

## Library

In [0]:
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
#from adspy_shared_utilities import load_crime_dataset

In [0]:
import numpy
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap, BoundaryNorm
from sklearn import neighbors
import matplotlib.patches as mpatches
import graphviz
from sklearn.tree import export_graphviz
import matplotlib.patches as mpatches
import urllib.request

def load_crime_dataset(url=None):
    """Load the Communities and Crime regression dataset.

    Source: https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime+Unnormalized

    Parameters
    ----------
    url : str, path or file-like, optional
        Location of the comma-separated data file (anything accepted by
        ``pandas.read_csv``). Defaults to the gist copy used by this
        notebook, so existing ``load_crime_dataset()`` calls are unchanged.

    Returns
    -------
    (X_crime, y_crime) : tuple of (pandas.DataFrame, pandas.Series)
        ``X_crime`` holds the selected feature columns; ``y_crime`` is the
        ``ViolentCrimesPerPop`` target. Rows with a missing value in any
        selected column are dropped.
    """
    if url is None:
        url = 'https://gist.githubusercontent.com/nicolomarana/5719411a4b7d8fde358b91fa3ade7514/raw/6415cc65e7bb63a8356295814a8fa29924439c8e/gistfile1.txt'

    # '?' marks a missing value in the raw file
    crime = pd.read_csv(url, na_values='?')

    # remove features with poor coverage or lower relevance, and keep
    # ViolentCrimesPerPop target column (selected last, at position 145)
    columns_to_keep = [5, 6] + list(range(11, 26)) + list(range(32, 103)) + [145]
    crime = crime.iloc[:, columns_to_keep].dropna()

    # every selected column except the trailing target column is a feature
    X_crime = crime.iloc[:, :-1]
    y_crime = crime['ViolentCrimesPerPop']

    return (X_crime, y_crime)

In [0]:
# Discrete four-colour palette: yellow, green, blue, black
cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])

## Communities and Crime dataset


In [0]:
# Feature matrix and ViolentCrimesPerPop target for the regression cells
X_crime, y_crime = load_crime_dataset()

In [15]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler

# Split with a fixed seed so the reported scores are reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_crime, y_crime, random_state=0
)

# Learn the min/max ranges from the training split only, then apply the
# same scaling to both splits
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Elastic net fit on the scaled training features
linreg = ElasticNet(alpha=3)
linreg.fit(X_train_scaled, y_train)

print('Crime dataset')
print('linear model intercept: {}'.format(linreg.intercept_))
print('linear model coeff:\n{}'.format(linreg.coef_))

print('R-squared score (training): {:.3f}'.format(
    linreg.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'.format(
    linreg.score(X_test_scaled, y_test)))


Crime dataset
linear model intercept: 677.5723754503518
linear model coeff:
[  2.61901217  -0.          -0.           1.60818963   0.
   0.23358636   2.68784394  18.18782645 -12.19901536 -10.64928646
  -5.33691352 -26.19232346   2.28388983  28.83089059  -2.27405126
 -13.06981053  -8.20781366   3.02947971  25.41133417  14.43932829
  22.17983961 -15.00166291  16.86018194 -11.35779084  -2.13552224
  -2.74921335  11.44918617 -14.87223217  27.55736789   9.87532302
  27.12270966  30.6969153    5.11743935 -36.67584998 -41.0227386
 -35.78296228 -31.64676179  -1.01646162  -6.79960623   3.53574605
  29.83670848   0.56824501   5.67473027   8.21025131  11.32045065
  14.29498217   7.42607284   8.20463396   9.18836953   9.64675209
  -9.58429955   8.64349572  13.0274513    8.49742612   0.
  -2.00622328   9.1064632  -27.9141984   12.23754473  20.07332672
 -16.12952051   5.21838807  -5.91803579 -24.24251856  12.73612643
   0.          -6.09148356  26.62044186   6.07014617  -5.63294613
  -6.38823457  -7

In [19]:
# Imported here, next to its only use; the file's top import cell does not
# bring LinearRegression into scope.
from sklearn.linear_model import LinearRegression

# Sweep the regularization strength and report, for each alpha, how many
# coefficients stay large and the R-squared on both splits.
print('ElasticNet regression: effect of alpha regularization parameter\n')
for this_alpha in [0, 0.09, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    if this_alpha == 0:
        # alpha = 0 is plain ordinary least squares. scikit-learn advises
        # against running ElasticNet's coordinate descent without
        # regularization (it converges poorly and emits warnings), so the
        # unregularized baseline is fitted with LinearRegression instead.
        linridge = LinearRegression()
    else:
        linridge = ElasticNet(alpha=this_alpha)
    linridge.fit(X_train_scaled, y_train)

    r2_train = linridge.score(X_train_scaled, y_train)
    r2_test = linridge.score(X_test_scaled, y_test)
    # number of coefficients whose magnitude exceeds 1.0
    num_coeff_bigger = np.sum(np.abs(linridge.coef_) > 1.0)
    print('Alpha = {:.2f}\nnum abs(coeff) > 1.0: {}, r-squared training: {:.2f}, r-squared test: {:.2f}\n'
         .format(this_alpha, num_coeff_bigger, r2_train, r2_test))

ElasticNet regression: effect of alpha regularization parameter

Alpha = 0.00
num abs(coeff) > 1.0: 86, r-squared training: 0.67, r-squared test: 0.50

Alpha = 0.09
num abs(coeff) > 1.0: 84, r-squared training: 0.57, r-squared test: 0.57

Alpha = 0.10
num abs(coeff) > 1.0: 85, r-squared training: 0.57, r-squared test: 0.57

Alpha = 0.20
num abs(coeff) > 1.0: 82, r-squared training: 0.52, r-squared test: 0.52

Alpha = 0.30
num abs(coeff) > 1.0: 83, r-squared training: 0.49, r-squared test: 0.49

Alpha = 0.40
num abs(coeff) > 1.0: 84, r-squared training: 0.46, r-squared test: 0.46

Alpha = 0.50
num abs(coeff) > 1.0: 84, r-squared training: 0.44, r-squared test: 0.43

Alpha = 0.60
num abs(coeff) > 1.0: 84, r-squared training: 0.42, r-squared test: 0.41

Alpha = 0.70
num abs(coeff) > 1.0: 84, r-squared training: 0.40, r-squared test: 0.39

Alpha = 0.80
num abs(coeff) > 1.0: 83, r-squared training: 0.38, r-squared test: 0.37



  This is separate from the ipykernel package so we can avoid doing imports until
  positive)
  positive)


Alpha = 0.90
num abs(coeff) > 1.0: 81, r-squared training: 0.36, r-squared test: 0.36

Alpha = 1.00
num abs(coeff) > 1.0: 80, r-squared training: 0.35, r-squared test: 0.34

