In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
import warnings


# Silence every warning category for the rest of the session
# (this also hides deprecation notices from the libraries used below).
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'whitegrid' style for all plots.
sns.set_style('whitegrid')

### Bike Share

Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv
	
	- instant: record index
	- dteday : date
	- season : season (1:spring, 2:summer, 3:fall, 4:winter)
	- yr : year (0: 2011, 1:2012)
	- mnth : month ( 1 to 12)
	- hr : hour (0 to 23)
	- holiday : whether the day is a holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
	- weekday : day of the week
	- workingday : 1 if the day is neither a weekend nor a holiday, otherwise 0.
	+ weathersit : 
		- 1: Clear, Few clouds, Partly cloudy
		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
		- 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
	- temp : Normalized temperature in Celsius. The values are divided by 41 (max)
	- atemp: Normalized feeling temperature in Celsius. The values are divided by 50 (max)
	- hum: Normalized humidity. The values are divided by 100 (max)
	- windspeed: Normalized wind speed. The values are divided by 67 (max)
	- casual: count of casual users
	- registered: count of registered users
	- cnt: count of total rental bikes including both casual and registered
    
    
#### Using the model, make predictions of the demand for bikes when the windspeed is minimum and maximum. Plot the ROC curve.

In [2]:
# Folder that holds the extracted Bike-Sharing-Dataset CSV files.
# Set the BIKE_DATA_DIR environment variable to run the notebook on another
# machine; the fallback is the folder originally hardcoded in this cell.
import os

BIKE_DATA_DIR = os.environ.get(
    'BIKE_DATA_DIR',
    'C:\\Users\\mpagrawa\\Desktop\\Training\\Data Science\\Acad\\BootCamp\\Sessions\\Session24\\Bike-Sharing-Dataset',
)

# Daily and hourly aggregates of the same rental data.
bike_day = pd.read_csv(os.path.join(BIKE_DATA_DIR, 'day.csv'))
bike_hour = pd.read_csv(os.path.join(BIKE_DATA_DIR, 'hour.csv'))

In [3]:
# Remove the 'instant' and 'cnt' columns from both frames, in place.
for frame in (bike_day, bike_hour):
    for column in ('instant', 'cnt'):
        del frame[column]

In [4]:
# Preview the first rows of the daily table after the column removal.
bike_day.head()

Unnamed: 0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered
0,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654
1,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670
2,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229
3,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454
4,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518


In [5]:
# Preview the first rows of the hourly table (same fields as the daily table, plus 'hr').
bike_hour.head()

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered
0,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13
1,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32
2,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27
3,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10
4,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1


In [10]:
# Print the directory scikit-learn uses to cache downloaded datasets.
# get_data_home is imported from the public `sklearn.datasets` namespace: the
# `sklearn.datasets.base` module path is private and is not importable in
# newer scikit-learn releases.
from sklearn.datasets import get_data_home
print(get_data_home())

C:\Users\mpagrawa\scikit_learn_data


In [13]:
from sklearn.datasets import fetch_mldata

In [22]:
# Fetch the 'MNIST original' dataset and list the keys of the returned object.
# NOTE(review): fetch_mldata depends on the mldata.org service and is no longer
# shipped in recent scikit-learn releases — confirm the pinned version, or
# migrate to fetch_openml (key names and label dtype differ, so later cells
# would need adjusting).
mnist = fetch_mldata('MNIST original')
mnist.keys()

dict_keys(['DESCR', 'COL_NAMES', 'target', 'data'])

In [23]:
# Show the column names recorded in the dataset's metadata.
mnist['COL_NAMES']

['label', 'data']

In [19]:
# Features: the dataset's 'data' array wrapped in a DataFrame (columns are integer positions).
X = pd.DataFrame(mnist['data'])
# Labels: the dataset's 'target' array, used as-is.
y = mnist['target']
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Display the label array.
y

array([0., 0., 0., ..., 9., 9., 9.])

In [26]:
# (number of samples, number of feature columns)
X.shape

(70000, 784)

In [31]:
# Show the raw feature values of the row labelled 1 as a flat NumPy array.
np.array(X.loc[1])

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  64,
       253, 255,  63,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        96, 205, 251, 253, 205, 111,   4,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression

In [33]:
# Hold out 30% of the samples for testing.
# The split is seeded so the accuracy and confusion matrix computed from it
# can be reproduced after a kernel restart; an unseeded split changes every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1234
)

In [35]:
# Baseline classifier: logistic regression with the library's default settings.
lm = LogisticRegression()

In [36]:
# Fit the baseline model on the training split (the cell displays the fitted estimator).
lm.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
# Score the baseline model on the held-out test split.
lm.score(X_test, y_test)

0.9124761904761904

In [38]:
# Coefficient table: the fitted coefficient matrix transposed so that each row
# is labelled with one of X's columns.
df_coef = pd.DataFrame(lm.coef_.T, index=X.columns)

In [40]:
# Predicted labels of the baseline model for the test split.
y_pred = lm.predict(X_test)

In [41]:
# Confusion matrix of true test labels (first argument) against the baseline predictions.
metrics.confusion_matrix(y_test, y_pred)

array([[1934,    1,    7,    5,    3,    6,   15,    4,   11,    0],
       [   0, 2284,   18,    5,    3,    4,    3,    4,   26,    3],
       [  20,   23, 1767,   41,   19,   11,   21,   29,   60,    4],
       [  10,   11,   48, 1871,    6,   54,    7,   12,   48,   29],
       [   2,   14,   10,    6, 1946,    4,   10,    7,   14,   58],
       [  14,    6,    9,   82,   21, 1625,   38,    9,   66,   29],
       [  17,    8,   22,    1,   19,   38, 2037,    2,   15,    2],
       [   8,   10,   36,    6,   25,    6,    2, 2027,    5,   77],
       [  24,   47,   32,   57,   16,   65,   18,    9, 1814,   42],
       [  14,    8,    9,   32,   68,   19,    1,   86,   22, 1857]],
      dtype=int64)

In [42]:
import time

# ShuffleSplit comes from sklearn.model_selection: the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import GridSearchCV, ShuffleSplit


def GridSearch_BestParam(X, y, clf, param_grid):
    """Run an exhaustive grid search and return the best parameter combination.

    Each candidate is evaluated on 10 random train/validation splits
    (20% validation each, fixed seed 1234). The wall-clock timestamps taken
    just before and just after the search are printed.

    Parameters
    ----------
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.
    clf : estimator to tune.
    param_grid : dict mapping parameter names to lists of candidate values.

    Returns
    -------
    dict
        ``best_params_`` of the fitted ``GridSearchCV``.
    """
    # The model_selection splitter takes the number of splits directly; the
    # sample count is inferred from the data when the search is fitted.
    cv_sets = ShuffleSplit(n_splits=10,
                           test_size=0.20,
                           random_state=1234)

    grid_search = GridSearchCV(clf,
                               param_grid=param_grid,
                               cv=cv_sets)
    start = time.time()
    print(start)
    grid_search.fit(X, y)
    end = time.time()
    print(end)
    return grid_search.best_params_

In [None]:
# Candidate settings: three values of C crossed with three solvers, L2 penalty only.
param_grid = dict(
    C=[0.001, 0.05, 0.1],
    penalty=['l2'],
    solver=['newton-cg', 'lbfgs', 'liblinear'],
)

# Search the grid on the training split, starting from the baseline estimator.
top_para = GridSearch_BestParam(X_train, y_train, lm, param_grid)

1547575295.963374


In [None]:
# Display the best parameter combination returned by the grid search.
top_para

In [None]:
# Tuned classifier: pass exactly the three searched settings on to the estimator.
tuned_settings = {name: top_para[name] for name in ('C', 'penalty', 'solver')}
lm1 = LogisticRegression(**tuned_settings)

In [None]:
# 10-fold cross-validated accuracy of the tuned model, computed on the full X / y
# (not only the training split).
scores = cross_val_score(lm1,X,y,scoring='accuracy', cv=10)
scores

In [None]:
# Fit the tuned model on the training split and predict the test split in one chain
# (fit returns the estimator itself). This overwrites the baseline's y_pred.
y_pred = lm1.fit(X_train, y_train).predict(X_test)

In [None]:
# One-vs-rest check for the digit 1 with a custom decision threshold:
# a sample is predicted positive (1) when the model's probability for class 1
# exceeds 0.35, otherwise negative (0).
# The model is multiclass, so these 0/1 predictions must be scored against a
# 0/1 version of the labels ("is this sample a 1?"), not the raw digit labels.
THRESHOLD = 0.35
POSITIVE_CLASS = 1

# Look the probability column up by class label instead of assuming its position.
positive_col = list(lm1.classes_).index(POSITIVE_CLASS)

y1_pred = np.where(lm1.predict_proba(X_test)[:, positive_col] > THRESHOLD, 1, 0)
y1_true = np.where(y_test == POSITIVE_CLASS, 1, 0)
print(metrics.classification_report(y1_true, y1_pred))