# Bike Sharing 

### Importing Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

### Importing Data

In [2]:
df = pd.read_csv('2017-Q1-Trips-History-Data 2.csv')

In [3]:
df.head()

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member Type
0,1048876,3/31/2017 23:59,4/1/2017 0:17,31213,17th & K St NW,31606,Potomac & Pennsylvania Ave SE,W20784,Registered
1,223449,3/31/2017 23:59,4/1/2017 0:03,31104,Adams Mill & Columbia Rd NW,31103,16th & Harvard St NW,W20825,Registered
2,423494,3/31/2017 23:58,4/1/2017 0:05,31627,M St & Delaware Ave NE,31614,11th & H St NE,W20773,Registered
3,687015,3/31/2017 23:57,4/1/2017 0:08,31404,9th & Upshur St NW,31281,8th & O St NW,W01307,Registered
4,257919,3/31/2017 23:57,4/1/2017 0:02,31602,Park Rd & Holmead Pl NW,31400,Georgia & New Hampshire Ave NW,W21760,Registered


In [4]:
df.isnull().sum()

Duration                0
Start date              0
End date                0
Start station number    0
Start station           0
End station number      0
End station             0
Bike number             0
Member Type             0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 646508 entries, 0 to 646507
Data columns (total 9 columns):
Duration                646508 non-null int64
Start date              646508 non-null object
End date                646508 non-null object
Start station number    646508 non-null int64
Start station           646508 non-null object
End station number      646508 non-null int64
End station             646508 non-null object
Bike number             646508 non-null object
Member Type             646508 non-null object
dtypes: int64(3), object(6)
memory usage: 44.4+ MB


In [6]:
df.head(n=2)

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member Type
0,1048876,3/31/2017 23:59,4/1/2017 0:17,31213,17th & K St NW,31606,Potomac & Pennsylvania Ave SE,W20784,Registered
1,223449,3/31/2017 23:59,4/1/2017 0:03,31104,Adams Mill & Columbia Rd NW,31103,16th & Harvard St NW,W20825,Registered


In [7]:
df['Bike number']= df['Bike number'].apply(lambda x: str(x)[1:])

In [8]:
df.head()

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member Type
0,1048876,3/31/2017 23:59,4/1/2017 0:17,31213,17th & K St NW,31606,Potomac & Pennsylvania Ave SE,20784,Registered
1,223449,3/31/2017 23:59,4/1/2017 0:03,31104,Adams Mill & Columbia Rd NW,31103,16th & Harvard St NW,20825,Registered
2,423494,3/31/2017 23:58,4/1/2017 0:05,31627,M St & Delaware Ave NE,31614,11th & H St NE,20773,Registered
3,687015,3/31/2017 23:57,4/1/2017 0:08,31404,9th & Upshur St NW,31281,8th & O St NW,1307,Registered
4,257919,3/31/2017 23:57,4/1/2017 0:02,31602,Park Rd & Holmead Pl NW,31400,Georgia & New Hampshire Ave NW,21760,Registered


In [9]:
df.tail()

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member Type
646503,1356956,1/1/2017 0:07,1/1/2017 0:29,31289,Henry Bacon Dr & Lincoln Memorial Circle NW,31222,New York Ave & 15th St NW,21945,Casual
646504,1327901,1/1/2017 0:07,1/1/2017 0:29,31289,Henry Bacon Dr & Lincoln Memorial Circle NW,31222,New York Ave & 15th St NW,20012,Casual
646505,1636768,1/1/2017 0:07,1/1/2017 0:34,31258,Lincoln Memorial,31270,8th & D St NW,22786,Casual
646506,1676854,1/1/2017 0:06,1/1/2017 0:34,31258,Lincoln Memorial,31270,8th & D St NW,894,Casual
646507,221834,1/1/2017 0:00,1/1/2017 0:04,31634,3rd & Tingey St SE,31208,M St & New Jersey Ave SE,869,Registered


### Preprocessing data 

In [10]:
df['Member Type'].unique()

array(['Registered', 'Casual'], dtype=object)

In [11]:
df['Member Type']=df['Member Type'].map({'Registered':1, 'Casual':2}).astype(int)

In [12]:
df['Bike number']=pd.to_numeric(df['Bike number'], errors='coerce')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 646508 entries, 0 to 646507
Data columns (total 9 columns):
Duration                646508 non-null int64
Start date              646508 non-null object
End date                646508 non-null object
Start station number    646508 non-null int64
Start station           646508 non-null object
End station number      646508 non-null int64
End station             646508 non-null object
Bike number             646494 non-null float64
Member Type             646508 non-null int64
dtypes: float64(1), int64(4), object(4)
memory usage: 44.4+ MB


In [14]:
df.head(n=10)

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member Type
0,1048876,3/31/2017 23:59,4/1/2017 0:17,31213,17th & K St NW,31606,Potomac & Pennsylvania Ave SE,20784.0,1
1,223449,3/31/2017 23:59,4/1/2017 0:03,31104,Adams Mill & Columbia Rd NW,31103,16th & Harvard St NW,20825.0,1
2,423494,3/31/2017 23:58,4/1/2017 0:05,31627,M St & Delaware Ave NE,31614,11th & H St NE,20773.0,1
3,687015,3/31/2017 23:57,4/1/2017 0:08,31404,9th & Upshur St NW,31281,8th & O St NW,1307.0,1
4,257919,3/31/2017 23:57,4/1/2017 0:02,31602,Park Rd & Holmead Pl NW,31400,Georgia & New Hampshire Ave NW,21760.0,1
5,253959,3/31/2017 23:56,4/1/2017 0:00,31123,14th & Girard St NW,31102,11th & Kenyon St NW,23036.0,1
6,402669,3/31/2017 23:56,4/1/2017 0:03,31263,10th & K St NW,31109,7th & T St NW,22539.0,1
7,744377,3/31/2017 23:56,4/1/2017 0:09,31236,37th & O St NW / Georgetown University,31917,N Lynn St & Fairfax Dr,20893.0,2
8,776845,3/31/2017 23:57,4/1/2017 0:10,31111,10th & U St NW,31404,9th & Upshur St NW,945.0,1
9,212823,3/31/2017 23:55,3/31/2017 23:58,31124,14th & Irving St NW,31117,15th & Euclid St NW,22998.0,1


In [15]:
df['Start date']=pd.to_datetime(df['Start date'])

In [16]:
df['End date']= pd.to_datetime(df['End date'])

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 646508 entries, 0 to 646507
Data columns (total 9 columns):
Duration                646508 non-null int64
Start date              646508 non-null datetime64[ns]
End date                646508 non-null datetime64[ns]
Start station number    646508 non-null int64
Start station           646508 non-null object
End station number      646508 non-null int64
End station             646508 non-null object
Bike number             646494 non-null float64
Member Type             646508 non-null int64
dtypes: datetime64[ns](2), float64(1), int64(4), object(2)
memory usage: 44.4+ MB


In [18]:
df.isnull().sum()

Duration                 0
Start date               0
End date                 0
Start station number     0
Start station            0
End station number       0
End station              0
Bike number             14
Member Type              0
dtype: int64

In [19]:
df['Bike number']=df['Bike number'].fillna(df['Bike number'].median())

### Spliting the data into dependent and Independent variables

In [20]:
y= df['Member Type'].values

In [21]:
X= df.drop(['Member Type','Start date','End date','Start station', 'End station'], axis=1).values

In [22]:
y.shape

(646508,)

In [23]:
X.shape

(646508, 4)

In [27]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=42)

### 1. knn 

In [28]:
knn= KNeighborsClassifier()

In [29]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [30]:
y_pred= knn.predict(X_test)

In [31]:
knn.score(X_test, y_test)

0.85249519213417679

In [32]:
confusion_matrix(y_test,y_pred)

array([[148880,   9960],
       [ 18649,  16464]])

In [33]:
classification_report(y_test,y_pred)

'             precision    recall  f1-score   support\n\n          1       0.89      0.94      0.91    158840\n          2       0.62      0.47      0.54     35113\n\navg / total       0.84      0.85      0.84    193953\n'

** Parameter tuning **

In [34]:
param_grid = {'n_neighbors': np.arange(1,10),
               'weights': ['uniform', 'distance']}

In [35]:
knn_cv = GridSearchCV(knn, param_grid)

In [37]:
knn_cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [38]:
y_pred1= knn_cv.predict(X_test)

In [77]:
knnscore=knn_cv.score(X_test, y_test)

In [78]:
knnscore

0.8628172804751667

In [40]:
knn_cv.best_params_

{'n_neighbors': 8, 'weights': 'uniform'}

In [41]:
confusion_matrix(y_test, y_pred1)

array([[152803,   6037],
       [ 20570,  14543]])

In [42]:
classification_report(y_test, y_pred1)

'             precision    recall  f1-score   support\n\n          1       0.88      0.96      0.92    158840\n          2       0.71      0.41      0.52     35113\n\navg / total       0.85      0.86      0.85    193953\n'

### 2. LogisticRegression

In [43]:
log = LogisticRegression()

In [44]:
log.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [45]:
y_pred2= log.predict(X_test)

In [46]:
log.score(X_test, y_test)

0.86523023619124217

** Parameter tuning**

In [47]:
param_grid = {'C':[0.001,0.01,0.1,1,10],
               'penalty':['l1','l2']}

In [48]:
log_cv= GridSearchCV(log, param_grid)

In [49]:
log_cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [50]:
y_pred3= log_cv.predict(X_test)

In [79]:
logscore=log_cv.score(X_test,y_test)

In [80]:
logscore

0.86531788629203987

In [52]:
log_cv.best_params_

{'C': 1, 'penalty': 'l1'}

### 3. RandomForest 

In [53]:
random = RandomForestClassifier()

In [54]:
random.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [55]:
y_pred4= random.predict(X_test)

In [56]:
random.score(X_test, y_test)

0.89868163936623824

** Parameter tuning**

In [60]:
param={'n_estimators':[120,200],
        'max_depth':[5,8,15],
          'max_features':['auto']}

In [61]:
random_cv= GridSearchCV(random, param)

In [62]:
random_cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [120, 200], 'max_depth': [5, 8, 15], 'max_features': ['auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [81]:
randomscore=random_cv.score(X_test, y_test)

In [82]:
randomscore

0.90056869447752808

In [64]:
random_cv.best_params_

{'max_depth': 15, 'max_features': 'auto', 'n_estimators': 200}

In [65]:
y_pred5 =random_cv.predict(X_test)

In [66]:
confusion_matrix(y_test,y_pred5)

array([[154670,   4170],
       [ 15115,  19998]])

### 4. SVM

In [67]:
svm = LinearSVC()

In [68]:
svm.fit(X_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [69]:
y_pred6 = svm.predict(X_test)

In [70]:
svm.score(X_test, y_test)

0.81896129474666546

In [71]:
param_grid ={'C':[0.001,0.01,0.1,1,10]}

In [72]:
svm_cv = GridSearchCV(svm, param_grid)

In [73]:
svm_cv.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [74]:
y_pred7 = svm_cv.predict(X_test)

In [83]:
svmscore=svm_cv.score(X_test, y_test)

In [84]:
svmscore

0.81896129474666546

In [76]:
svm_cv.best_params_

{'C': 10}

In [87]:
Models= pd.DataFrame({'Models':['Knn','Logistic', 'Randomforest', 'Svm'],
                     'Scores': [knnscore, logscore, randomscore, svmscore]
    
})

In [88]:
Models

Unnamed: 0,Models,Scores
0,Knn,0.862817
1,Logistic,0.865318
2,Randomforest,0.900569
3,Svm,0.818961


** lets select RandomForest model for prediction as it got more accuracy when compare to other models.**