In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from os import path 
from collections import Counter

import pandas as pd
import datetime

In [6]:
if not path.exists('data/transformed_data.csv'):
    
    def t(v):
        r = []
        for i in v:
            if i.isalpha():
                i = ord(i)
            r.append(str(i))
        return ''.join(r)

    df = pd.read_csv('data/20140711.CSV', dtype={
        'TripID': int, 
        'RouteID': str, 
        'StopID': str, 
        'StopName': str, 
        'WeekBeginning': str, 
        'NumberOfBoardings': str
    })
    
    df['RouteID'] = df['RouteID'].apply(t)
    
    df.to_csv('data/transformed_data.csv')

In [7]:
df = pd.read_csv('data/transformed_data.csv', dtype={
        'TripID': int, 
        'RouteID': str, 
        'StopID': str, 
        'StopName': str, 
        'WeekBeginning': str, 
        'NumberOfBoardings': str
    })

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,TripID,RouteID,StopID,StopName,WeekBeginning,NumberOfBoardings
0,0,23631,100,14156,181 Cross Rd,2013-06-30 00:00:00,1
1,1,23631,100,14144,177 Cross Rd,2013-06-30 00:00:00,1
2,2,23632,100,14132,175 Cross Rd,2013-06-30 00:00:00,1
3,3,23633,100,12266,Zone A Arndale Interchange,2013-06-30 00:00:00,2
4,4,23633,100,14147,178 Cross Rd,2013-06-30 00:00:00,1


In [9]:
df.describe(include = 'all')

Unnamed: 0.1,Unnamed: 0,TripID,RouteID,StopID,StopName,WeekBeginning,NumberOfBoardings
count,5483114.0,5483114.0,5483114.0,5483114.0,5483114,5483114,5483114.0
unique,,,194.0,4279.0,2437,54,286.0
top,,,300.0,16279.0,2 King William Rd,2014-03-02 00:00:00,1.0
freq,,,228373.0,29805.0,36369,111029,2212135.0
mean,2741556.0,26084.59,,,,,
std,1582839.0,17740.1,,,,,
min,0.0,79.0,,,,,
25%,1370778.0,11492.0,,,,,
50%,2741556.0,27121.0,,,,,
75%,4112335.0,45196.0,,,,,


In [10]:
X = df[['RouteID', 'StopID', 'WeekBeginning']]
X['WeekBeginning'] = X['WeekBeginning'].apply(lambda d: datetime.datetime.strptime(d, "%Y-%m-%d %H:%M:%S").timestamp())
X = X.values
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


array([['100', '14156', 1372543200.0],
       ['100', '14144', 1372543200.0],
       ['100', '14132', 1372543200.0],
       ...,
       ['500', '11084', 1382824800.0],
       ['500', '10902', 1382824800.0],
       ['500', '13277', 1382824800.0]], dtype=object)

In [11]:
#X_scaler = StandardScaler().fit_transform(X)
X_scaler = X

In [12]:
df['NumberOfBoardings'] = df['NumberOfBoardings'].apply(lambda v: float(v))
y = df['NumberOfBoardings'].values

# labels = ["None", "Low", "Mid", "High", "Higher", "Highest"]
(y, bins) = pd.cut(y, [0, 10, 50, 100, 500, 1000, float("inf")], labels=False, retbins=True)

In [13]:
Counter(y)

Counter({0: 5040406, 1: 420435, 2: 20190, 3: 2081, 4: 2})

In [14]:
bins

array([   0.,   10.,   50.,  100.,  500., 1000.,   inf])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_scaler, y, test_size=0.2, random_state=10)

In [16]:
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
print("Logistic Regression: ", model_lr.score(X_test, y_test) * 100)



Logistic Regression:  91.90551356300205


In [17]:
from sklearn.externals import joblib

joblib.dump(model_lr, 'data/lr.joblib')

['data/lr.joblib']

In [13]:
from sklearn.naive_bayes import GaussianNB

model_gnb = GaussianNB()
model_gnb.fit(X_train, y_train)
print("GaussianNB: ", model_gnb.score(X_test, y_test) * 100)

GaussianNB:  91.7566930476563


In [16]:
from sklearn.neural_network import MLPClassifier

model_mlp = MLPClassifier(hidden_layer_sizes=(1, 2), random_state=42)
model_mlp.fit(X_train, y_train)
print("MLPClassifier: ", model_mlp.score(X_test, y_test) * 100)

MLPClassifier:  91.90551356300205
