# $\Omega$ Pandas

> `pandas` is a Python package providing fast, flexible, and expressive data structures designed to make working with “relational” or “labeled” data both easy and intuitive. It aims to be the fundamental high-level building block for doing practical, real world data analysis in Python. Additionally, it has the broader goal of becoming the most powerful and flexible open source data analysis / manipulation tool available in any language. It is already well on its way toward this goal.

> `pandas` is well suited for many different kinds of data:

> * Tabular data with heterogeneously-typed columns, as in an SQL table or Excel spreadsheet
* Ordered and unordered (not necessarily fixed-frequency) time series data.
* Arbitrary matrix data (homogeneously typed or heterogeneous) with row and column labels
* Any other form of observational / statistical data sets. The data actually need not be labeled at all to be placed into a pandas data structure

For more tutorials, visit: https://pandas.pydata.org/pandas-docs/stable/tutorials.html

In [141]:
import pandas as pd
import numpy as np

In [142]:
# prepped_df = pd.read_csv('prepped.csv')
subway_df = pd.read_csv('subway.csv')

In [143]:
subway_df.head()

Unnamed: 0,Date,Time,Day,Station,Code,Min Delay,Min Gap,Bound,Line,Vehicle
0,2017-05-01,00:18,Monday,KIPLING STATION (ENTER,TUSC,0,0,W,BD,5251
1,2017-05-01,00:58,Monday,ISLINGTON STATION,MUSC,0,0,W,BD,5182
2,2017-05-01,01:07,Monday,UNION STATION (DOWNSVI,MUPAA,3,7,N,YU,5811
3,2017-05-01,01:18,Monday,MCCOWAN STATION,MRTO,4,9,S,SRT,3006
4,2017-05-01,01:42,Monday,CASTLE FRANK STATION,MUO,7,11,W,BD,5296


In [144]:
subway_df['Code'].unique()

array(['TUSC', 'MUSC', 'MUPAA', 'MRTO', 'MUO', 'EUBK', 'MUI', 'EUDO',
       'EUNT', 'MUDD', 'TRTC', 'EUVA', 'TUMVS', 'MUPR1', 'PUSO', 'MUIE',
       'MUIR', 'MRO', 'PUMEL', 'MUIS', 'MUPLB', 'EUPI', 'MUNCA', 'PUMST',
       'SUDP', 'SUCOL', 'MUIRS', 'SRDP', 'PUSTC', 'MUGD', 'SUO', 'PUMO',
       'ERDO', 'EUCD', 'PRSW', 'PUCSS', 'TUNIP', 'EUAL', 'MUD', 'MUSAN',
       'PUTDN', 'PUTIJ', 'SUSA', 'SUAP', 'PUSTS', 'ERTC', 'EUYRD',
       'TRNCA', 'PUSRA', 'MRCL', 'SUUT', 'PUSSW', 'TUSUP', 'TUOS', 'MUTD',
       'EUTRD', 'PUTOE', 'TUNOA', 'PUTO', 'TUO', 'TUS', 'PRW', 'MRSTM',
       'TUKEY', 'SUSP', 'SUG', 'EUAC', 'PUTWZ', 'EUOPO', 'EUO', 'PUOPO',
       'EUTM', 'MUTO', 'PUSI', 'EUBO', 'SUBT', 'SUROB', 'MRUIR', 'TRO',
       'MUPLC', 'SUAE', 'PUTR', 'MUNOA', 'TUCC', 'EUOE', 'PUSNT', 'MRD',
       'SUEAS', 'MRWEA', 'MRUI', 'MUCL', 'ERAC', 'PUCSC', 'PUSCR', 'ERHV',
       'MRPAA', 'PRO', 'SRO', 'MUWR', 'EUME', 'EUNEA', 'SRUT', 'SUPOL',
       'ERPR', 'PUTNT', 'XXXXX', 'EUSC', 'EUTR', 'ERTR', '

In [145]:
subway_df['Line'].unique()

array(['BD', 'YU', 'SRT', 'SHP', 'YU/BD', 'YU / BD', 'YU BD', nan,
       'BD/YU', 'B/D', 'YU/ BD', 'YU - BD', '11 BAYVIEW', 'YU - BD LINE',
       'BD LINE', '85 SHEPPARD EAST', 'YU LINE', 'YU-BD',
       'BLOOR DANFORTH LINES', '999', '16 MCCOWAN', 'YUS',
       'YONGE UNIVERSITY SERVI'], dtype=object)

In [146]:
subway_df2 = subway_df.drop(columns=['Line'])

In [147]:
subway_df2['Date'] = pd.to_datetime(subway_df2['Date'])
subway_df2['year'] = subway_df2['Date'].dt.year
subway_df2['month'] = subway_df2['Date'].dt.month
subway_df2['dayofweek'] = subway_df2['Date'].dt.dayofweek

subway_df2 = subway_df2.drop(columns=['Date'])
subway_df2 = subway_df2.drop(columns=['Time'])

In [148]:
subway_df2.head()

Unnamed: 0,Day,Station,Code,Min Delay,Min Gap,Bound,Vehicle,year,month,dayofweek
0,Monday,KIPLING STATION (ENTER,TUSC,0,0,W,5251,2017,5,0
1,Monday,ISLINGTON STATION,MUSC,0,0,W,5182,2017,5,0
2,Monday,UNION STATION (DOWNSVI,MUPAA,3,7,N,5811,2017,5,0
3,Monday,MCCOWAN STATION,MRTO,4,9,S,3006,2017,5,0
4,Monday,CASTLE FRANK STATION,MUO,7,11,W,5296,2017,5,0


In [149]:
subway_df2 = pd.concat([subway_df2.drop('Code', axis=1), pd.get_dummies(subway_df2['Code'])], axis=1)
subway_df2 = pd.concat([subway_df2.drop('Day', axis=1), pd.get_dummies(subway_df2['Day'])], axis=1)
subway_df2 = pd.concat([subway_df2.drop('Bound', axis=1), pd.get_dummies(subway_df2['Bound'])], axis=1)
subway_df2 = pd.concat([subway_df2.drop('Station', axis=1), pd.get_dummies(subway_df2['Station'])], axis=1)

In [151]:
# subway_df2.head()
# subway_df2['Time'] = subway_df2['Time'].to_string()

In [152]:
from sklearn.model_selection import train_test_split

subway_df2['label'] = (subway_df2['Min Delay'] >= 5).astype(int)

X = subway_df2.drop(columns=['Min Delay', 'label'])

y = subway_df2['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2018)

In [153]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=2018)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=2018, verbose=0,
            warm_start=False)

In [155]:
from sklearn.metrics import roc_auc_score

clf_proba = clf.predict_proba(X_test)[:, -1]

roc_auc_score(y_test, clf_proba)

0.9598950989189421

In [None]:
# from imblearn.over_sampling import SMOTE
# sm = SMOTE(random_state=2018)
# X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())