# In [12]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

def filter_nyc(filename='nyc.csv', output_filename='filtered.csv'):
    """Reduce the raw CSV to the numeric columns used for modelling.

    Reads ``filename``, derives ``hour``, ``month`` and ``weekday`` from the
    ``Created Date`` column, encodes ``Agency`` and ``Borough`` as integers,
    turns ``processing_time`` into a binary label and writes the six
    resulting columns to ``output_filename`` without an index column.

    The input must provide the columns ``Created Date``, ``Agency``,
    ``Borough`` and ``processing_time``. An ``Unnamed: 0`` column is
    discarded when present.

    Args:
        filename: Path of the CSV file to read.
        output_filename: Path of the CSV file to write.

    Raises:
        KeyError: If one of the required columns is missing from the input.
    """
    df = pd.read_csv(filename, header=0)

    # 'Unnamed: 0' is the leftover index column of an earlier to_csv() call.
    # Tolerate inputs that do not carry it instead of failing.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # No `infer_datetime_format`: the keyword is deprecated since pandas 2.0,
    # where the format is inferred automatically.
    created = pd.to_datetime(df['Created Date'])
    df['month'] = created.dt.month
    df['hour'] = created.dt.hour
    df['weekday'] = created.dt.dayofweek

    # Agencies are numbered in order of first appearance, so the codes are
    # only meaningful relative to this particular input file.
    agency_codes = {
        agency: code for code, agency in enumerate(df['Agency'].unique())
    }
    df['agency_num'] = df['Agency'].map(agency_codes)

    # A borough is recognised when its name occurs anywhere in the field;
    # everything else is encoded as 0 and removed below.
    borough_codes = {
        'MANHATTAN': 1,
        'BROOKLYN': 2,
        'QUEENS': 3,
        'STATEN ISLAND': 4,
        'BRONX': 5,
    }
    pattern = '(' + '|'.join(borough_codes) + ')'
    df['borough_num'] = (
        df['Borough']
        .str.extract(pattern, expand=False)
        .map(borough_codes)
        .fillna(0)
        .astype('int64')
    )

    # Copy so that the column added next lands on an independent frame and
    # not on a view of the unfiltered data (avoids SettingWithCopyWarning).
    df = df[df['borough_num'] != 0].copy()

    # Binary label: 1 when processing_time >= 1, else 0 (missing values -> 0).
    # NOTE(review): the unit of processing_time is not visible here
    # (presumably days) - confirm.
    df['processing_time_bucket'] = (df['processing_time'] >= 1).astype('int64')

    df = df[['hour', 'month', 'weekday', 'agency_num', 'borough_num',
             'processing_time_bucket']]
    df.to_csv(output_filename, index=False)
    
def build_and_predict(train_filename='filtered.csv',
                      predict_filename='topredict.csv',
                      output_filename='predictions.csv'):
    """Fit a logistic regression on the filtered data and predict new rows.

    The model is trained on the ``hour``, ``month``, ``weekday``,
    ``agency_num`` and ``borough_num`` columns of ``train_filename`` with
    ``processing_time_bucket`` as the target. It is then applied to the
    contents of ``predict_filename``. The predicted labels are written to
    ``output_filename`` as a ``prediction`` column next to a positional
    ``index`` column.

    Args:
        train_filename: CSV holding the feature columns and the label.
        predict_filename: CSV holding the rows to predict.
        output_filename: Path of the CSV file to write.

    Raises:
        KeyError: If the training file lacks a feature or label column.
    """
    # Select features by name rather than by position so that a reordered
    # or extended training file cannot silently feed the wrong columns
    # (or the label itself) into the model.
    feature_columns = ['hour', 'month', 'weekday', 'agency_num', 'borough_num']

    data = pd.read_csv(train_filename)
    test = pd.read_csv(predict_filename)

    x_train = data[feature_columns]
    y_train = data['processing_time_bucket']

    model = LogisticRegression()
    model.fit(x_train, y_train)

    # NOTE(review): every column of the prediction file is handed to the
    # model unchanged; presumably it contains exactly the feature columns
    # listed above, in the same order - confirm.
    predictions = pd.DataFrame(model.predict(test), columns=['prediction'])
    predictions.to_csv(output_filename, index_label='index')
   