In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from datetime import datetime, timedelta
import math

In [48]:
def generate_dataframe():
    """Build a table of predicted counts with one column per violation precinct.

    Reads ``final_dataset.csv``, keeps the rows whose ``Violation Precinct``
    is below 124 and calls ``read_data`` once per distinct precinct. Each
    precinct's ``prediction`` column is relabelled with the precinct value
    and left-joined, on the index, onto the first precinct's predictions.

    Progress lines and the mean of the per-precinct RMSE values are printed.

    Returns:
        pandas.DataFrame: one column per precinct, labelled with the precinct
        value and carrying the index returned by ``read_data`` for the first
        precinct. The frame is empty when no precinct passes the filter.
    """
    df = pd.read_csv("final_dataset.csv")
    df = df.loc[df['Violation Precinct'] < 124]

    violation_precincts = df["Violation Precinct"].unique()

    print(len(violation_precincts))

    # Nothing to model: return early instead of dividing by zero further down.
    if len(violation_precincts) == 0:
        print("Avg RMSE", float("nan"))
        return pd.DataFrame()

    final_df = None
    total_rmse = 0

    for i, precinct in enumerate(violation_precincts):
        new_dataset, RMSE = read_data(precinct, i)
        # Keep a one-column DataFrame (not a Series) so the result has the
        # same type whether one or many precincts are processed.
        precinct_column = new_dataset[["prediction"]].rename(columns={"prediction": precinct})
        print("dataset", i, "read")

        total_rmse += RMSE

        if final_df is None:
            final_df = precinct_column
        else:
            # how='left' keeps only the index labels of the first precinct;
            # labels missing from later precincts become NaN in their column.
            final_df = pd.merge(final_df, precinct_column, how='left', left_index=True, right_index=True)

        print("dataset", i, "merged")

    print("Avg RMSE", total_rmse / len(violation_precincts))

    return final_df

In [65]:
def read_data(precinct, i):
    """Load one precinct's history, fit the model and return its predictions.

    Reads ``final_dataset.csv``, keeps the rows for ``precinct`` whose
    ``year`` is below 2017, indexes them by the parsed ``Issue Date`` and
    hands the feature and target columns to ``anomaly_prediction``.

    Args:
        precinct: value matched against the ``Violation Precinct`` column.
        i: sequence number forwarded unchanged to ``anomaly_prediction``.

    Returns:
        tuple: ``(new_test, RMSE)`` where ``new_test`` is the frame returned
        by ``anomaly_prediction`` re-indexed by its ``Date`` column and
        ``RMSE`` is the error value it reported.

    Raises:
        ValueError: if no rows remain for the precinct after filtering.
    """
    # The CSV is re-read on every call, so callers looping over precincts pay
    # the parsing cost once per precinct.
    df = pd.read_csv("final_dataset.csv")

    selected = (df['Violation Precinct'] == precinct) & (df['year'] < 2017)
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of ``df`` (avoids SettingWithCopyWarning).
    dataset = df.loc[selected].copy()
    dataset["Issue Date"] = pd.to_datetime(dataset["Issue Date"])
    dataset = dataset.set_index('Issue Date')

    features = ['month', 'day', 'dayofweek', 'year']
    target = ['count']

    n_rows = dataset.shape[0]
    if n_rows == 0:
        raise ValueError(f"no rows before 2017 for precinct {precinct!r}")

    # The anchor date is the index label of the row this many positions from
    # the end: 100 normally, 10 for short histories, and never more than the
    # number of rows available (previously fewer than 10 rows raised IndexError).
    # NOTE(review): this assumes rows are in chronological order — confirm.
    anchor_offset = 100 if n_rows >= 100 else 10
    anchor_offset = min(anchor_offset, n_rows)
    anchor_day = dataset.iloc[-anchor_offset].name

    new_test, RMSE = anomaly_prediction(dataset[features], dataset[target], anchor_day, 100, features, i)
    new_test = new_test.set_index('Date')

    return new_test, RMSE

In [50]:
def train_and_predict(X_train, y_train, X_test, output_df, i, lr = 0.01, n_estimators = 1000):
    """Fit a gradient-boosting regressor and attach its predictions to a frame.

    Args:
        X_train: training features passed to ``fit``.
        y_train: training targets; ``.values.ravel()`` is applied before fitting.
        X_test: features to predict for.
        output_df: frame that receives a ``prediction`` column (modified in place).
        i: sequence number, used only in the progress message.
        lr: learning rate for the regressor.
        n_estimators: number of boosting stages for the regressor.

    Returns:
        tuple: ``(output_df, predictions)`` — the same frame object that was
        passed in, and the raw prediction array.
    """
    booster_options = dict(
        learning_rate=lr,
        n_estimators=n_estimators,
        min_samples_split=10,
        min_samples_leaf=10,
    )
    model = GradientBoostingRegressor(**booster_options)

    # The regressor is given a flat 1-D target array.
    flat_targets = y_train.values.ravel()
    model.fit(X_train, flat_targets)

    predicted = model.predict(X_test)
    output_df['prediction'] = predicted

    print("dataset", i, "trained")
    return output_df, predicted

In [59]:
def anomaly_prediction(X_train, y_train, current_day, num_days, features, i):
    """Fit on the supplied history, predict a date window and score the overlap.

    The prediction window runs from 10 days before ``current_day`` through
    ``num_days`` days after it, both ends included. Calendar features are
    derived for every date in the window and ``train_and_predict`` adds a
    ``prediction`` column.

    The RMSE compares each prediction with the observed ``count`` for the
    same date when ``y_train`` has a ``DatetimeIndex``. For any other index
    it falls back to comparing the last ``10 + num_days`` rows of ``y_train``
    with the first predictions by position.

    The model is fitted on all of ``X_train``/``y_train``, including the rows
    that are scored, so the reported RMSE is an in-sample figure.

    Args:
        X_train: training features.
        y_train: DataFrame with a ``count`` column holding the observed values.
        current_day: anchor date of the prediction window.
        num_days: number of days after ``current_day`` to predict.
        features: names of the feature columns passed to the model.
        i: sequence number, used only in progress messages.

    Returns:
        tuple: ``(new_test, RMSE)`` — the window frame with ``Date``, the
        calendar features and ``prediction``; and the RMSE as a float
        (``nan`` when no observation falls inside the window).
    """
    lookback_days = 10
    window_start = current_day - timedelta(days=lookback_days)
    window_end = current_day + timedelta(days=num_days)
    date_range = pd.date_range(window_start, window_end)
    new_test = date_range.to_frame(index = False, name = "Date")

    new_test['dayofweek'] = new_test.Date.dt.dayofweek
    # NOTE(review): 'day' is the day of the year here; confirm the training
    # data's 'day' column uses the same definition.
    new_test['day'] = new_test.Date.dt.dayofyear
    new_test['month'] = new_test.Date.dt.month
    new_test['year'] = new_test.Date.dt.year

    X_test = new_test[features]
    new_test, _ = train_and_predict(X_train, y_train, X_test, new_test, i)

    print()

    observed = y_train['count']
    if isinstance(observed.index, pd.DatetimeIndex):
        # Pair each observation with the prediction for the same date, so gaps
        # in the history or a short history cannot shift the comparison.
        predicted_by_date = new_test.set_index('Date')['prediction']
        in_window = observed.index.isin(predicted_by_date.index)
        ground_truth = observed[in_window].to_numpy()
        predictions = predicted_by_date.reindex(observed.index[in_window]).to_numpy()
    else:
        # No dates to align on: compare the tail of the history positionally.
        ground_truth = observed.iloc[-(lookback_days + num_days):].to_numpy()
        predictions = new_test['prediction'].to_numpy()[:len(ground_truth)]

    print(len(predictions), len(ground_truth))

    print("dataset", i, "predicted")

    if len(ground_truth) == 0:
        # Nothing to score; report nan rather than averaging an empty array.
        RMSE = float('nan')
    else:
        MSE = np.square(np.subtract(ground_truth, predictions)).mean()
        RMSE = math.sqrt(MSE)

    print("Root Mean Square Error:\n")
    print(RMSE)

    return new_test, RMSE

In [66]:
# Run the whole pipeline: generate_dataframe() fits one model per precinct via
# read_data() and merges the per-precinct predictions into a single frame.
# The progress lines and RMSE values shown below are printed by those helpers.
final_df = generate_dataframe()

124
dataset 0 trained

110 110
dataset 0 predicted
Root Mean Square Error:

7.935660656024058
dataset 0 read
dataset 0 merged
dataset 1 trained

110 110
dataset 1 predicted
Root Mean Square Error:

40.06037787283195
dataset 1 read
dataset 1 merged
dataset 2 trained

110 110
dataset 2 predicted
Root Mean Square Error:

2.036939796607211
dataset 2 read
dataset 2 merged
dataset 3 trained

110 110
dataset 3 predicted
Root Mean Square Error:

1.6452763227353175
dataset 3 read
dataset 3 merged
dataset 4 trained

110 110
dataset 4 predicted
Root Mean Square Error:

0.7907178085395788
dataset 4 read
dataset 4 merged
dataset 5 trained

110 110
dataset 5 predicted
Root Mean Square Error:

22.77987689860759
dataset 5 read
dataset 5 merged
dataset 6 trained

110 110
dataset 6 predicted
Root Mean Square Error:

32.65801908375152
dataset 6 read
dataset 6 merged
dataset 7 trained

110 110
dataset 7 predicted
Root Mean Square Error:

22.314988752866476
dataset 7 read
dataset 7 merged
dataset 8 trained

dataset 66 trained

110 110
dataset 66 predicted
Root Mean Square Error:

38.63712182597116
dataset 66 read
dataset 66 merged
dataset 67 trained

110 110
dataset 67 predicted
Root Mean Square Error:

23.073330048889172
dataset 67 read
dataset 67 merged
dataset 68 trained

110 110
dataset 68 predicted
Root Mean Square Error:

25.876525134165824
dataset 68 read
dataset 68 merged
dataset 69 trained

110 110
dataset 69 predicted
Root Mean Square Error:

18.767245153608926
dataset 69 read
dataset 69 merged
dataset 70 trained

110 110
dataset 70 predicted
Root Mean Square Error:

35.5284996892858
dataset 70 read
dataset 70 merged
dataset 71 trained

110 110
dataset 71 predicted
Root Mean Square Error:

26.7055365300065
dataset 71 read
dataset 71 merged
dataset 72 trained

110 110
dataset 72 predicted
Root Mean Square Error:

30.68156930895275
dataset 72 read
dataset 72 merged
dataset 73 trained

110 110
dataset 73 predicted
Root Mean Square Error:

21.83841003283159
dataset 73 read
dataset 7

In [None]:
# Raw rows for precinct 12, indexed by issue date.
# NOTE(review): data_cringe is only referenced by the commented-out plot
# below; the active plot draws column 31 of final_df instead.
temp_df = pd.read_csv("final_dataset.csv")
data_cringe = (
    temp_df
    .loc[temp_df['Violation Precinct'] == 12]
    .set_index('Issue Date')
)
# Optional year filters, currently disabled:
# data_cringe = data_cringe.loc[data_cringe['year'] < 2017]
# data_cringe = data_cringe.loc[data_cringe['year'] >= 2016]

# Disabled plot of the raw counts:
# ax = data_cringe.plot(y = "count")

# Plot the predictions stored under column label 31 of final_df.
# new_df receives whatever DataFrame.plot returns (the plot object, not a frame).
new_df = final_df.plot(figsize=(10, 3), y=31)

In [35]:
# Write the merged per-precinct predictions to gradient.csv.
# NOTE(review): this cell's execution count (35) is lower than that of the
# cell that builds final_df (66), so the file on disk may predate the latest
# run — re-run this cell after regenerating final_df.
final_df.to_csv("gradient.csv")