# Load Libraries

In [None]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [None]:
# SET VALIDATE = True TO COMPUTE VALIDATION.
# AND SET VALIDATE = False TO MAKE A SUBMISSION TO LB
VALIDATE = False

# USE 1 TO VALIDATE WITH LAST MONTH OF TRAIN, 2 FOR SECOND TO LAST ETC ETC
# NEW TRAIN DATA IS EVERYTHING BEFORE THIS
VAL_MONTH = 1

In [None]:
# READ THE TRAINING CSV AND CONVERT THE MONTH COLUMN TO DATETIMES
train = pd.read_csv(
    '/kaggle/input/godaddy-microbusiness-density-forecasting/train.csv'
)
train['first_day_of_month'] = pd.to_datetime(train['first_day_of_month'])

# ORDER ROWS BY COUNTY, THEN MONTH, SO EACH COUNTY'S SERIES IS CONTIGUOUS AND CHRONOLOGICAL
train = train.sort_values(by=['cfips', 'first_day_of_month'])
train = train.reset_index(drop=True)

print(train.shape)
train.head()

In [None]:
# DISTINCT MONTHS IN CHRONOLOGICAL ORDER. DERIVING THEM FROM THE UNIQUE DATES AVOIDS
# HARD-CODING THE LENGTH OF THE HISTORY AND DOES NOT RELY ON THE LAST ROWS OF THE
# SORTED FRAME ALL BELONGING TO ONE COUNTY.
all_months = np.sort(train.first_day_of_month.unique())

if VALIDATE and not (1 <= VAL_MONTH < len(all_months)):
    raise ValueError(
        f'VAL_MONTH must be between 1 and {len(all_months) - 1}, got {VAL_MONTH}'
    )

# HOLD OUT EXACTLY ONE MONTH: THE VAL_MONTH-TH FROM THE END.
# TRAIN IS EVERYTHING BEFORE IT; ANY LATER MONTHS ARE LEFT UNUSED.
val_idx = len(all_months) - VAL_MONTH
trn_months = all_months[:val_idx]
val_months = all_months[val_idx:val_idx + 1]

if VALIDATE:
    # BUILD test FIRST: IT MUST BE CUT FROM THE FULL FRAME BEFORE train IS SHRUNK
    test = train.loc[train.first_day_of_month.isin(val_months)]
    train = train.loc[train.first_day_of_month.isin(trn_months)]
print( train.shape )
train.head()

In [None]:
# IN SUBMISSION MODE THE TEST ROWS COME FROM THE COMPETITION FILE;
# IN VALIDATION MODE test WAS ALREADY CUT OUT OF train
if not VALIDATE:
    test = pd.read_csv(
        '/kaggle/input/godaddy-microbusiness-density-forecasting/test.csv'
    )
    test['first_day_of_month'] = pd.to_datetime(test['first_day_of_month'])

# SAME ORDERING AS train: BY COUNTY, THEN MONTH
test = test.sort_values(by=['cfips', 'first_day_of_month'])
test = test.reset_index(drop=True)

print(test.shape)
test.head()

In [None]:
# NUMBER OF TRAIN ROWS FOR EACH cfips VALUE
train['cfips'].value_counts()

In [None]:
# NUMBER OF TEST ROWS FOR EACH cfips VALUE
test['cfips'].value_counts()

In [None]:
# ROWS PER COUNTY IN EACH FRAME. THE COUNTY COUNT IS TAKEN FROM THE DATA
# INSTEAD OF THE HARD-CODED 3135, SO THE SIZES STAY CORRECT IF THE SET OF COUNTIES CHANGES.
# ASSUMES EVERY COUNTY HAS THE SAME NUMBER OF ROWS (SEE THE value_counts CELLS).
TRAIN_SZ = len(train) // train.cfips.nunique()
TEST_SZ = len(test) // test.cfips.nunique()

print(TRAIN_SZ)
print(TEST_SZ)

In [None]:
# LIST THE COLUMNS AVAILABLE IN THE TRAINING FRAME
train.columns

In [None]:
# COUNT MISSING VALUES IN EACH TEST COLUMN
test.isna().sum()

In [None]:
# VISUAL DEMO: FOR A FEW RANDOM COUNTIES, FIT A STRAIGHT LINE TO THE TRAIN SERIES,
# DECIDE WHETHER THE SERIES IS "LINEAR ENOUGH", AND PLOT THE RESULTING FORECAST
# TOGETHER WITH THE DISTRIBUTION OF THE TRAIN RESIDUALS.
DISPLAY = 8
THRESHOLD = 8

IDS = train.cfips.unique()

# MONTH INDICES. SIZES COME FROM TRAIN_SZ / TEST_SZ (NOT HARD-CODED 39 / 8) SO THE
# CELL ALSO WORKS IN VALIDATION MODE. x_test STARTS AT THE LAST TRAIN MONTH, WHICH IS
# ONLY USED TO ANCHOR THE LINE TO THE LAST OBSERVED VALUE.
x_train = np.arange(TRAIN_SZ).reshape((-1,1))
x_test = np.arange(TRAIN_SZ-1,TRAIN_SZ+TEST_SZ).reshape((-1,1))

# DISTINCT FORECAST MONTHS IN CHRONOLOGICAL ORDER (X AXIS FOR THE TEST PREDICTIONS)
test_months = np.sort(test.first_day_of_month.unique())

for i in range(DISPLAY):
    c = np.random.choice(IDS)
    df = train.loc[train.cfips==c]
    density = df.microbusiness_density.values
    last = density[-1]

    model = LinearRegression()
    model.fit(x_train,density)
    p = model.predict(x_train)

    # COMPUTE TRAIN ERROR
    err = p - density
    rng = density.max() - density.min()

    # DETERMINE IF TIME SERIES IS LINEAR OR NOT:
    # s = SUM OF ABSOLUTE RESIDUALS, EACH EXPRESSED AS A FRACTION OF HALF THE TRAIN RANGE.
    # (A CONSTANT SERIES HAS rng == 0 AND YIELDS NaN HERE.)
    s = np.sum(np.abs(err) / (rng/2))

    # ONE DECISION SHARED BY THE FORECAST AND THE TITLE, SO THEY CAN NEVER DISAGREE.
    # WRITTEN AS not (s > THRESHOLD) SO s == THRESHOLD AND NaN BOTH COUNT AS LINEAR.
    use_linear = not (s > THRESHOLD)

    # INFER TEST DATA WITH LINEAR REGRESSION
    p2 = model.predict(x_test)
    shift = last - p2[0]  # MOVE THE LINE SO IT PASSES THROUGH THE LAST TRAIN VALUE
    if use_linear: preds = p2[1:]+shift
    else: preds = [last]*TEST_SZ

    # PLOT 1: TRAIN SERIES, FITTED LINE AND FORECAST
    plt.figure(figsize=(20,5))
    plt.plot(df.first_day_of_month,density,'-o',label='train data')
    plt.plot(df.first_day_of_month,p,'--',label='linear regression')
    plt.plot(test_months,preds,'-o',label='test pred')
    pre = ''; post=''
    if use_linear:
        pre='YES, we WILL USE linear regression for\n'
    else:
        pre='NO, we WILL NOT USE linear regression for\n'
        post=' (We will predict last train value)'
    plt.title(f'{pre}CFIPS {c}{post}',size=18)
    plt.xlabel('Date',size=16)
    plt.ylabel('Microbusiness Density',size=16)
    plt.legend()
    plt.show()

    # PLOT 2: RESIDUAL HISTOGRAM ON ITS OWN FIGURE. DRAWING IT ON THE DATE AXES ABOVE
    # WOULD OVERWRITE THAT PLOT'S TITLE AND X LIMITS.
    plt.figure(figsize=(20,5))
    plt.hist(err,bins=20,label='error')
    plt.plot([-rng/2,-rng/2],[0,10],'--',color='black',label='range')
    plt.plot([rng/2,rng/2],[0,10],'--',color='black')
    plt.xlim((-rng * 0.75,rng * 0.75))
    plt.legend()
    # s / (2*TRAIN_SZ) IS THE MEAN ABSOLUTE RESIDUAL AS A FRACTION OF THE FULL TRAIN RANGE
    plt.title(f'Linear Regression\nTrain Error vs. Train Range. (avg={100*s/(2*TRAIN_SZ):2.1f}%)',size=18)
    plt.show()
    print('\n\n\n\n\n\n')

In [None]:
# BUILD THREE PER-COUNTY FORECASTS AND AVERAGE THEM:
#   1. LAST VALUE     - REPEAT THE LAST OBSERVED DENSITY
#   2. SEASONAL       - LAST VALUE NUDGED +/-0.5% WHEN THE SAME MONTH ROSE/FELL IN BOTH
#                       PREVIOUS YEARS AND THE SERIES IS CURRENTLY MOVING THE SAME WAY
#   3. LINEAR         - STRAIGHT-LINE EXTRAPOLATION, ONLY FOR SERIES THAT FIT A LINE WELL
# ALL THREE ARRAYS HAVE SHAPE (NUMBER OF COUNTIES, TEST_SZ); ROW i BELONGS TO ids[i].

# ERROR THRESHOLD AS PERCENTAGE = THRESHOLD / (2 * TRAIN_SZ)   (= THRESHOLD / 78 FOR 39 MONTHS)
THRESHOLD = 8
# COUNTIES WHOSE LATEST active COUNT IS BELOW THIS NEVER GET A TREND ADJUSTMENT
ACTIVE_THRESHOLD = 2000

# WINDOW LENGTHS (IN MONTHS) OF THE MEDIANS USED BY THE SEASONAL RULE
WIDTH1 = 5; WIDTH2 = 7; WIDTH3 = 7

ids = train.cfips.unique()

# x_test STARTS AT THE LAST TRAIN MONTH; THAT FIRST POINT ONLY ANCHORS THE LINE
x_train = np.arange(TRAIN_SZ).reshape((-1,1))
x_test = np.arange(TRAIN_SZ-1,TRAIN_SZ+TEST_SZ).reshape((-1,1))

linear_preds = np.zeros((len(ids), TEST_SZ))
last_preds = np.zeros((len(ids), TEST_SZ))
seasonal_preds = np.zeros((len(ids), TEST_SZ))

sn_trend = 0   # COUNTIES WHERE THE SEASONAL RULE CHANGED THE FORECAST
lin_trend = 0  # COUNTIES WHERE THE LINEAR EXTRAPOLATION WAS USED
for i, c in enumerate(ids):
    df = train.loc[train.cfips == c]
    density = df.microbusiness_density.values

    last = density[-1]
    active = df.active.values[-1]

    # --- MODEL 1: LAST VALUE (SCALAR IS BROADCAST OVER ALL TEST MONTHS) ---
    last_preds[i] = last

    # --- MODEL 2: SEASONAL ---
    # MEDIAN OF THE WIDTH1 MONTHS PRECEDING THE LAST MONTH
    x0 = np.median(density[-1-WIDTH1:-1])
    # VALUE 12 POSITIONS FROM THE END VS. MEDIAN OF THE WIDTH2 MONTHS BEFORE IT
    x1 = density[-12]
    x2 = np.median(density[-12-WIDTH2:-12])
    # VALUE 24 POSITIONS FROM THE END VS. MEDIAN OF THE WIDTH3 MONTHS BEFORE IT
    x3 = density[-24]
    x4 = np.median(density[-24-WIDTH3:-24])

    seasonal = last
    if active >= ACTIVE_THRESHOLD:
        if (x1 > x2) and (x3 > x4) and (last > x0):
            seasonal *= 1.005
        elif (x1 < x2) and (x3 < x4) and (last < x0):
            seasonal *= 0.995
    seasonal_preds[i] = seasonal
    # COUNT EACH ADJUSTED COUNTY EXACTLY ONCE
    if seasonal_preds[i, 0] != last:
        sn_trend += 1

    # --- MODEL 3: LINEAR REGRESSION ---
    model = LinearRegression()
    model.fit(x_train, density)
    err = model.predict(x_train) - density
    rng = density.max() - density.min()

    # SUM OF ABSOLUTE RESIDUALS, EACH AS A FRACTION OF HALF THE TRAIN RANGE.
    # A CONSTANT SERIES (rng == 0) GIVES NaN, WHICH FAILS THE s > THRESHOLD TEST BELOW.
    s = np.sum(np.abs(err) / (rng/2))

    if (s > THRESHOLD) or (active < ACTIVE_THRESHOLD):
        # POOR LINEAR FIT OR SMALL COUNTY: FALL BACK TO THE LAST VALUE
        linear_preds[i] = last
    else:
        p2 = model.predict(x_test)
        shift = last - p2[0]  # MOVE THE LINE SO IT PASSES THROUGH THE LAST TRAIN VALUE
        linear_preds[i] = p2[1:] + shift
        lin_trend += 1

# EQUAL-WEIGHT ENSEMBLE OF THE THREE FORECASTS
final_preds = (linear_preds + last_preds + seasonal_preds) / 3

In [None]:
# (INTENTIONALLY EMPTY CELL - THE BARE `nan` THAT WAS HERE RAISED NameError ON RUN ALL)

In [None]:
# LOAD THE SAMPLE SUBMISSION; ITS microbusiness_density COLUMN IS FILLED IN BELOW
submission = pd.read_csv(
    '/kaggle/input/godaddy-microbusiness-density-forecasting/sample_submission.csv'
)
submission.head()

In [None]:
# COPY EACH COUNTY'S FORECAST (ROW i OF final_preds <-> ids[i]) INTO THE SUBMISSION.
for i, c in enumerate(ids):
    county_rows = test.loc[test.cfips == c]
    if county_rows.empty:
        continue
    in_county = submission['row_id'].isin(county_rows['row_id'])
    # SINGLE .loc ASSIGNMENT: THE CHAINED FORM submission.microbusiness_density[mask] = ...
    # MAY WRITE TO A TEMPORARY COPY (SettingWithCopyWarning; NO-OP UNDER COPY-ON-WRITE).
    # VALUES ARE ASSIGNED POSITIONALLY - ASSUMES THE MATCHING SUBMISSION ROWS ARE IN
    # CHRONOLOGICAL ORDER; TODO CONFIRM AGAINST sample_submission.csv.
    submission.loc[in_county, 'microbusiness_density'] = final_preds[i]


In [None]:
# WRITE THE FILLED-IN SUBMISSION TO THE WORKING DIRECTORY, WITHOUT THE INDEX COLUMN
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# (INTENTIONALLY EMPTY CELL - THE BARE `nan` THAT WAS HERE RAISED NameError ON RUN ALL)