In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [2]:
# Kaggle input directory for the competition data (trailing slash is part of the path).
DIR_PATH = "../input/godaddy-microbusiness-density-forecasting/"

# Read both CSVs in one pass; the tuple order fixes which frame is bound to which name.
train, test = (pd.read_csv(DIR_PATH + file_name) for file_name in ("train.csv", "test.csv"))

In [3]:
# Print the column names, non-null counts, dtypes and memory footprint of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122265 entries, 0 to 122264
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   row_id                 122265 non-null  object 
 1   cfips                  122265 non-null  int64  
 2   county                 122265 non-null  object 
 3   state                  122265 non-null  object 
 4   first_day_of_month     122265 non-null  object 
 5   microbusiness_density  122265 non-null  float64
 6   active                 122265 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 6.5+ MB


In [4]:
# Preview the first five rows of the training frame.
train.iloc[:5]

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243


In [5]:
# Count the distinct county identifiers (cfips) present in the training frame.
train["cfips"].nunique()

3135

In [6]:
# Parse the month column into datetimes in both frames, modifying them in place.
for frame in (train, test):
    frame["first_day_of_month"] = pd.to_datetime(frame["first_day_of_month"])

In [7]:
# Upper bound on the summed, range-normalised absolute residual of the per-county
# linear fit; above it the forecasting cell falls back to the last observed value.
THRESHOLD = 8
# Counties whose latest `active` count is below this also fall back to the last value.
ACTIVE_THRESHOLD = 300

# Derive the county count from the data instead of hard-coding 3135, so the
# per-county sizes stay correct if the input files change.
N_COUNTIES = train.cfips.nunique()

# Rows per county in each file, i.e. the length of one county's series.
TRAIN_SIZE = len(train) // N_COUNTIES
TEST_SIZE = len(test) // N_COUNTIES
print("Number of siries in train: {}.".format(TRAIN_SIZE))
print("Number of siries in test: {}.".format(TEST_SIZE))

Number of siries in train: 39.
Number of siries in test: 8.


In [8]:
def _forecast_next_density(density, last_active, x_fit, x_future):
    """Forecast one county's next density value from its history.

    Parameters
    ----------
    density : 1-D numpy array of the county's densities in chronological order.
    last_active : the county's most recent `active` count.
    x_fit : column vector of time indices matching `density`, used to fit the line.
    x_future : column vector whose row 0 is the last fitted time index and whose
        row 1 is the next step; only these two rows are used.

    Returns
    -------
    (forecast, used_trend) : the forecast value, and True when the linear trend
        was used rather than the last observed value.
    """
    last = density[-1]

    # Counties below the activity threshold never use the trend, so skip the fit.
    if last_active < ACTIVE_THRESHOLD:
        return last, False

    half_range = (density.max() - density.min()) / 2
    # A flat series has no trend to extrapolate, and normalising its residuals
    # by a zero range would divide by zero.
    if half_range == 0:
        return last, False

    model = LinearRegression().fit(x_fit, density)
    residuals = model.predict(x_fit) - density
    # Total absolute residual, expressed in units of half the series' range.
    misfit = (np.abs(residuals) / half_range).sum()
    if misfit > THRESHOLD:
        return last, False

    trend = model.predict(x_future)
    # Shift the fitted line so it passes through the last observed value, then
    # take its value one step ahead.
    shift = last - trend[0]
    return trend[1] + shift, True


ids = train.cfips.unique()

# Time indices 0..TRAIN_SIZE-1 for fitting; x_test starts at the last training
# index so that row 0 anchors the line and row 1 is the first forecast step.
x_train = np.arange(TRAIN_SIZE).reshape((-1, 1))
x_test = np.arange(TRAIN_SIZE-1, TRAIN_SIZE+TEST_SIZE).reshape((-1, 1))

linear_preds = np.zeros(len(ids))
last_preds = np.zeros(len(ids))

# Number of counties for which the linear trend was accepted.
lin_trend = 0

# Group once instead of scanning the whole frame with a boolean mask per county.
county_groups = train.groupby("cfips", sort=False)
for i, c in enumerate(ids):
    # Sort by date so "last" really is the most recent observation, whatever
    # the row order of the input file.
    df = county_groups.get_group(c).sort_values("first_day_of_month")
    density = df.microbusiness_density.to_numpy()
    if len(density) != TRAIN_SIZE:
        raise ValueError(
            "County {} has {} rows, expected {}.".format(c, len(density), TRAIN_SIZE)
        )

    last_preds[i] = density[-1]
    linear_preds[i], used_trend = _forecast_next_density(
        density, df.active.to_numpy()[-1], x_train, x_test
    )
    lin_trend += int(used_trend)

# Blend the trend-based forecast with the naive last-value forecast.
final_preds = (linear_preds + last_preds) / 2

In [9]:
# One forecast per county, indexed by cfips, as a single-column frame ready to be joined.
target = pd.Series(final_preds, index=ids, name="microbusiness_density").to_frame()

In [10]:
# Attach each county's forecast to every test row of that county and keep only
# the two columns written to the submission. This rebinds `test` to a frame
# without `cfips`, so the cell cannot be re-run without reloading the data.
test = test.join(target, on='cfips')[['row_id', 'microbusiness_density']]

# A county present in test but absent from `target` would be left as NaN by
# the join; fail loudly rather than write a submission with missing values.
rows_without_forecast = test['microbusiness_density'].isna()
if rows_without_forecast.any():
    raise ValueError(
        "{} test rows have no forecast (cfips missing from target).".format(
            int(rows_without_forecast.sum())
        )
    )

In [11]:
# Write the submission without the index column, then preview the first five rows.
SUBMISSION_FILE = "submission.csv"
test.to_csv(SUBMISSION_FILE, index=False)
test.head(5)

Unnamed: 0,row_id,microbusiness_density
0,1001_2022-11-01,3.470458
1,1003_2022-11-01,8.359798
2,1005_2022-11-01,1.232074
3,1007_2022-11-01,1.28724
4,1009_2022-11-01,1.835877
