In [None]:
####
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from scipy.stats.mstats import winsorize
from sklearn.cluster import KMeans
from pandas import to_datetime
!pip install prophet
import prophet
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error

In [None]:
## Read in the train/test data, one-hot encode country_code, and parse the date column
df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test.csv')
df = pd.get_dummies(df, columns = ['country_code'])
# pd.to_datetime returns a new Series; it must be assigned back, otherwise
# 'ofd_date' stays as plain strings.
df['ofd_date'] = pd.to_datetime(df['ofd_date'])

In [None]:
## Remove Outliers
# Winsorize every volume column with a 5% limit on each tail, so extreme
# values are capped instead of the rows being dropped.
WINSOR_LIMITS = (0.05, 0.05)
WINSOR_COLUMNS = [
  'OFD', 'Slam', 'Earlies_Exp', 'Earlies_Rec', 'MNR_SNR_Exp',
  'Rollover', 'Returns', 'R_Sideline', 'Sideline',
]
for column in WINSOR_COLUMNS:
  df[column] = winsorize(df[column], limits=WINSOR_LIMITS)

In [None]:
## Cluster stations on their per-station means plus a combined mean encoding of var1 + var2
def encode_and_clster(df, var1, var2, n_clusters):
  """Assign every row a station cluster label in a new 'DC' column.

  Stations are summarised by the mean of each numeric column, a combined
  feature ``mean_encode`` (station mean of ``var1`` plus station mean of
  ``var2``) is added, and KMeans is fitted on those station-level rows.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain 'station_code', ``var1``, ``var2`` and the nine volume
      columns listed in ``cluster_columns`` below.
  var1, var2 : str
      Names of the two columns whose station means are summed into
      ``mean_encode``.
  n_clusters : int
      Number of KMeans clusters.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object that was passed in, modified in place with an
      added 'DC' column holding the cluster label of each row's station.
  """
  # One row per station. numeric_only=True keeps string/date columns (for
  # example 'ofd_date') from raising a TypeError on pandas >= 2.0.
  station_means = df.groupby('station_code', as_index=False).mean(numeric_only=True)
  # The country dummies are not used as clustering features.
  dummy_columns = [c for c in station_means.columns if 'country_code' in str(c)]
  station_means = station_means.drop(columns=dummy_columns)

  # station_means already holds one mean per station, so the combined
  # encoding is simply the sum of the two mean columns.
  station_means['mean_encode'] = station_means[var1] + station_means[var2]

  # Create Cluster
  cluster_columns = ['OFD', 'Slam', 'Earlies_Exp', 'Earlies_Rec', 'MNR_SNR_Exp',
                     'Rollover', 'Returns', 'R_Sideline', 'Sideline', 'mean_encode']
  cluster_features = station_means[cluster_columns]
  # random_state is fixed so the cluster labels are reproducible across runs.
  kmean = KMeans(n_clusters=n_clusters, random_state=0).fit(cluster_features)

  # Map each station code to its cluster label (zip pairs them positionally,
  # so this does not depend on the frame's index labels).
  station_to_cluster = {
    station: label
    for station, label in zip(station_means['station_code'], kmean.predict(cluster_features))
  }

  df['DC'] = df['station_code'].map(station_to_cluster)
  return df

# Cluster the stations into 7 groups using the OFD and Slam means for the encoding.
df_new = encode_and_clster(df, var1='OFD', var2='Slam', n_clusters=7)

In [None]:
### PROPHET MODEL

# Prepare data: collect the unique station codes and create the frame that
# accumulates the forecasts (date = ds, prediction = yhat, station = DC).
codes = df_new['station_code'].unique()
# Start empty. Seeding the accumulator with a [0, 0, 0] row would carry a
# bogus "0_0" entry all the way into the submission file.
# NOTE: this variable is named `pandas`; the library itself is referenced as `pd`.
pandas = pd.DataFrame(columns=['ds', 'yhat', 'DC'])

# Define Function to run Prophet Model and return forecast
def run_prophet(var_to_predidct, train, test):
  """Fit a Prophet model on one column of ``train`` and forecast the dates in ``test``.

  Parameters
  ----------
  var_to_predidct : str
      Name of the column in ``train`` to forecast.
  train : pandas.DataFrame
      Training rows; must contain 'ofd_date' and ``var_to_predidct``.
  test : pandas.DataFrame
      Frame with a 'ds' column holding the dates to predict.

  Returns
  -------
  pandas.DataFrame
      The 'ds' and 'yhat' columns of Prophet's forecast.
  """
  # Build a fresh frame with the column names Prophet expects ('ds' and 'y')
  # instead of assigning into a slice of the caller's data. Only the date is
  # parsed as datetime; the target must stay numeric.
  history = pd.DataFrame({
    'ds': to_datetime(train['ofd_date']),
    'y': train[var_to_predidct],
  })
  # Fit & Run Model. The notebook imports the package as `import prophet`,
  # so the class has to be referenced through the module.
  model = prophet.Prophet(weekly_seasonality=True)
  model.fit(history)
  forecast = model.predict(test)
  # Return Forecast
  return forecast[['ds','yhat']]

## For each station call the prophet function and collect the forecasts

# Per-station forecasts are gathered in a list and concatenated once at the
# end: DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
station_forecasts = []
for DC in codes:
  # Filter train and test data for this station
  df_new_test = df_new.loc[df_new['station_code'] == DC]
  # Prophet expects the prediction dates in a column named 'ds'
  test = (test_df.loc[test_df['station_code'] == DC, ['ofd_date']]
          .rename(columns={'ofd_date': 'ds'}))
  if test.empty:
    # Nothing to predict for this station; Prophet raises on an empty frame.
    continue
  # Run Prophet Model on Earlies_Exp and MNR_SNR_Exp
  forecast_1 = run_prophet('Earlies_Exp', df_new_test, test)
  forecast_2 = run_prophet('MNR_SNR_Exp', df_new_test, test)
  # Target forecast = Earlies_Exp forecast minus MNR_SNR_Exp forecast
  forecast_target = forecast_1['yhat']-forecast_2['yhat']
  forecast = pd.DataFrame(forecast_target)
  forecast['ds'] = forecast_1['ds']
  # 'DC' here holds the station code the forecast belongs to
  forecast['DC'] = DC
  station_forecasts.append(forecast)

# Add everything to the final Data Frame in one go
if station_forecasts:
  existing = [pandas] if len(pandas) else []
  pandas = pd.concat(existing + station_forecasts, ignore_index=True)


In [None]:
### Wrangle Data Frame so that it is in the format ready for submission
# Create two real copies so the raw forecasts in `pandas` are not overwritten
# and this cell can be re-run safely (plain assignment would only alias them).
panda = pandas.copy()
panda_New = panda.copy()
# Keep only the date part of each timestamp string ("<date> <time>" -> "<date>")
panda_New['ds'] = panda['ds'].map(str).str.split(' ').str[0]
# Change name of Target to Expected
panda['Expected'] = panda['yhat']
# Create final ID variable: "<date>_<station code>"
panda['Id'] = panda_New['ds']
# .copy() so the column assignment below does not write into a slice of `panda`
panda_final = panda[['Id', 'Expected', 'DC']].copy()
panda_final['Id'] = panda_final['Id'] + "_" + panda_final['DC'].map(str)
# Build final DF
panda_final = panda_final[['Id', 'Expected']]

In [None]:
# Mount Drive and export
from google.colab import drive
drive.mount('drive/', force_remount=False)
panda_final.to_csv('data_prophet.csv')
!cp data_prophet.csv "drive/My Drive/"