In [None]:
####
from sklearn.svm import SVR
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from scipy.stats.mstats import winsorize
from sklearn.cluster import KMeans
from pandas import to_datetime
#!pip install fbprophet
from fbprophet import Prophet
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
## Load the train/test CSVs, build the target, winsorize the training rows,
## and stack both sets into one frame for shared feature engineering

## Load Data Sets
# 'ofd_date' is intentionally left exactly as read from the CSV here: the cell
# that builds 'Id' concatenates the raw value with a string and only then
# converts the column to datetime.
train = pd.read_csv('train_data.csv')
train['y'] = train['Earlies_Exp'] - train['MNR_SNR_Exp']
test = pd.read_csv('test.csv')
test['y'] = 0  # placeholder target so test rows carry the same columns as train

## Only choose columns from test in train (test's first column is skipped)
# .copy() makes the column assignments below write to an independent frame
# instead of a slice of the original.
train = train[test.columns[1:]].copy()

# Remove outliers in the training set: the lowest and highest 5% of each
# listed column are replaced by the nearest remaining value.
for column in ['y', 'OFD', 'Slam', 'Earlies_Rec', 'Rollover', 'Returns', 'R_Sideline', 'Sideline']:
    train[column] = winsorize(train[column], limits=(0.05, 0.05))

## Add test to training data for processing
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
train = pd.concat([train, test])
#train

In [None]:
# Order the combined rows by date, then station, renumber the index from zero,
# and discard the leftover 'Unnamed: 0' column.
# NOTE(review): 'ofd_date' has not been converted to datetime at this point in
# the visible code, so this is presumably a string sort -- confirm the date
# format sorts chronologically.
train = (
    train
    .sort_values(by=['ofd_date', 'station_code'])
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0'])
)
#train

In [None]:
# Build the row identifier "<ofd_date>_<station_code>" from the raw date value,
# then convert 'ofd_date' to datetime so calendar parts can be extracted.
train['Id'] = train["ofd_date"] + "_" + train["station_code"].map(str)
train['ofd_date'] = pd.to_datetime(train['ofd_date'])

# Calendar features derived from the converted date
date_parts = train['ofd_date'].dt
month, day, weekday = date_parts.month, date_parts.day, date_parts.weekday

# Work on a copy that carries the calendar features and is indexed by 'Id'
train_dates = (
    train
    .assign(day=day, weekday=weekday, month=month)
    .set_index('Id')
)
#train_dates

In [None]:
# The raw date and the fc_codes column are not used as model inputs; remove them
df = train_dates.drop(columns=['ofd_date', 'fc_codes'])
#df

In [None]:
## Create Lags for all the variables
def create_lags(data, no_lags, group_col=None):
    """Add lagged copies of the volume columns to ``data`` and return it.

    For every lag ``i`` in ``1 .. no_lags - 1`` and every source column listed
    below, a column ``lag_<i>_<label>`` holding the source shifted down by
    ``i`` rows is added.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns Earlies_Rec, OFD, Slam, Rollover, Returns and
        Sideline. It is modified in place.
    no_lags : int or int-like
        Exclusive upper bound of the lag range (``no_lags=61`` -> lags 1..60).
    group_col : str, optional
        When given, rows are shifted within each group of this column instead
        of across the whole frame. Defaults to ``None`` (plain row shift).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the lag columns appended.
    """
    # (source column, label used in the generated column name)
    lag_sources = [
        ("Earlies_Rec", "Earlies_Rec"),
        ("OFD", "OFD"),
        ("Slam", "Slam"),
        ("Rollover", "Rollover"),
        ("Returns", "Returns"),
        # The output label is spelled "Slideline"; kept so that generated
        # column names stay the same for any downstream consumer.
        ("Sideline", "Slideline"),
    ]

    # NOTE(review): with group_col=None the shift is purely positional, so a
    # lag may pull a value from a different station when rows of several
    # stations are interleaved -- confirm whether per-station lags are wanted.
    grouped = data.groupby(group_col) if group_col is not None else None

    for i in range(1, int(no_lags)):
        for source, label in lag_sources:
            column = grouped[source] if grouped is not None else data[source]
            data["lag_{}_{}".format(i, label)] = column.shift(i)
    return data

# The lag range is range(1, no_lags), so 61 yields lags 1 through 60.
# create_lags writes the new columns into `df` itself and returns that frame.
df_lags = create_lags(data=df, no_lags=61)
#df_lags

In [None]:
## One-hot encode country codes, turn the station code into a numeric 'DC'
## column, and add squared / cubed versions of every predictor
df_encoded = pd.get_dummies(df_lags, columns=['country_code'])
# 'DC' is the station code without its first character, parsed as a number
df_encoded['DC'] = pd.to_numeric(df_encoded['station_code'].str[1:])
df_encoded = df_encoded.drop(['station_code'], axis=1)
#df_encoded

# The target 'y' is excluded from the transform list: powers of the target
# would otherwise be kept as predictors (only 'y' itself is dropped before
# fitting) and leak the answer into the model.
feature_transform = [feature for feature in df_encoded.columns if feature != 'y']

# Collect the new columns first and attach them in a single concat; inserting
# hundreds of columns one at a time fragments the frame.
powered = {}
for feature in feature_transform:
    powered[f'{feature}_sqrd'] = df_encoded[feature].pow(2)
    powered[f'{feature}_p3'] = df_encoded[feature].pow(3)
df_encoded = pd.concat([df_encoded, pd.DataFrame(powered)], axis=1)
#df_encoded.shape

(13698, 1128)

In [None]:
## Create a correlation matrix: pairwise Pearson correlation between all columns
corr = df_encoded.corr(method='pearson')

In [None]:
## Filter columns by their correlation with the target 'y'
# The cut-off is zero on both sides, so every column with a non-zero
# correlation is kept; columns whose correlation is exactly 0 or NaN fall out.
target_corr = corr['y']
strong_var_pos = corr.index[target_corr > 0].to_list()
strong_var_neg = corr.index[target_corr < 0].to_list()
# After this line strong_var_pos holds the positive names followed by the negative ones
strong_var_pos += strong_var_neg

In [None]:
## Restrict the encoded frame to the columns selected by the correlation filter
# Column order follows df_encoded, not the order of the selection list.
selected_names = set(strong_var_pos)
kept_columns = [name for name in df_encoded.columns if name in selected_names]
df_strong = df_encoded[kept_columns]
#df_strong.shape

In [None]:
## Split the rows by calendar month into a training frame and a hold-out frame
# Months 2, 3, 6 and 7 are left out of training; month 6 is the hold-out set.
# All four exclusions are applied in one mask so that none can be lost to a
# mistyped variable name (month 6 previously stayed in the training rows).
excluded_train_months = [2, 3, 6, 7]
df_train = df_strong.loc[~df_strong['month'].isin(excluded_train_months)]
df_test = df_strong.loc[df_strong['month'] == 6]

In [None]:
## Separate the target 'y' from the predictors for the training and hold-out frames
X_train, y_train = df_train.drop(columns=['y']), df_train['y']
X_test, y_test = df_test.drop(columns=['y']), df_test['y']

In [None]:
## Fit a Lasso regression on the training months and score it on the hold-out month
from sklearn.linear_model import Lasso
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import PowerTransformer

## Define K-Fold
# `cv` and `alphas` are defined here but are not passed to the fit below.
cv = RepeatedKFold(n_splits=20, n_repeats=10, random_state=1)
alphas = [1]

## Run Model, with and without scaling
# The features are fed to Lasso unscaled; the commented lines are the scaled
# linear-regression alternative.
model = Lasso(alpha = 0.999999999)
#scaler = MinMaxScaler()
#model = make_pipeline(scaler, LinearRegression())
model.fit(X_train, y_train)

## Make Prediction
y_pred = model.predict(X_test)
# RMSE on the hold-out rows. The `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly.
# NOTE(review): if the hold-out rows are the ones whose 'y' was filled with the
# placeholder 0, this number is not a real validation score -- confirm.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
## Build the submission frame: one row per hold-out Id with its prediction
# df_test is indexed by 'Id' and y_pred was produced from df_test's rows in
# order, so the two line up positionally.
# Building the frame from named columns keeps the 'Id' / 'Expected' headers;
# a column-wise concat with ignore_index=True would relabel them 0, 1, 2.
final = pd.DataFrame({'Id': df_test.index, 'Expected': y_pred})
final.head()

In [None]:
# Export Data for submission
# Colab-only: mounts Google Drive under ./drive, writes the submission CSV to
# the working directory, then copies it into the Drive root.
from google.colab import drive
# force_remount=False: the mount call is made without forcing a remount
drive.mount('drive/', force_remount=False)
# NOTE(review): to_csv writes the row index as an extra first column by
# default -- confirm the submission format expects it.
final.to_csv('linear_squared.csv')
# IPython shell escape; only works inside a notebook/IPython session
!cp linear_squared.csv "drive/My Drive/"

Mounted at drive/
