# Data Modeling

This notebook was created as part of a workshop on *Reproducible Research in Python*. 

- You can access the entire workshop materials at: [Reproducible Research in Python](https://github.com/mickaeltemporao/reproducible-research-in-python).

**Learning Objective:** 
- Learn to create data pre-processing functions
- Learn how to train and save model objects
- Learn to load and make predictions on unseen data



## Preparing the training set

Let's try to forecast the election based on existing polls!



In [0]:
# Installing and Importing Packages
!pip install wikipedia
import wikipedia as wp 
import pandas as pd

In [0]:
# For the training part we will rely on polls from the 2015 election.
page_titles = [
    "Opinion polling for the 2015 Canadian federal election",
    "Opinion polling for the 2019 Canadian federal election",
]

# Download each page as HTML and keep the first table found on it.
html_pages = [wp.page(page).html().encode("UTF-8") for page in page_titles]
dfs = [pd.read_html(html)[0] for html in html_pages]

# Bind the 2015 table (first title above) to `df_train`, the name the
# cleaning cells below operate on.
df_train = dfs[0]



In [0]:
# Cleaning the training set.
import re


In [0]:
# A function to fix the column names

# Maps cleaned column names (as produced by `fix_names`) to the short names
# used in the rest of the notebook.
names_dict = dict(
    polling_firm="source",
    last_dateof_polling="date",
    samplesize="sample_size",
    marginof_error="error",
    cons="cpc",
    liberal="lpc",
    green="gpc",
    polling_method="method",
)

def fix_names(input_df, names_dict):
    """Return a copy of ``input_df`` with normalised, renamed columns.

    Every column label is lower-cased and reduced to its runs of the
    letters a-z, joined with underscores (digits, punctuation and spaces
    act as separators). The normalised labels are then passed through
    ``names_dict`` (old name -> new name). ``input_df`` is not modified.
    """
    letters = "[a-z]+"

    renamed = input_df.copy()
    renamed.columns = [
        "_".join(re.findall(letters, label.lower()))
        for label in renamed.columns
    ]
    return renamed.rename(columns=names_dict)


In [0]:
# Apply the renaming to the training set and inspect the resulting labels.
df_train = fix_names(df_train, names_dict)
df_train.columns


In [0]:
# Keep only the columns used in the rest of the analysis
to_keep = ['source', 'date', 'lpc', 'cpc', 'ndp', 'bq', 'gpc', 'method']

df_train = df_train.loc[:, to_keep]


In [0]:
# What does the training set look like?
# (The column selection was already applied in the previous cell.)
df_train.head()


In [0]:
# Let's store and remove the election results.
# The result row is picked by position, so drop it by that row's own label
# to keep both operations pointing at the same row.
results_2015 = df_train.iloc[1]
df_train = df_train.drop(index=results_2015.name).dropna()


In [0]:
# Let's deal with missing values: drop every row with at least one NaN
df_train = df_train.dropna()


In [0]:
# What about the data types? Show the columns still stored as generic objects.
df_train.select_dtypes(include=['object'])


In [0]:
# Let's fix that date variable by parsing it into datetimes
df_train['date'] = pd.to_datetime(df_train['date'])
df_train.sample(3)


In [0]:
# As we mentioned, most algorithms require the data to be in long-format:
# one row per (poll, party) with the party's share as the value.
parties = ["lpc", "cpc", "ndp", "bq", "gpc"]

df_train = df_train.melt(
    id_vars=['date', 'source', 'method'],
    value_vars=parties,
    var_name='party',
    value_name='share',
)

df_train.head()


Let's do some more exploration and see whether polls actually improve as we get closer to election day.


In [0]:
# We need to merge the outcome of the election back.
# Select the party entries of the stored result row by label (rather than by
# position) so the selection does not depend on the column order.
targets = results_2015[parties].reset_index()

targets.columns = ['party', 'outcome']
targets['outcome'] = targets.outcome.astype('float')

# `party` is the only column the two frames share; name it explicitly.
df_train = df_train.merge(targets, on='party')
df_train.head()


In [0]:
# Does time have an impact on the error of pollsters?
# Absolute poll error, averaged per calendar day.
df_train['error'] = abs(df_train['share'] - df_train['outcome'])
df_train = df_train.set_index('date')
df_train['error'].resample('D').mean().plot()


In [0]:
# What about the data collection method? Count the raw labels.
df_train['method'].value_counts()


In [0]:
# Let's use some regex to do an initial cleaning: strip parenthesised text,
# slashes, spaces and the word "rolling".
regex = r"\(.*\)|/| |rolling"
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern
# as a literal string by default, which would leave the labels untouched.
df_train['method'] = df_train.method.str.replace(regex, "", regex=True)
df_train['method'].value_counts()


In [0]:
# Let's group these even further: lower-case and keep the first three letters
df_train['method'] = df_train['method'].str.lower().str[:3]
df_train['method'].value_counts()


In [0]:
# Let's use seaborn this time, as we now have a long dataset, and see if
# there is an observable difference between the data collection methods
import seaborn as sns
sns.violinplot(
    data=df_train,
    x="method",
    y="error",
    split=True,
    inner="quart",
)


## Preparing the test set


In [0]:
# Now that we have some intuition about 2015!
# We need to prepare our test set and verify it has the same form as the train set.
# NOTE(review): `df` is not referenced again in the visible cells; the next
# cell starts from `new_df` — confirm how `new_df` is derived from `df`.
df = pd.read_csv("national_polls_2019.csv", parse_dates=['date'])


In [0]:
# NOTE(review): `new_df` is not defined in any visible cell, so this cell
# fails on a fresh kernel. Presumably it is a wide 2019 poll table derived
# from `df` — confirm and restore its definition before this cell.
df_test = new_df.stack()
df_test.name = 'share'
df_test = df_test.reset_index().set_index('date')

# 2019 outcome per party, merged below on the columns shared with df_test.
data_2019 = {
    "party": ["lpc", "cpc", "bq", "ndp", "gpc"],
    "outcome": [33.1,34.4, 7.7, 15.9, 6.5],
}

df_test = df_test.reset_index().merge(pd.DataFrame(data_2019)).set_index('date')
df_test['error'] = abs(df_test.share - df_test.outcome)

# Compare as plain lists: an element-wise `==` between Index objects of
# different lengths raises instead of evaluating to False.
list(df_test.columns) == list(df_train.columns)


In [0]:
# Let's create a function to clean the method string!
def str_magic(input_series):
    """Collapse free-text polling-method labels into three-letter codes.

    Parenthesised text, slashes, spaces and the word "rolling" are removed;
    the remainder is lower-cased and truncated to its first three characters.

    Parameters
    ----------
    input_series : pandas.Series
        String labels to clean. The series is not modified.

    Returns
    -------
    pandas.Series
        New series with the same index holding the cleaned codes.
    """
    pattern = r"\(.*\)|/| |rolling"
    # regex=True is required: since pandas 2.0 `str.replace` treats the
    # pattern as a literal string by default.
    cleaned = input_series.str.replace(pattern, "", regex=True)
    return cleaned.str.lower().str[:3]

# Replace the raw method labels of the test set with their cleaned codes.
df_test['method'] = str_magic(df_test['method'])


## Feature Creation


In [0]:
# We need to prepare our features
# Election dates as ISO strings; they are parsed with pd.to_datetime in add_days.
election_day_2015 = "2015-10-19"
election_day_2019 = "2019-10-21"

def add_days(df, election_day):
    """Return a copy of ``df`` with a ``days`` column: days left until the election.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding datetimes under the name ``date``, either as its index
        or as a regular column. The frame is not modified.
    election_day : str or datetime-like
        Election date, parsed with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an integer ``days`` column
        (``election_day - date`` in whole days).
    """
    out = df.copy()
    # reset_index() exposes `date` as a column whether it was the index or
    # already a regular column.
    delta = pd.to_datetime(election_day) - out.reset_index()['date']
    # Give the differences the frame's own index so the assignment below is
    # row-for-row, even when the index contains repeated dates.
    delta.index = out.index
    out['days'] = delta.dt.days
    return out

# Each set is measured against its own election day.
df_train = add_days(df_train, election_day_2015)
df_test = add_days(df_test, election_day_2019)



In [0]:
# One-Hot Encoding
# Find the group with the most counts: it is the one we will remove
method_counts = df_train['method'].value_counts()
method_counts.plot(kind='barh')


In [0]:
# Let's drop the most common value ('tel'): it becomes the reference category
train_dummies = pd.get_dummies(df_train['method']).drop(columns='tel')
df_train = pd.concat([df_train, train_dummies], axis=1)

# Give the test set exactly the dummy columns seen in training: a method
# missing from the test polls becomes an all-zero column, and a method unseen
# in training is dropped, so `X_vars` can always be selected from both frames.
test_dummies = pd.get_dummies(df_test['method']).reindex(
    columns=train_dummies.columns, fill_value=0
)
df_test = pd.concat([df_test, test_dummies], axis=1)

# Target and feature columns used by the models
y_var = 'outcome'
X_vars = ['share', 'days', 'ivr', 'onl']

predictions = []


## Model Training

In [0]:
# Now that we have our train and test sets let's train our models

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import pickle

# A fixed random_state makes the random forest, and therefore the pickled
# model and its predictions, reproducible from one run to the next.
models = [
    LinearRegression(),
    RandomForestRegressor(random_state=42),
]


In [0]:
# Fit, predict, and save your models
predictions = []  # reset so re-running this cell does not append duplicates
for i, model in enumerate(models):
    model.fit(df_train[X_vars], df_train[y_var])
    predictions.append(model.predict(df_test[X_vars]))
    # The context manager closes the file even if pickling fails.
    with open(f"model_{i}.pkl", 'wb') as model_file:
        pickle.dump(model, model_file)

predictions[0]

In [0]:
# Load a saved model from disc and make a prediction
input_date = '2019-09-20'

file_name = "model_0.pkl"
# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code. The context manager makes sure the file handle is closed.
with open(file_name, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict for the polls of the selected day and compare with the raw shares.
predictions = loaded_model.predict(df_test.loc[input_date,X_vars])
results = df_test.loc[input_date, [y_var] + ["party", "share"]].assign(model_0=predictions)
results['abs_e_poll'] = abs(results.outcome - results.share)
results['abs_e_model_0'] = abs(results.outcome - results.model_0)


In [0]:
# Did our model beat the polls?
# Total absolute error of the raw polls vs. the model, over all parties.
is_abs_error = results.columns.str.contains('abs_e')
print(results.loc[:, is_abs_error].sum())


In [0]:
# Bonus - Packaging
## > Let's go to your terminal!