### CRISP-DM Method
- Barry - Business Understanding
- Drove - Data Understanding
- Directly to the - Data Prep
- Medical - Modelling
- Emergency  - Evaluation
- Department - Deployment

# 1.  Business Understanding
- Forecasting transactions
- Likely regression
- Data for 3 years
- Advised data quality is okay

# 2. Data Understanding

In [None]:
import pandas as pd

In [None]:
# Load the transaction data from regression.csv (path is relative to the
# working directory) into a DataFrame.
df = pd.read_csv('regression.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Preview the last rows of the raw data.
df.tail()

In [None]:
# Summarise the columns: names, non-null counts and dtypes.
df.info()

In [None]:
# For every column, print its name, the number of distinct values and the
# distinct values themselves.
for col in df.columns:
    distinct = df[col].unique()
    print(col, len(distinct), distinct)

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the data.
df.describe()

In [None]:
# Column dtypes as loaded from the CSV.
df.dtypes

## Visualize Data

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Distribution of Amount for each Account Type.
plt.figure(figsize=(20, 6))
ax = sns.violinplot(data=df, x='Account Type', y='Amount')
ax.set_title('Account Type ViolinPlot')
plt.show()

In [None]:
# Distribution of Amount per account number, Liability accounts only.
plt.figure(figsize=(20, 6))
ax = sns.violinplot(
    data=df[df['Account Type'] == 'Liability'],
    x='Account',
    y='Amount',
)
ax.set_title('Liability ViolinPlot')
plt.show()

In [None]:
# Distribution of Amount per account description, Revenue accounts only.
# The title names the account type that is actually plotted (it previously
# said 'Liability', copied from the cell above).
plt.figure(figsize=(20,6))
sns.violinplot(x='Account Description', y='Amount', data=df[df['Account Type']=='Revenue']).set_title('Revenue ViolinPlot')
plt.show()

## Review Trends

In [None]:
# Re-check the column layout before deriving date fields.
df.head()

In [None]:
# Lookup from three-letter month abbreviation to calendar month number
# (Jan -> 1 ... Dec -> 12). Built from the ordered labels so the numbering
# cannot drift out of sequence.
monthmap = {
    label: number
    for number, label in enumerate(
        (
            'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
        ),
        start=1,
    )
}

In [None]:
# Spot-check the lookup: displays the number mapped to 'Jan'.
monthmap['Jan']

In [None]:
# Translate each Month abbreviation into its month number via monthmap.
# An abbreviation missing from monthmap raises KeyError.
df['Period'] = df['Month'].apply(monthmap.__getitem__)

In [None]:
# Constant day-of-month (the 1st), used only to assemble a full date below.
df['Day'] = 1

In [None]:
# Assemble a 'Year-Period-Day' string per row (form 'YYYY-M-1'); it is parsed
# into a datetime in the next cell.
df['Date'] = df['Year'].astype(str) + '-' + df['Period'].astype(str) + '-' + df['Day'].astype(str)

In [None]:
# Parse the assembled date strings into pandas datetime values.
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# Confirm the dtypes of the new Period, Day and Date columns.
df.dtypes

In [None]:
# Amount over time for all Revenue accounts, one line per account
# description; estimator=None plots every observation without aggregating.
plt.figure(figsize=(20, 6))
ax = sns.lineplot(
    data=df[df['Account Type'] == 'Revenue'],
    x='Date',
    y='Amount',
    hue='Account Description',
    estimator=None,
)
ax.set_title('Seasonal Sales')
plt.show()

In [None]:
# Amount over time for the 'Product Sales' account description only.
plt.figure(figsize=(20, 6))
ax = sns.lineplot(
    data=df[df['Account Description'] == 'Product Sales'],
    x='Date',
    y='Amount',
    hue='Account Description',
    estimator=None,
)
ax.set_title('Seasonal Sales')
plt.show()

In [None]:
# Amount over time for the 'Service Revenue' account description only.
plt.figure(figsize=(20, 6))
ax = sns.lineplot(
    data=df[df['Account Description'] == 'Service Revenue'],
    x='Date',
    y='Amount',
    hue='Account Description',
    estimator=None,
)
ax.set_title('Seasonal Sales')
plt.show()

## Correlation

In [None]:
# Pairwise correlation of the numeric columns only. df also holds string
# columns (e.g. Month, Account Type) and the datetime Date column at this
# point; recent pandas versions raise on those instead of silently skipping
# them, so the numeric subset is selected explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# List the distinct account descriptions.
df['Account Description'].unique()

In [None]:
# Preview a one-hot encoding of Account (displayed only; not stored).
pd.get_dummies(df['Account'])

In [None]:
# Map each row label to a single-entry dict {account number: amount}.
# Only Account and Amount are needed, so the two columns are walked directly
# instead of joining a one-hot frame and iterating whole rows with iterrows.
corrdict = {
    key: {int(account): amount}
    for key, account, amount in zip(df.index, df['Account'], df['Amount'])
}

In [None]:
# One row per original record and one column per account number; cells with
# no amount for that account are filled with 0.
corrdf = pd.DataFrame.from_dict(corrdict).T.fillna(0) 

In [None]:
# Pairwise correlation between the per-account amount columns.
corrdf.corr()

In [None]:
# Heatmap of the account-to-account correlation matrix.
plt.figure(figsize=(20, 6))
account_corr = corrdf.corr()
ax = sns.heatmap(account_corr)
ax.set_title('Account Correlation')
plt.show()

In [None]:
# Inspect the records for account 3000000.
df[df['Account']==3000000]

In [None]:
# Inspect the records for account 4000001.
df[df['Account']==4000001]

# 3. Data Preparation

In [None]:
import numpy as np

In [None]:
# One chart per account: median Amount per Date, coloured by account
# description.
for account in df['Account'].unique():
    account_rows = df[df['Account'] == account]
    plt.figure(figsize=(20, 6))
    ax = sns.lineplot(
        data=account_rows,
        x='Date',
        y='Amount',
        hue='Account Description',
        estimator=np.median,
    )
    ax.set_title(f'{account} by Month')
    plt.show()

In [None]:
# Disabled filter kept for reference: when enabled it removes account 3000001
# from df.
#df = df[df['Account']!=3000001]

In [None]:
# List the distinct account numbers present in df.
df['Account'].unique()

## Convert Fields to Correct Data Type

In [None]:
# Dtypes before converting the identifier-like columns.
df.dtypes

In [None]:
# Prefix the account number with 'ACC' so Account becomes a string label
# rather than a number (it is one-hot encoded by pd.get_dummies further down).
df['Account'] = 'ACC' + df['Account'].astype(str)

In [None]:
# Check the relabelled Account values.
df.head()

In [None]:
# Dtypes after the Account conversion.
df.dtypes

In [None]:
# Cast Year to string so it is handled as a label rather than a number
# (it is one-hot encoded by pd.get_dummies further down).
df['Year'] = df['Year'].astype(str)

In [None]:
# Dtypes after the Year conversion.
df.dtypes

### Drop Analysis Fields

In [None]:
# Remove the helper columns that were only needed for the trend charts.
df.drop(columns=['Period', 'Day', 'Date'], inplace=True)

In [None]:
# Confirm the helper columns are gone.
df.dtypes

In [None]:
# Number of distinct Account values (a missing value, if any, counts as one).
df['Account'].nunique(dropna=False)

In [None]:
# Number of distinct Account Description values (a missing value, if any,
# counts as one).
df['Account Description'].nunique(dropna=False)

In [None]:
# Temporary key combining Account and Account Description; its distinct
# count is compared with the two counts above.
df['AccountVal'] = df['Account'] + df['Account Description']

In [None]:
# Check the combined AccountVal values.
df.head()

In [None]:
# Number of distinct (Account + Account Description) combinations.
df['AccountVal'].nunique(dropna=False)

In [None]:
# Drop the description text and the temporary combined key before encoding.
df.drop(columns=['Account Description', 'AccountVal'], inplace=True)

In [None]:
# Check the remaining columns before one-hot encoding.
df.head()

In [None]:
# Preview the one-hot encoded frame (displayed only; not stored).
pd.get_dummies(df)

In [None]:
# Replace df with its one-hot encoded version.
df = pd.get_dummies(df)

In [None]:
# Check the encoded columns.
df.head()

In [None]:
# Dtypes of the encoded frame that feeds the models.
df.dtypes

# 4. Modelling

In [None]:
# Target is Amount; every other column is a feature.
y = df['Amount']
X = df.drop(columns='Amount')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=1234)

In [None]:
# Print the shapes of the four splits on one line, in the order
# X_train, X_test, y_train, y_test.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Import Dependencies

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
# One single-step pipeline per candidate regressor, keyed by a short name.
# Every estimator gets the same random_state so runs are repeatable.
pipelines = {
    name: make_pipeline(estimator_cls(random_state=1234))
    for name, estimator_cls in (
        ('rf', RandomForestRegressor),
        ('gb', GradientBoostingRegressor),
        ('ridge', Ridge),
        ('lasso', Lasso),
        ('enet', ElasticNet),
    )
}

In [None]:
# List RandomForestRegressor's parameters and defaults, for reference when
# writing the search grid below.
RandomForestRegressor().get_params()

In [None]:
# Hyperparameter search space for each pipeline in `pipelines`, keyed by the
# same short names. Parameter keys use scikit-learn's
# '<step name>__<parameter>' form, where the step name is the lower-cased
# estimator class name assigned by make_pipeline.
hypergrid = {
    'rf': {
        'randomforestregressor__min_samples_split':[2,4,6],
        'randomforestregressor__min_samples_leaf':[1,2,3]
    },
    'gb':{
        # GradientBoostingRegressor's `alpha` only applies to the 'huber' and
        # 'quantile' losses; with the default loss every alpha candidate fits
        # the same model, so the learning rate is searched instead.
        'gradientboostingregressor__learning_rate':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    },
    'ridge':{
        'ridge__alpha':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    },
    'lasso':{
        'lasso__alpha':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    },
    'enet':{
        'elasticnet__alpha':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    }
}

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import NotFittedError

In [None]:
# Grid-search every pipeline with 10-fold cross-validation and keep the
# fitted searches in fit_models, keyed by algorithm name.
fit_models = {}
for algo, pipeline in pipelines.items():
    model = GridSearchCV(pipeline, hypergrid[algo], cv=10, n_jobs=-1)
    print('Starting training for {}.'.format(algo))
    try:
        model.fit(X_train, y_train)
    except Exception as e:
        # Best-effort training: fit() reports problems with ordinary
        # exceptions (not NotFittedError, which is raised when an unfitted
        # model is *used*), so catch broadly here, report the failure and
        # carry on with the remaining algorithms.
        print(repr(e))
    else:
        fit_models[algo] = model
        print('{} has been successfully fit.'.format(algo))

In [None]:
# Sanity check: predict on the test features with the fitted ridge search.
fit_models['ridge'].predict(X_test)

# 5. Evaluation

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error

In [None]:
# Score every fitted search on the held-out test set and print its R2 and
# mean absolute error.
for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    r2 = r2_score(y_test, yhat)
    mae = mean_absolute_error(y_test, yhat)
    print(f'{algo} scores - R2:{r2} MAE:{mae}')

In [None]:
# Use the random-forest search as the model to deploy.
# NOTE(review): the choice is hard-coded; presumably it was made from the
# scores printed above - confirm it still holds after re-training.
best_model = fit_models['rf']

# 6. Deployment

In [None]:
# Watson Machine Learning service credentials. This must be a dict: the
# deployment review cell reads creds['instance_id'], which a bare
# {"..."} set literal cannot support.
# NOTE(review): 'apikey' and 'url' are the usual WML credential keys -
# confirm against the service credentials issued for your instance.
# Do not commit real credentials to version control.
creds = {
    "apikey": "YOUR API KEY HERE",
    "instance_id": "YOUR INSTANCE ID HERE",
    "url": "YOUR SERVICE URL HERE",
}

## Connect to Watson Machine Learning

In [None]:
!pip install watson_machine_learning_client

In [None]:
from watson_machine_learning_client import WatsonMachineLearningAPIClient

In [None]:
# Create the Watson Machine Learning API client from the credentials.
wml_client = WatsonMachineLearningAPIClient(creds)

In [None]:
# Specify Model Constants
MODEL_NAME = 'Transaction Forecast'
DEPLOYMENT_NAME = 'Transaction Forecast - Deployment'
BEST_MODEL = best_model

In [None]:
# Model Schema
model_props = {
    wml_client.repository.ModelMetaNames.NAME: "{}".format(MODEL_NAME)
}

In [None]:
# Publish the chosen model to the WML repository together with its metadata
# and the training data/target.
# NOTE(review): `model` is the GridSearchCV object while `pipeline` is the
# original pipelines['rf'] entry that was passed to the search - confirm this
# is the combination the client expects.
published_model_details = wml_client.repository.store_model(
    model=BEST_MODEL,
    meta_props=model_props,
    training_data=X_train,
    training_target=y_train,
    pipeline=pipelines['rf'],
)

In [None]:
# Extract the stored model's UID from the publish response.
model_uid = wml_client.repository.get_model_uid(published_model_details)

In [None]:
# Display the UID of the stored model.
model_uid

In [None]:
# Create a deployment for the stored model.
# NOTE(review): asynchronous=False presumably blocks until the deployment is
# ready - confirm in the client documentation.
deployment = wml_client.deployments.create(artifact_uid=model_uid, name=DEPLOYMENT_NAME, asynchronous=False)

### Deployment Review

In [None]:
# Get deployment stuff
iam_token = wml_client.wml_token
ml_instance_id = creds['instance_id']
url = deployment['entity']['scoring_url']

In [None]:
# Display the full deployment response.
deployment

In [None]:
# Feature names as a plain list, in the form sent as the payload's 'fields'.
X_test.columns.to_numpy().tolist()

In [None]:
# First test row as a plain list, i.e. one entry of the payload's 'values'.
X_test.iloc[0].to_numpy().tolist()

In [None]:
import urllib3, requests, json

In [None]:
# HTTP headers for the scoring call: JSON body, bearer token, target instance.
header = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer ' + iam_token,
    'ML-Instance-ID': ml_instance_id,
}
# Scoring payload: feature names plus every test row as a list of values.
payload = {
    'fields': X_test.columns.to_numpy().tolist(),
    "values": X_test.to_numpy().tolist(),
}

In [None]:
# Display the scoring payload.
payload

In [None]:
# POST the payload as JSON to the deployment's scoring URL.
response_scoring = requests.post(url, json=payload, headers=header)

In [None]:
# Display the response object (shows the HTTP status code).
response_scoring

In [None]:
# Display the raw response body.
response_scoring.text

In [None]:
# Decode the JSON response body into Python objects.
predictions = json.loads(response_scoring.text)

In [None]:
import numpy as np

In [None]:
# Remove length-1 axes from the returned 'values'.
# presumably each scored row comes back as a single-element list - confirm
# against the raw response above.
pred_values = np.squeeze(predictions['values'])

In [None]:
# Display the predicted values.
pred_values

In [None]:
# Re-attach the true Amount to the test features (joined on the row index).
preddf = X_test.join(y_test)

In [None]:
# Add the predictions as a column. The array is assigned by position, so this
# relies on the scored rows coming back in X_test order.
preddf['Scores'] = pred_values

In [None]:
# Check actuals alongside predictions.
preddf.head()

In [None]:
# Residual per row: actual Amount minus predicted score.
preddf['Diff'] = preddf['Amount'] - preddf['Scores']

In [None]:
# Check the residual column.
preddf.head()

In [None]:
# Save features, actuals, predictions and residuals to results.csv.
preddf.to_csv('results.csv')

In [None]:
# For test rows belonging to account ACC3000000, show which Account Type
# indicator columns are set.
X_test.loc[
    X_test['Account_ACC3000000'] == 1,
    [
        'Account Type_Asset',
        'Account Type_Expense',
        'Account Type_Liability',
        'Account Type_Revenue',
    ],
].head()

In [None]:
X_test.iloc[0].to_numpy().tolist()
