# Train Cluster Model

### Load all required modules including Oracle Connection and Data Processing Functions.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
import os 
import cx_Oracle
import statsmodels.api as sm
import joblib

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# Import the project-local helper modules (`db_connection`, `general_functions`) from
# their own directories, then return to the notebook's working directory.
#
# The starting directory is captured with os.getcwd() instead of the `!cd` / `!pwd`
# shell magics: it behaves the same on every OS and guarantees `state` exists even on
# an unsupported platform (previously that path ended in a NameError at the final
# os.chdir). `state` stays a one-element sequence so `state[0]` still yields the
# starting directory.
state = [os.getcwd()]

try:
    # Update path to where function file resides
    if os.name == 'nt':
        # Load DB Connection File from Windows Machine
        os.chdir(r'Directory name')
        from db_connection import oracle_connection

        # Load function file from Windows Machine
        os.chdir(r'directory name')
        from general_functions import *
    elif os.name == 'posix':
        # Load DB Connection File from Mac Machine
        os.chdir('directory name')
        from db_connection import oracle_connection

        # Load function file from Mac Machine
        os.chdir('directory name')
        from general_functions import *
    else:
        print('No OS!')
finally:
    # Change directory back to the working Jupyter Notebook directory, even when one
    # of the chdir/import steps above raised.
    os.chdir(state[0])

## Create DB Connection String

In [None]:
# Open the Oracle connection for the current operating system.
# On success this leaves `db` (the connection) and `cursor` in the notebook namespace.
# The connection settings (host, port, database, user, password) are read from the
# mapping returned by oracle_connection() for the per-OS credentials file.
if os.name not in ('nt', 'posix'):
    print('No OS!')
elif os.name == 'posix':
    # Update path to where config file resides
    db_creds = os.path.expanduser('~') + 'directory name'
    creds = oracle_connection(db_creds)

    # Connect string has the form host:port/database
    url = creds['host'] + ":" + creds['port']
    url = url + "/" + creds['database']

    # The posix branch passes an explicit client encoding
    db = cx_Oracle.connect(creds['user'], creds['password'], url, encoding = 'UTF-8')
    cursor = db.cursor()
else:
    # Windows ('nt')
    # Update path to where config file resides
    db_creds = os.path.expanduser('~') + 'directory name'
    creds = oracle_connection(db_creds)

    # Connect string has the form host:port/database
    url = creds['host'] + ":" + creds['port']
    url = url + "/" + creds['database']

    db = cx_Oracle.connect(creds['user'], creds['password'], url)
    cursor = db.cursor()

### Send query to Oracle database and return as Pandas DF

In [None]:
# Update schema to your schema
query = """
        """

# Execute the query on the connection that owns `cursor` and load the result into a DataFrame
df = pd.read_sql(sql = query, con = cursor.connection)

In [None]:
# Preview the first five rows of the query result
df.head(n = 5)

In [None]:
# Working frame for training: the query result without the "column" field.
# drop() returns a new DataFrame, so `df` itself is left untouched.
df_tr = df.drop(labels = ["column"], axis = "columns")
df_tr.head(n = 5)

### Data Pre-processing Steps

In [None]:
# Generate Dictionary that contains the column names as key and the corresponsing value to fill missing value in column as value
dtype_dict_value = replace_values(df_tr, char_value = 'Unknown')
df_tr.fillna(value = dtype_dict_value, inplace = True)

# Convert the categories into a cateogry level (aka Encode the Category value)
# Return covnerted DF and the dictionary that contains the key-value pair for columns category code mappings.
df_tr, forward_mapping_dict, inv_mapping_dict, encoder_fit = convert_cat_to_cat_lvl(df_tr, encode_method = 'Numeric')

df_tr[df_tr.isnull().any(axis = 1)]

%store forward_mapping_dict

### Split data into Train and Test dataframes

In [None]:
# Hold out 30% of the encoded rows for testing; the fixed seed keeps the split reproducible
train_df, test_df = train_test_split(
    df_tr,
    test_size = 0.3,
    random_state = 5,
)
print(*[frame.shape for frame in (train_df, test_df)])

### Evaluate the K-Means based on the number of clusters and cost associated with each respective cluster.
#### Look for the inflection point in the graph, or the point where the change in cost between cluster counts is minimized

In [None]:
# Elbow sweep on the full feature set: fit K-Means for k = 1..15 on the training frame
# and record the inertia (cost) reported for each k.
k_list_full, cost_list_full = [], []

for k in range(1, 16):
    kmeans_full = KMeans(n_clusters = k, random_state = 0)
    kmeans_full.fit(train_df)
    labels, inertia = kmeans_full.labels_, kmeans_full.inertia_

    print("K: ", k, "Cost: ", inertia)

    k_list_full.append(k)
    cost_list_full.append(inertia)

# One row per k, used for the elbow plot
k_df_full = pd.DataFrame({'k': k_list_full, 'cost': cost_list_full}, columns = ['k', 'cost'])

In [None]:
# Line plot of cost against k for the full-feature sweep
k_df_full.plot(kind = 'line', x = 'k', y = 'cost', figsize = (18, 16))

### Train K-Means Model

In [None]:
# Final full-feature model: 6 clusters, fixed seed, fit on the training frame.
# fit() returns the estimator itself, so re-binding the name keeps the cell silent.
kmeans_full = KMeans(n_clusters = 6, random_state = 0)
kmeans_full = kmeans_full.fit(train_df)

In [None]:
# Copy of the training frame with the fitted cluster label of each row appended
train_app_df = train_df.assign(CLUSTER = kmeans_full.labels_)
train_app_df.head(n = 5)

In [None]:
# Copy of the test frame with the cluster predicted for each row by the trained model
test_app_df = test_df.assign(CLUSTER = kmeans_full.predict(test_df))
test_app_df.head(n = 5)

In [None]:
# Number of test rows assigned to each cluster
test_app_df.loc[:, 'CLUSTER'].value_counts()

# Perform PCA to reduce the dimensionality of the dataset

In [None]:
# Standardise every column of the encoded frame and keep the fitted scaler for serialisation.
# NOTE(review): the scaler is fit on all of df_tr, i.e. before the train/test split that
# is applied to the PCA output later in the notebook — confirm this is intended.
scaler_fit = StandardScaler().fit(df_tr)

# transform() returns a plain array, so the column names are re-attached here
df_std = pd.DataFrame(scaler_fit.transform(df_tr), columns = df_tr.columns)

df_std.shape

In [None]:
# Project the standardised data onto its first two principal components
pca = PCA(n_components = 2)
pca_fit = pca.fit(df_std)  # fit() returns the estimator itself

principalComponents = pca_fit.transform(df_std)

# Wrap the two component columns in a DataFrame with readable names
pca_df = pd.DataFrame(
    data = principalComponents,
    columns = ['PCA_1', 'PCA_2'],
)

### Split data into Train and Test dataframes

In [None]:
# Same 70/30 split parameters and seed as the full-feature split, applied to the PCA frame
train_pca_df, test_pca_df = train_test_split(
    pca_df,
    test_size = 0.3,
    random_state = 5,
)
print(*[frame.shape for frame in (train_pca_df, test_pca_df)])

### Evaluate the K-Means based on the number of clusters and cost associated with each respective cluster.
#### Look for the inflection point in the graph, or the point where the change in cost between cluster counts is minimized

In [None]:
# Elbow sweep on the PCA-reduced dataset: fit K-Means for k = 1..15 on the training
# frame and record the inertia (cost) reported for each k.
k_list_pca, cost_list_pca = [], []

for k in range(1, 16):
    kmeans_pca = KMeans(n_clusters = k, random_state = 0)
    kmeans_pca.fit(train_pca_df)
    labels, inertia = kmeans_pca.labels_, kmeans_pca.inertia_

    print("K: ", k, "Cost: ", inertia)

    k_list_pca.append(k)
    cost_list_pca.append(inertia)

# One row per k, used for the elbow plot
k_df_pca = pd.DataFrame({'k': k_list_pca, 'cost': cost_list_pca}, columns = ['k', 'cost'])

In [None]:
# Line plot of cost against k for the PCA sweep
k_df_pca.plot(kind = 'line', x = 'k', y = 'cost', figsize = (18, 16))

In [None]:
# Final PCA model: 6 clusters, fixed seed, fit on the PCA training frame.
# fit() returns the estimator itself, so re-binding the name keeps the cell silent.
kmeans_pca = KMeans(n_clusters = 6, random_state = 0)
kmeans_pca = kmeans_pca.fit(train_pca_df)

In [None]:
# Copy of the PCA training frame with the fitted cluster label of each row appended
train_app_pca_df = train_pca_df.assign(CLUSTER = kmeans_pca.labels_)
train_app_pca_df.head(n = 5)

In [None]:
# Copy of the PCA test frame with the cluster predicted for each row by the trained model
test_app_pca_df = test_pca_df.assign(CLUSTER = kmeans_pca.predict(test_pca_df))
test_app_pca_df.head(n = 5)

In [None]:
# Number of PCA test rows assigned to each cluster
test_app_pca_df.loc[:, 'CLUSTER'].value_counts()

In [None]:
# Scatter plot of the PCA test rows in component space, coloured by predicted cluster
ax = test_app_pca_df.plot(x = 'PCA_1', y = 'PCA_2', kind = 'scatter', c = 'CLUSTER', colormap = 'cool', figsize = (18, 16))

# set_facecolor is a method: it has to be called. Assigning to it (`ax.set_facecolor = ...`)
# only overwrote the method with a string and left the background unchanged.
ax.set_facecolor("grey")

### Serialize the model to disk for future use and version control.
#### Update version if necessary

In [None]:
# Make sure the output directory exists first; joblib.dump does not create missing
# parent directories and would otherwise fail with FileNotFoundError on a fresh checkout.
os.makedirs('./Model', exist_ok = True)

# Models
joblib.dump(kmeans_full, './Model/kmeans_dlm_full_v2.0.pkl')
joblib.dump(kmeans_pca, './Model/kmeans_dlm_pca_v2.0.pkl')

# Transformations (fitted scaler and fitted PCA, needed to prepare new data for the PCA model)
joblib.dump(scaler_fit, './Model/kmeans_dlm_scaler_v2.0.pkl')
joblib.dump(pca_fit, './Model/kmeans_dlm_pca_fit_v2.0.pkl')