In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
import os
import seaborn as sns
from scipy import stats
import locale
locale.setlocale(locale.LC_ALL, '')

from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


#Functions we created:
from modules import dem_fx
from modules import transaction_fx as trns
from modules import plot_functions as plt_fx
from modules import ml_functions as ml_fx

os.getcwd()

#Allows reload of modules:
%load_ext autoreload
%autoreload

In [None]:
# Load the household demographics table from a tab-separated file and preview its first rows.
hh_demographic_fxd = pd.read_csv("saved_structures/hh_demographic_fix_hhcomp.csv", sep = '\t')
hh_demographic_fxd.head()

In [None]:
# Load the product table from a tab-separated file and preview its first rows.
updated_prod = pd.read_csv("saved_structures/updated_prod.csv", sep = '\t')
updated_prod.head()

In [None]:
# Load the transactions table from a tab-separated file and preview its first rows.
# Later cells read its `household_key`, `DEPARTMENT` and `QUANTITY` columns.
trans_clean = pd.read_csv("saved_structures/trans_clean.csv", sep = '\t')
trans_clean.head()

In [None]:
# Load the per-household participation table (tab-separated).
# Later cells read its `household_key` and `participation_length` columns.
participation_per_hh = pd.read_csv("saved_structures/participation_per_hh.csv", sep = '\t')

In [None]:
# Load the per-household cart table (tab-separated) and preview its first rows.
weekly_cart_df = pd.read_csv("saved_structures/weekly_cart_df.csv", sep = '\t')
weekly_cart_df.head()

In [None]:
# Order the rows by household so that row i of the matrix below always refers to
# the same household as row i of `weekly_cart_df`.
weekly_cart_df = weekly_cart_df.sort_values(by="household_key")

# Feature matrix: every column except the first one, which is dropped by position.
# NOTE(review): the first column is presumably `household_key` -- confirm against the CSV.
weekly_cart_np = weekly_cart_df.to_numpy()[:, 1:]

In [None]:
from sklearn.decomposition import PCA

# Project the cart feature matrix onto its first two principal components.
pca = PCA(n_components=2).fit(weekly_cart_np)
weekly_cart_np_pca = pca.transform(weekly_cart_np)

print(weekly_cart_np_pca.shape)
# The transposed projection has two rows: component 1 (x) and component 2 (y).
plt.scatter(*weekly_cart_np_pca.T)

In [None]:
from sklearn.cluster import KMeans

# Cluster households into three groups on the full (un-projected) cart features.
# `n_init` is pinned explicitly: its library default differs between scikit-learn
# releases, so leaving it implicit makes the clustering depend on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0, algorithm="elkan").fit(weekly_cart_np)
print(kmeans.labels_)

In [None]:
# Same 2-D PCA projection as before, with each point coloured by its k-means cluster label.
plt.scatter(weekly_cart_np_pca[:,0], weekly_cart_np_pca[:,1], c=kmeans.labels_)

In [None]:
# kmeans.labels_[i] is the cluster of row i of `weekly_cart_np`, which was built
# row-for-row from `weekly_cart_df`. The household each label belongs to is therefore
# the `household_key` stored in that row, not the row position 0..n-1.
hh_key = weekly_cart_df["household_key"].tolist()

hh_to_clust = pd.DataFrame({"household_key": hh_key, "clust": kmeans.labels_.astype(int)})

# Left-merge on the `household_key` column of both frames, so every demographic row is
# kept and households without a cluster get NaN in `clust`.
# (`DataFrame.join(on=...)` would match against the positional index of `hh_to_clust`.)
hh_demographic_clust = hh_demographic_fxd.merge(hh_to_clust, on="household_key", how="left")
hh_demographic_clust.head()

In [None]:
def create_weekly_dep_df(trans_clean, participation_per_hh):
    """Build a household x department table of quantity per unit of participation.

    For every (department, household) pair the purchased ``QUANTITY`` is summed and
    divided by that household's ``participation_length``. Pairs with no purchases
    are reported as 0.0.

    Parameters
    ----------
    trans_clean : pandas.DataFrame
        Transactions with at least the columns ``household_key``, ``DEPARTMENT``
        and ``QUANTITY``. Rows whose ``DEPARTMENT`` is missing are ignored.
    participation_per_hh : pandas.DataFrame
        Indexed by household key, with a ``participation_length`` column
        (presumably a number of weeks -- confirm against the code producing it).

    Returns
    -------
    pandas.DataFrame
        Index ``household_key`` (sorted unique keys of ``trans_clean``); one
        ``<DEPARTMENT>_QUANT`` column per department, in order of first appearance
        in ``trans_clean``.

    Raises
    ------
    KeyError
        If a purchasing household is missing from ``participation_per_hh``.
    """
    # Aggregate only the column that is needed; summing every column is wasteful
    # and can fail on non-numeric columns.
    quantity = trans_clean.groupby(['DEPARTMENT', 'household_key'])['QUANTITY'].sum()

    households = trans_clean['household_key'].sort_values().unique()
    # groupby drops missing departments, so they must not become columns either.
    departments = trans_clean['DEPARTMENT'].dropna().unique()

    # Rows: households with at least one purchase; columns: departments.
    per_household = quantity.unstack('DEPARTMENT')

    # Strict label lookup: a household without a participation record raises KeyError
    # instead of silently turning into zeros.
    lengths = participation_per_hh.loc[per_household.index, 'participation_length']
    weekly_dep_df = per_household.div(lengths, axis=0)

    # Restore the full household index and the first-appearance department order.
    weekly_dep_df = weekly_dep_df.reindex(index=households, columns=departments)
    weekly_dep_df.columns = [f"{dep}_QUANT" for dep in departments]
    weekly_dep_df.index.name = 'household_key'

    #Fill NaN values with 0.0:
    return weekly_dep_df.fillna(0.0)

In [None]:
# The function looks up participation_length by household label, so the participation
# table is re-indexed by `household_key` before being passed in.
weekly_dep_df = create_weekly_dep_df(trans_clean, participation_per_hh.set_index("household_key"))

In [None]:
# Preview the first rows of the household x department table.
weekly_dep_df.head()

In [None]:
# Plain NumPy matrix of the department features (all columns; the household key lives in the index).
weekly_dep_np = weekly_dep_df.to_numpy()

In [None]:
from sklearn.decomposition import PCA

# Project the department feature matrix onto its first two principal components.
# Note: this rebinds `pca`, replacing the model fitted on the cart features.
pca = PCA(n_components=2).fit(weekly_dep_np)
weekly_dep_np_pca = pca.transform(weekly_dep_np)

print(weekly_dep_np_pca.shape)
# The transposed projection has two rows: component 1 (x) and component 2 (y).
plt.scatter(*weekly_dep_np_pca.T)