In [None]:
def nClusters(n, dataset):
    """Fit a KMeans model with ``n`` clusters and label every row of ``dataset``.

    Args:
        n (int): Number of clusters to fit.
        dataset (pd.DataFrame): Feature table to cluster (for example the
            output of a PCA dimensionality reduction).

    Returns:
        tuple: ``(clusters_df, model)``. ``clusters_df`` is ``dataset`` with an
        extra ``'CLUSTER ID'`` column holding the label assigned to each row;
        ``model`` is the fitted KMeans estimator.

    Side effects:
        Prints how many rows were assigned to each cluster.
    """
    # Fixed seed so repeated runs give the same partition; max_iter keeps the
    # library default.
    model = KMeans(n_clusters = n, random_state = 42)
    model.fit(dataset)

    # pd.concat(axis=1) joins on index labels, not on position. Keying the
    # labels by the dataset's own index keeps row i paired with label i even
    # when the dataset index is not the default 0..N-1 (e.g. after filtering).
    labels = pd.Series(model.labels_, index = dataset.index)
    clusters_df = pd.concat([dataset, labels], axis = 1)
    clusters_df.columns = np.concatenate((dataset.columns.values, ['CLUSTER ID']), axis = 0)

    print('The model assigned theese values:')
    print(clusters_df['CLUSTER ID'].value_counts())

    return clusters_df, model
    
def elbobMethod(max_clusters, dataset=None):
    """Plot KMeans inertia against the number of clusters (elbow method).

    One KMeans model is fitted for each cluster count from 2 up to
    ``max_clusters + 1`` inclusive, i.e. ``max_clusters`` candidates in total,
    and the inertia of every fit is drawn as a connected scatter plot.

    Args:
        max_clusters (int): How many candidate cluster counts to evaluate,
            starting at 2.
        dataset: Data passed to ``KMeans.fit``. When omitted, the module-level
            ``dataset_blood_scaled`` is used, so existing one-argument calls
            behave as before.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Fall back to the notebook-level dataset only when the caller gave none.
    data = dataset_blood_scaled if dataset is None else dataset

    n_clust = list(range(2, max_clusters + 2))

    elbow_points = []
    for num in n_clust:
        kmeans = KMeans(n_clusters = num, random_state = 42)
        kmeans.fit(data)
        elbow_points.append(kmeans.inertia_)

    plt.plot(n_clust, elbow_points, 'bo-')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.show()
    
    
def getMeansFromCluster(assigned_df, n_clusters, features=None):
    """Return the per-cluster mean of each feature.

    Args:
        assigned_df (pd.DataFrame): Table holding the feature columns plus a
            ``'CLUSTER ID'`` column with the cluster assigned to each row.
        n_clusters (int): Number of clusters; ids ``0 .. n_clusters - 1`` are
            written to the ``'CLUSTER ID'`` column of the result.
        features (iterable of str, optional): Columns to average. Defaults to
            the 17 credit-card features this function originally hard-coded.

    Returns:
        pd.DataFrame: A ``'CLUSTER ID'`` column followed by one column of
        means per feature, in the order given. The id column and the means
        are joined on index, so the row with index ``i`` holds the means of
        the cluster whose id equals ``i``.
    """
    if features is None:
        features = [
            'BALANCE',
            'BALANCE_FREQUENCY',
            'PURCHASES',
            'ONEOFF_PURCHASES',
            'INSTALLMENTS_PURCHASES',
            'CASH_ADVANCE',
            'PURCHASES_FREQUENCY',
            'ONEOFF_PURCHASES_FREQUENCY',
            'PURCHASES_INSTALLMENTS_FREQUENCY',
            'CASH_ADVANCE_FREQUENCY',
            'CASH_ADVANCE_TRX',
            'PURCHASES_TRX',
            'CREDIT_LIMIT',
            'PAYMENTS',
            'MINIMUM_PAYMENTS',
            'PRC_FULL_PAYMENT',
            'TENURE',
        ]

    # One grouped pass over all requested columns instead of one per column.
    means = assigned_df.groupby(['CLUSTER ID'])[list(features)].mean()

    # The unnamed Series becomes column 0 in the concat, renamed just below.
    cluster_ids = pd.Series(list(range(n_clusters)))
    statistics = pd.concat([cluster_ids, means], axis=1)
    statistics = statistics.rename(columns = {0:'CLUSTER ID'})
    return statistics



def plotNdimensions(n, n_clusters, dataset, model, model_type):
    """Plot a clustering result in 2 or 3 principal components.

    The data is projected onto ``n`` principal components with a fresh PCA and
    each point is coloured by the label stored in ``model.labels_``.

    Args:
        n (int): Number of dimensions to plot; only 2 and 3 are supported.
        n_clusters (int): Number of clusters, used only in the plot title.
        dataset: Feature matrix that the clustering model was fitted on
            (anything ``PCA.fit_transform`` accepts).
        model: Fitted clustering model; only its ``labels_`` attribute is read.
        model_type (str): Name of the clustering method, used only in the
            plot title.

    Returns:
        None. For any other ``n`` a message is printed and nothing is
        computed or plotted.
    """
    # Reject unsupported dimensionalities before doing any PCA work.
    if n not in (2, 3):
        print("The human brain can only interpret 2 or 3 dimensions")
        return

    plt.rcParams['figure.figsize'] = [10, 7]

    # Project to n components; fit_transform both fits and projects, so no
    # separate fit call is needed.
    pca_final = PCA(n_components = n)
    components = pca_final.fit_transform(dataset)

    # Auxiliary frame for plotting: components are named '1'..'n'.
    visualization = pd.DataFrame(components, columns = [str(i + 1) for i in range(n)])
    visualization['CLUSTER ID'] = model.labels_

    if n == 3:
        fig = plt.figure()
        ax = fig.add_subplot(projection='3d')
        x = visualization['1'].values.astype(float)
        y = visualization['2'].values.astype(float)
        z = visualization['3'].values.astype(float)

        ax.scatter(x, y, z, c=visualization["CLUSTER ID"], s=40)

        plt.title(f'3D visualization for: {n_clusters} clusters, using: {model_type} clustering')
        # Hide the tick labels on all three axes.
        ax.set_yticklabels([])
        ax.set_xticklabels([])
        ax.set_zticklabels([])

        ax.set_xlabel('PC1', fontweight ='bold')
        ax.set_ylabel('PC2', fontweight ='bold')
        ax.set_zlabel('PC3', fontweight ='bold')

        plt.show()

    else:
        sns.scatterplot(x='1',y='2',hue='CLUSTER ID',legend='full',data=visualization).set(title=f'2D visualization for: {n_clusters} clusters, using: {model_type} clustering')