# In [67]:
import logging
from collections import defaultdict
import matplotlib
matplotlib.use('Agg')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# fmt: on

# Configure the root logger to emit WARNING and above, then create the
# module-level logger used by the plotting code below.
logging.basicConfig(level=logging.WARNING)
LOGGER = logging.getLogger(__name__)


class PySankeyException(Exception):
    """Base class for the errors raised by the Sankey plotting code."""


class NullsInFrame(PySankeyException):
    """Raised when the left or right label data contains null values."""


class LabelMismatch(PySankeyException):
    """Raised when explicitly supplied labels differ from those in the data."""


def check_data_matches_labels(labels, data, side):
    """
    Verify that the supplied labels are exactly the distinct values in data.

    Nothing is checked when ``labels`` is empty.

    Inputs:
        labels = sized iterable of expected labels (list, tuple, set,
            NumPy array, ...); duplicates are ignored
        data = iterable of observed values; for a pandas Series its unique
            values are used
        side = name of the side being checked (e.g. "left"), used only as
            the prefix of the error message
    Output:
        None
    Raises:
        LabelMismatch if the set of labels differs from the set of values in
            data. Each of the two sets is listed in the message only when it
            has at most 20 members.
    """
    if len(labels) == 0:
        return

    if isinstance(data, pd.Series):
        data = data.unique().tolist()
    # Compare as sets whatever container type the caller passed; comparing a
    # tuple or NumPy array directly against a set never compares membership.
    label_set = set(labels)
    data_set = set(data)
    if label_set == data_set:
        return

    # str() every member so non-string labels can be joined, and sort so the
    # message is reproducible.
    msg = "\n"
    if len(label_set) <= 20:
        msg += "Labels: " + ",".join(sorted(map(str, label_set))) + "\n"
    if len(data_set) <= 20:
        msg += "Data: " + ",".join(sorted(map(str, data_set)))
    raise LabelMismatch(
        "{0} labels and data do not match.{1}".format(side, msg)
    )


def _stack_label_patches(dataFrame, labels, labelColumn, weightColumn, side):
    """
    Stack one patch per label from the bottom of the diagram upwards.

    Consecutive patches are separated by a gap of 2% of the column's total
    weight. Returns a tuple ``(widths, top)`` where ``widths`` maps each label
    to a dict with the keys ``side`` (the patch height), "bottom" and "top",
    and ``top`` is the top edge of the last patch (0 when there are no labels).
    """
    gap = 0.02 * dataFrame[weightColumn].sum()
    widths = {}
    top = 0
    for i, label in enumerate(labels):
        LOGGER.debug("Handling %s: %s", i, label)
        size = dataFrame.loc[dataFrame[labelColumn] == label, weightColumn].sum()
        bottom = 0 if i == 0 else top + gap
        top = bottom + size
        widths[label] = {side: size, "bottom": bottom, "top": top}
        LOGGER.debug(
            "%s position of '%s' : %s ", side.capitalize(), label, widths[label]
        )
    return widths, top


def _smooth_edge(start, end):
    """Return the y values of a strip edge easing from start to end."""
    kernel = 0.05 * np.ones(20)
    # Half the samples sit at the left value and half at the right one; two
    # passes of a moving average turn the step into a smooth S-shaped curve.
    edge = np.array(50 * [start] + 50 * [end])
    edge = np.convolve(edge, kernel, mode="valid")
    return np.convolve(edge, kernel, mode="valid")


def sankeyPlot(
    left,
    right,
    leftWeight=None,
    rightWeight=None,
    colorDict=None,
    leftLabels=None,
    rightLabels=None,
    datatitle=0,
    aspect=4,
    rightColor=False,
    fontsize=25,
    figureName=None,
    closePlot=False,
    figSize=(17, 17),
    title=None,
):
    """
    Make Sankey Diagram showing flow from left-->right
    Inputs:
        left = NumPy array of object labels on the left of the diagram
        right = NumPy array of corresponding labels on the right of the diagram
            len(right) == len(left)
        leftWeight = NumPy array of weights for each strip starting from the
            left of the diagram, if not specified 1 is assigned
        rightWeight = NumPy array of weights for each strip starting from the
            right of the diagram, if not specified the corresponding leftWeight
            is assigned
        colorDict = Dictionary of colors to use for each label
            {'label':'color'}; when omitted a seaborn "hls" palette is used
        leftLabels = labels expected on the left; when given they must match
            the values found in left exactly. Labels are always drawn in
            sorted order, bottom to top.
        rightLabels = labels expected on the right; same rules as leftLabels
        datatitle = value inserted into the default title
        aspect = vertical extent of the diagram in units of horizontal extent
        rightColor = If true, each strip in the diagram will be colored
            according to its right label, otherwise according to its left
            label
        fontsize = font size of the label texts
        figureName = if given, the figure is saved as '<figureName>.png'
        closePlot = If true, the figure is closed before returning
        figSize = tuple setting the width and height of the sankey diagram.
            Defaults to (17, 17)
        title = text of the figure title. When None a default title is built
            from datatitle.
    Output:
        None
    Raises:
        NullsInFrame if left or right contains null values
        LabelMismatch if leftLabels / rightLabels do not match the data
        ValueError if colorDict lacks a color for one of the labels
    """
    # len() rather than truthiness is used below because these arguments may
    # be NumPy arrays or pandas Series.
    if leftWeight is None:
        leftWeight = []
    if rightWeight is None:
        rightWeight = []
    if leftLabels is None:
        leftLabels = []
    if rightLabels is None:
        rightLabels = []

    # Check weights
    if len(leftWeight) == 0:
        leftWeight = np.ones(len(left))
    if len(rightWeight) == 0:
        rightWeight = leftWeight

    # Create Dataframe. Every Series is re-indexed positionally so that it
    # lines up with the explicit 0..n-1 index instead of being aligned (and
    # filled with NaN) by its original index.
    if isinstance(left, pd.Series):
        left = left.reset_index(drop=True)
    if isinstance(right, pd.Series):
        right = right.reset_index(drop=True)
    if isinstance(leftWeight, pd.Series):
        leftWeight = leftWeight.reset_index(drop=True)
    if isinstance(rightWeight, pd.Series):
        rightWeight = rightWeight.reset_index(drop=True)

    dataFrame = pd.DataFrame(
        {
            "left": left,
            "right": right,
            "leftWeight": leftWeight,
            "rightWeight": rightWeight,
        },
        index=range(len(left)),
    )

    if dataFrame[["left", "right"]].isnull().any().any():
        raise NullsInFrame("Sankey graph does not support null values.")

    # Identify all labels that appear 'left' or 'right'
    allLabels = pd.Series(
        np.r_[dataFrame.left.unique(), dataFrame.right.unique()]
    ).unique()
    LOGGER.debug("Labels to handle : %s", allLabels)

    # Identify left labels
    if len(leftLabels) == 0:
        leftLabels = dataFrame.left.unique()
    else:
        check_data_matches_labels(leftLabels, dataFrame["left"], "left")

    # Identify right labels
    if len(rightLabels) == 0:
        rightLabels = dataFrame.right.unique()
    else:
        check_data_matches_labels(rightLabels, dataFrame["right"], "right")

    # Order the labels. sorted() builds new lists, so sequences passed in by
    # the caller are left untouched.
    leftLabels = sorted(leftLabels)
    rightLabels = sorted(rightLabels)

    # If no colorDict given, make one
    if colorDict is None:
        colorPalette = sns.color_palette("hls", len(allLabels))
        colorDict = dict(zip(allLabels, colorPalette))
    else:
        missing = [label for label in allLabels if label not in colorDict]
        if missing:
            msg = (
                "The colorDict parameter is missing values for the following labels : "
            )
            msg += ", ".join(map(str, missing))
            raise ValueError(msg)
    LOGGER.debug("The colordict value are : %s", colorDict)

    # Determine widths of individual strips, and which (left, right) pairs
    # actually occur in the data.
    ns_l = {}
    ns_r = {}
    linkedPairs = set()
    for leftLabel in leftLabels:
        leftRows = dataFrame[dataFrame.left == leftLabel]
        ns_l[leftLabel] = {}
        ns_r[leftLabel] = {}
        for rightLabel in rightLabels:
            pairRows = leftRows[leftRows.right == rightLabel]
            ns_l[leftLabel][rightLabel] = pairRows.leftWeight.sum()
            ns_r[leftLabel][rightLabel] = pairRows.rightWeight.sum()
            if len(pairRows) > 0:
                linkedPairs.add((leftLabel, rightLabel))

    # Determine positions of the label patches and the height of each column
    leftWidths, left_max = _stack_label_patches(
        dataFrame, leftLabels, "left", "leftWeight", "left"
    )
    rightWidths, right_max = _stack_label_patches(
        dataFrame, rightLabels, "right", "rightWeight", "right"
    )

    # Total vertical extent of diagram: the taller of the two columns
    topEdge = max(left_max, right_max)
    xMax = topEdge / aspect

    # Shift the shorter column up so that both columns are vertically centered
    new_pos = abs(right_max - left_max) / 2.0
    if left_max > right_max:
        shorterColumn = rightWidths
    elif right_max > left_max:
        shorterColumn = leftWidths
    else:
        shorterColumn = {}
    for patch in shorterColumn.values():
        patch["bottom"] += new_pos
        patch["top"] += new_pos

    # The figure is created only after validation so that no empty figure is
    # left open when the input is rejected.
    plt.figure()
    plt.rc("text", usetex=False)
    plt.rc("font", family="serif")

    # Draw vertical bars on left and right of each label's section & print label
    for leftLabel in leftLabels:
        patch = leftWidths[leftLabel]
        plt.fill_between(
            [-0.02 * xMax, 0],
            2 * [patch["bottom"]],
            2 * [patch["bottom"] + patch["left"]],
            color=colorDict[leftLabel],
            alpha=0.99,
        )
        plt.text(
            -0.05 * xMax,
            patch["bottom"] + 0.5 * patch["left"],
            leftLabel,
            ha="right",
            va="center",
            fontsize=fontsize,
        )

    for rightLabel in rightLabels:
        patch = rightWidths[rightLabel]
        plt.fill_between(
            [xMax, 1.02 * xMax],
            2 * [patch["bottom"]],
            2 * [patch["bottom"] + patch["right"]],
            color=colorDict[rightLabel],
            alpha=0.99,
        )
        plt.text(
            1.05 * xMax,
            patch["bottom"] + 0.5 * patch["right"],
            # Right-hand labels are shown in upper case; str() keeps this
            # working for non-string labels.
            str(rightLabel).upper(),
            ha="left",
            va="center",
            fontsize=fontsize,
        )

    # Plot strips
    for leftLabel in leftLabels:
        for rightLabel in rightLabels:
            if (leftLabel, rightLabel) not in linkedPairs:
                continue
            leftBottom = leftWidths[leftLabel]["bottom"]
            rightBottom = rightWidths[rightLabel]["bottom"]
            leftHeight = ns_l[leftLabel][rightLabel]
            rightHeight = ns_r[leftLabel][rightLabel]

            ys_d = _smooth_edge(leftBottom, rightBottom)
            ys_u = _smooth_edge(leftBottom + leftHeight, rightBottom + rightHeight)

            # Update bottom edges at each label so next strip starts at the right place
            leftWidths[leftLabel]["bottom"] += leftHeight
            rightWidths[rightLabel]["bottom"] += rightHeight

            labelColor = rightLabel if rightColor else leftLabel
            plt.fill_between(
                np.linspace(0, xMax, len(ys_d)),
                ys_d,
                ys_u,
                alpha=0.65,
                color=colorDict[labelColor],
            )

    plt.gca().axis("off")
    plt.gcf().set_size_inches(figSize)

    # The title used to come from an undefined name, which made every call
    # fail; it is now an explicit argument with a default built from datatitle.
    if title is None:
        title = (
            r"$\it{Sankey}$ $\it{Diagram.}$ " + str(datatitle) + " datos analizados"
        )
    plt.title(title, fontdict={"fontsize": 30})

    if figureName is not None:
        fileName = "{}.png".format(figureName)
        plt.savefig(fileName, bbox_inches="tight", dpi=150)
        LOGGER.info("Sankey diagram generated in '%s'", fileName)
    if closePlot:
        plt.close()

# In [69]:
import pandas as pd
import seaborn as sns

# Render one Sankey diagram per FuzzyCSar snapshot of the USA data set.
# The snapshot files are numbered 487, 974, ..., 4870; presumably this is the
# number of records analysed so far (the value is shown in the figure title).
#
# For the Spain data set use instead:
#   for i in range(5600, 252000 + 5600, 5600):
#       if i == 252000:  # the last snapshot is stored under 250147
#           i = 250147
#   input_path = './Spain_FuzzyCSar/spain_rules_count' + str(i) + '.csv'
#   fig_name = "./Spain_FuzzyCSar/Sankey/Sankey_spain_{:06d}".format(i)
for i in range(487, 4870 + 487, 487):
    input_path = f"./USA_FuzzyCSar/usa_rules_count{i}.csv"
    df = pd.read_csv(input_path)

    # Fixed colors for the sentiment labels on the right-hand side
    # (the label 'Neutral' was used previously instead of 'Neither').
    colorDict = {
        "Positive": "#008000",
        "Negative": "#b20000",
        "Neither": "#e6c200",
    }

    # One "cubehelix" color per row of the 'Elemento' column, assigned in
    # sorted order; a repeated element keeps the last color assigned to it.
    ant_list = sorted(df["Elemento"].tolist())
    # NOTE(review): leftvalues is not used in this cell; kept in case another
    # notebook cell reads it.
    leftvalues = ant_list
    pal = sns.color_palette("cubehelix", len(ant_list))
    pal_list = pal.as_hex()
    colorDict.update(zip(ant_list, pal_list))

    left = df["Elemento"].tolist()
    right = df["Sentimiento"].tolist()

    fig_name = f"./USA_FuzzyCSar/Sankey/Sankey_usa_{i:06d}"
    weight = df["Valor"].astype(float)
    # closePlot=True: each figure is only needed for its PNG file, so close it
    # instead of keeping every figure of the loop open in memory.
    sankeyPlot(
        left=left,
        right=right,
        datatitle=i,
        colorDict=colorDict,
        leftWeight=weight,
        rightWeight=weight,
        aspect=20,
        fontsize=13,
        figureName=fig_name,
        closePlot=True,
    )



# In [62]:
import pandas as pd
import seaborn as sns

# Render one Sankey diagram per IncMine snapshot of the USA data set.
# The input files are numbered 0..9; figure names and titles use
# (i + 1) * 487 instead, i.e. 487, 974, ..., 4870.
#
# For the Spain data set use instead:
#   for i in range(0, 45):
#       k = (i + 1) * 5600
#       if k == 252000:  # the last snapshot is labelled 250147
#           k = 250147
#   input_path = './Spain_IncMine/spain_rules_count' + str(i) + '.csv'
#   fig_name = "./Spain_IncMine/Sankey/Sankey_spain_{:06d}".format(k)
for i in range(0, 10):
    count = (i + 1) * 487
    input_path = f"./USA_IncMine/usa_rules_count{i}.csv"
    df = pd.read_csv(input_path)

    # Fixed colors for the sentiment labels on the right-hand side
    # (the label 'Neutral' was used previously instead of 'Neither').
    colorDict = {
        "Positive": "#008000",
        "Negative": "#b20000",
        "Neither": "#e6c200",
    }

    # One "cubehelix" color per row of the 'Elemento' column, assigned in
    # sorted order; a repeated element keeps the last color assigned to it.
    ant_list = sorted(df["Elemento"].tolist())
    # NOTE(review): leftvalues is not used in this cell; kept in case another
    # notebook cell reads it.
    leftvalues = ant_list
    pal = sns.color_palette("cubehelix", len(ant_list))
    pal_list = pal.as_hex()
    colorDict.update(zip(ant_list, pal_list))

    fig_name = f"./USA_IncMine/Sankey/Sankey_usa_{count:04d}"
    weight = df["Valor"].values.astype(float)

    left = df["Elemento"].tolist()
    right = df["Sentimiento"].tolist()

    # closePlot=True: each figure is only needed for its PNG file, so close it
    # instead of keeping every figure of the loop open in memory.
    sankeyPlot(
        left=left,
        right=right,
        datatitle=count,
        colorDict=colorDict,
        leftWeight=weight,
        rightWeight=weight,
        aspect=20,
        fontsize=13,
        figureName=fig_name,
        closePlot=True,
    )

