In [90]:
# Import the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [91]:
# Load the CSV file into the notebook-wide `data` DataFrame
data = pd.read_csv(filepath_or_buffer='datos.csv')



In [92]:
# Clean data

# Fixing columns names
# Done first so that every later lookup by name (including 'notape' below)
# still works when the CSV header is padded with whitespace.
data = data.rename(
    columns=lambda name: name.strip() if isinstance(name, str) else name
)

# Problems with errors
# Rows whose 'notape' value is at or below 1e-17 are reported and discarded.
zero_time = np.where(data['notape'] <= 1e-17)
print("Datos con errores")
print(zero_time)
# np.where yields row *positions* while drop() expects index *labels*;
# translate through the index so the right rows are removed even when the
# index is not a fresh 0..n-1 range (e.g. when this cell is re-run).
data = data.drop(index=data.index[zero_time[0]])

# Fixing string values
for col in data.columns:
    # Inspect the first remaining row by position: label 0 may have just been
    # dropped, and an empty frame has no first row at all.
    if len(data) > 0 and isinstance(data[col].iloc[0], str):
        # Non-string entries (e.g. missing values) are passed through untouched.
        data[col] = data[col].map(
            lambda val: val.strip() if isinstance(val, str) else val
        )
        

Datos con errores
(array([], dtype=int64),)


In [93]:
def describe(data, key='ratio'):
    """
    Print summary statistics and the structure of data, then box-plot one column.

    Input:
        data: pandas DataFrame to summarise
        key: name of the column shown in the box plot (defaults to 'ratio')
    """
    print(data.describe())
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line to the output.
    data.info()

    plot_boxplot(data, key)

def plot_boxplot(data, key:str):
    """
    Draw and show a box plot of the values stored in data under key.

    The figure is titled "Box Plot <key>" and its legend carries the key
    as the single label.
    """
    values = data[key]
    legend_labels = [key]

    plt.boxplot(values)
    plt.title("Box Plot {}".format(key))
    plt.legend(legend_labels)
    plt.show()

def iqr(data, key):
    """
    Calculates the IQR from the data associated with key

    Both quartiles use the 'midpoint' estimator: when a quartile falls
    between two observations, the mean of those two values is taken.

    Input:
        data: mapping or DataFrame indexed by key
        key: name of the numeric column / entry to analyse

    Returns:
        tuple (Q1, Q3, IQR) with IQR = Q3 - Q1
    """
    values = data[key]
    try:
        # NumPy 1.22 renamed the `interpolation` keyword to `method`; the old
        # spelling is deprecated, so prefer the new one.
        Q1, Q3 = np.percentile(values, [25, 75], method='midpoint')
    except TypeError:
        # Older NumPy releases only accept the original keyword. A TypeError
        # caused by the data itself is simply raised again by this call.
        Q1, Q3 = np.percentile(values, [25, 75], interpolation='midpoint')
    IQR = Q3 - Q1
    return Q1, Q3, IQR

def get_outliers(data, key, upper=True):
    """
    Select the rows of data whose key value lies on or beyond a 1.5*IQR fence.

    With upper=True (the default) the rows at or above Q3 + 1.5*IQR are
    returned; otherwise the rows at or below Q1 - 1.5*IQR.
    """
    q1, q3, spread = iqr(data,key)
    margin = spread*1.5
    column = data[key]
    mask = column >= q3 + margin if upper else column <= q1 - margin
    return data[mask]

def remove_outliers(data, key, remove_upper=True, remove_lower=True):
    """
    Returns data without the rows whose key value is an outlier

    Outliers are whatever get_outliers reports for data[key]. The offending
    rows are dropped by index label, so the remaining rows keep their
    original values and dtypes (filtering with `~data.isin(...)` would only
    blank the matching cells to NaN and leave the rows in place).

    Input:
        data: pandas DataFrame
        key: column used to detect the outliers
        remove_upper: drop the rows flagged as upper outliers
        remove_lower: drop the rows flagged as lower outliers

    Returns:
        a DataFrame without the selected outlier rows; data itself is
        returned when both flags are False
    """
    base_data = data
    if remove_upper:
        upper_rows = get_outliers(data, key).index
        base_data = base_data.drop(index=upper_rows, errors='ignore')
    if remove_lower:
        lower_rows = get_outliers(data, key, False).index
        # errors='ignore': a row flagged on both sides (possible when the IQR
        # is 0) has already been removed by the step above.
        base_data = base_data.drop(index=lower_rows, errors='ignore')
    return base_data

def plot_corr(df,size=10):
    """
    Function plots a graphical correlation matrix
    for each pair of numeric columns in the dataframe.

    Input:
        df: pandas DataFrame
        size: vertical and horizontal size of the plot
    """

    # Correlate numeric (and boolean) columns only: pandas 2.x raises when
    # corr() meets text columns instead of silently skipping them.
    corr = df.select_dtypes(include=['number', 'bool']).corr()

    # Honour the documented `size` argument by drawing on a figure of that size.
    _, ax = plt.subplots(figsize=(size, size))

    # The two-decimal precision used to be requested through a pandas Styler
    # whose result was thrown away (and whose set_precision method no longer
    # exists in pandas 2.x); it is applied to the heatmap annotations instead.
    sns.heatmap(corr,
            cmap='coolwarm',
            annot=True,
            fmt='.2f',
            ax=ax,
            )

def plot_scatter_matrix(data, keys:tuple):
    """
    Draw a pandas scatter matrix restricted to the columns named in keys.
    """
    selected = data.loc[:, keys]
    pd.plotting.scatter_matrix(selected)
    
def anova(data, factor, objetive, alpha=0.1):
    """
    Run a one-way ANOVA of data[objetive] grouped by the levels of data[factor].

    Prints whether the null hypothesis is rejected at significance level
    alpha, followed by the p-value. Nothing is returned.
    """
    # TODO Check for assumptions
    from scipy import stats

    # One sample of the objective per distinct level of the factor
    samples = []
    for level in set(data[factor]):
        samples.append(data[objetive][data[factor] == level])

    p_value = stats.f_oneway(*samples).pvalue
    # p-value below alpha: H0 is rejected
    verdict = "influences" if p_value < alpha else "does not influences"
    print(f"ANOVA: {factor} {verdict} {objetive}")
    print(p_value)
    
# plot_corr(data)
# get_outliers(data, 'ratio').describe()
# get_outliers(data, 'ratio', False).describe()

# remove_outliers(data, "ratio").describe()
# data.describe()
# plot_corr(data)
# plot_scatter_matrix(data, ('notape', 'tape'))
# anova(data, "problem", "ratio")



In [97]:
# Keep the rows whose 'problem' column equals "A-N32-K5" and preview them
p1 = data.loc[data['problem'].eq("A-N32-K5")]
p1.head()

Unnamed: 0,problem,criterion,notape,tape,ratio,routes,iterations,clients,maxroutes,current,total
719,A-N32-K5,RAB,0.001,0.012,12.0,17,5,32,1,1,30
720,A-N32-K5,RAB,0.003,0.009,3.0,13,7,32,1,2,30
721,A-N32-K5,RAB,0.002,0.003,1.5,19,5,32,1,3,30
722,A-N32-K5,RAB,0.001,0.003,3.0,18,5,32,1,4,30
723,A-N32-K5,RAB,0.003,0.003,1.0,16,5,32,1,5,30
