# LIBS

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# UTILS

## FUNCTIONS

In [2]:
def get_values_statistics(df, column: str)-> tuple:
    """Return the maximum, minimum and mean of one column.

    Args:
        df: Object indexable by ``column`` whose result exposes ``max``,
            ``min`` and ``mean`` (e.g. a pandas DataFrame).
        column: Name of the column to summarise.

    Returns:
        A ``(max, min, mean)`` tuple, in that order.
    """
    series = df[column]
    return series.max(), series.min(), series.mean()

def boxplot(df, column: str, title: str = '')-> None:
    """Show a horizontal box plot of a single column.

    Args:
        df: Object indexable by ``column`` (e.g. a pandas DataFrame).
        column: Name of the column whose values are plotted.
        title: Text shown above the plot; empty by default.
    """
    _, axes = plt.subplots(figsize=(6, 3))
    axes.boxplot(df[column], vert=False)
    axes.set_title(title)
    plt.show()

def violinplot(df, column: str, title: str = '', x_range=None)-> None:
    """Show a horizontal violin plot of a single column.

    Args:
        df: Object indexable by ``column`` (e.g. a pandas DataFrame).
        column: Name of the column to plot; also used as the x-axis label.
        title: Text shown above the plot; empty by default.
        x_range: Optional x-axis limits passed to ``set_xlim``; the axis is
            left on automatic limits when ``None``.
    """
    _, axes = plt.subplots(figsize=(6, 3))
    axes.violinplot(df[column], vert=False)
    axes.set_xlabel(column)
    axes.set_title(title)

    if x_range is not None:
        axes.set_xlim(x_range)

    plt.show()

def scatter_plot(df, column_x: str, column_y: str, label_x=None, label_y=None, title='Gráfico de Dispersão', x_range=None, y_range=None):
    """Show a scatter plot of ``column_y`` against ``column_x``.

    Args:
        df: Object indexable by the two column names (e.g. a pandas DataFrame).
        column_x: Column plotted on the x axis.
        column_y: Column plotted on the y axis.
        label_x: X-axis label; falls back to ``column_x`` when ``None``.
        label_y: Y-axis label; falls back to ``column_y`` when ``None``.
        title: Text shown above the plot.
        x_range: Optional x-axis limits; automatic when ``None``.
        y_range: Optional y-axis limits; automatic when ``None``.
    """
    x_text = column_x if label_x is None else label_x
    y_text = column_y if label_y is None else label_y

    _, axes = plt.subplots(figsize=(10, 4))
    axes.scatter(df[column_x], df[column_y])
    axes.set_xlabel(x_text)
    axes.set_ylabel(y_text)
    axes.set_title(title)

    # Axis limits are only overridden when the caller supplies them.
    if x_range is not None:
        axes.set_xlim(x_range)
    if y_range is not None:
        axes.set_ylim(y_range)

    axes.grid(True)
    plt.show()
    
def bar_plot(df, column_x: str, column_y: str, label_x: str=None, label_y: str=None, title: str='Gráfico de Barras')->None:
    """Show a bar chart with ``column_x`` as positions and ``column_y`` as heights.

    Args:
        df: Object indexable by the two column names (e.g. a pandas DataFrame).
        column_x: Column supplying the bar positions.
        column_y: Column supplying the bar heights.
        label_x: X-axis label; falls back to ``column_x`` when ``None``.
        label_y: Y-axis label; falls back to ``column_y`` when ``None``.
        title: Text shown above the plot.
    """
    x_text = column_x if label_x is None else label_x
    y_text = column_y if label_y is None else label_y

    _, axes = plt.subplots(figsize=(10, 6))
    axes.bar(df[column_x], df[column_y])
    axes.set_xlabel(x_text)
    axes.set_ylabel(y_text)
    axes.set_title(title)
    # Tick labels are drawn vertically.
    plt.xticks(rotation=90)
    axes.grid(True)
    plt.show()

def bar_plot_custom_intervals(data, column: str, intervals=None, interval_size=50, xlabel='Intervalos', ylabel='Contagem', title='Contagem de Valores por Intervalo'):
    """Show a bar chart counting how many values of ``column`` fall in each interval.

    Missing values are dropped and the remaining values are truncated to
    integers before binning. The caller's ``data`` is not modified.

    Args:
        data: pandas DataFrame containing ``column``.
        column: Name of the numeric column to bin.
        intervals: Optional sequence of ``(left, right)`` tuples, or a
            ``pd.IntervalIndex``. Tuples are converted with
            ``pd.IntervalIndex.from_tuples`` (right-closed by default, so a
            value equal to the first left edge is not counted). When ``None``
            or empty, equal-width bins of ``interval_size`` starting at 0 are
            generated instead.
        interval_size: Width of the automatically generated bins.
        xlabel: X-axis label.
        ylabel: Y-axis label.
        title: Text shown above the plot.

    Raises:
        ValueError: If ``column`` has no non-null values.
    """
    # Work on a standalone Series: no column is assigned on a frame derived
    # from the caller's data, so no chained-assignment warning can occur.
    values = data[column].dropna().astype(int)
    if values.empty:
        raise ValueError(f'column {column!r} has no non-null values to plot')

    has_custom_intervals = intervals is not None and len(intervals) > 0
    if has_custom_intervals:
        if isinstance(intervals, pd.IntervalIndex):
            bins = intervals
        else:
            bins = pd.IntervalIndex.from_tuples(intervals)
        binned = pd.cut(values, bins=bins)
    else:
        # max(..., 1) guarantees at least two edges even when every value is 0.
        upper_edge = max(int(values.max()), 1) + interval_size
        bins = list(range(0, upper_edge, interval_size))
        # include_lowest keeps values equal to 0 in the first bin; without it
        # the first bin is open on the left and zeros are silently dropped.
        binned = pd.cut(values, bins=bins, include_lowest=True)

    counts = binned.value_counts().sort_index()

    plt.figure(figsize=(10, 6))
    counts.plot(kind='bar', width=0.8)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.grid(False)

    # Write each count just above its bar.
    for position, count in enumerate(counts):
        plt.text(position, count + 0.2, str(count), ha='center', va='bottom')

    plt.show()
    

# ANALYSIS

## RESULTS ANALYSIS

#### DATA TREATMENT

In [3]:
# Read the semicolon-separated results table and preview its first rows.
resutls = pd.read_csv(
    '../dataset/dados_tratados.csv',
    sep=';',
)
resutls.head()

FileNotFoundError: [Errno 2] No such file or directory: '../dataset/dados_tratados_2.csv'

In [None]:
# Table dimensions, followed by the number of missing values in each column.
print(f'Número de linhas: {len(resutls)} - Número de colunas: {len(resutls.columns)} \n')
print('Total de valores vazios por colunas ')
resutls.isna().sum()

In [None]:
# Descriptive statistics of the loaded results table (DataFrame.describe).
resutls.describe()

In [None]:
# Keep only rows without missing values and report how many rows were dropped.
data = resutls.dropna(axis=0, how='any')
print(f'Número de linhas descartadas: {len(resutls) - len(data)}  \n')

### GRAPHS

#### RELEASES ANALYSIS

In [None]:
releases_column = 'nº Releases'

# The release statistics get their own names so they cannot be mistaken for
# (or overwrite) the CBO statistics computed elsewhere in the notebook.
max_releases, min_releases, mean_releases = get_values_statistics(df=data, column=releases_column)
print(f'Para a coluna: {releases_column}')
print('MAX: {} - MIN: {} - mean: {:.2f}'.format(max_releases, min_releases, mean_releases))

In [None]:
# Distribution of the releases column.
violinplot(
    df=data,
    column=releases_column,
    title='Violinplot: Releases',
)

In [None]:
# Releases column (x) against the 'Anos' column (y).
scatter_plot(
    df=data,
    column_x=releases_column,
    column_y='Anos',
    title='Releases X Anos',
)

In [None]:
# Releases column (x) against the 'Estrelas' column (y).
scatter_plot(
    df=data,
    column_x=releases_column,
    column_y='Estrelas',
    title='Releases X Estrelas',
)

In [None]:
# Releases column (x) against the 'média CBO' column (y).
scatter_plot(
    df=data,
    column_x=releases_column,
    column_y='média CBO',
    title='Releases X média CBO',
)

In [None]:
# Releases column (x) against the 'média LCOM' column (y).
scatter_plot(
    df=data,
    column_x=releases_column,
    column_y='média LCOM',
    title='Releases X média LCOM',
)

In [None]:
# Releases column (x) against the 'DIT máximo' column (y).
scatter_plot(
    df=data,
    column_x=releases_column,
    column_y='DIT máximo',
    title='Releases X DIT máximo',
)

In [None]:
# Axes swapped: 'DIT máximo' on x (limited to 0-100), releases column on y.
scatter_plot(
    df=data,
    column_x='DIT máximo',
    column_y=releases_column,
    title='DIT máximo x Releases',
    x_range=(0, 100),
)

#### LOC ANALYSIS

In [None]:
loc_column = 'LOC Total'

# The LOC statistics get their own names: storing them in the DIT variables
# would leave a LOC maximum in `max_dit`, which the DIT interval cell reads.
max_loc, min_loc, mean_loc = get_values_statistics(df=data, column=loc_column)
print(f'Para a coluna: {loc_column}')
print('MAX: {} - MIN: {} - mean: {:.2f}'.format(max_loc, min_loc, mean_loc))

In [None]:
# Distribution of the LOC column over its full range.
violinplot(
    df=data,
    column=loc_column,
    title='Violinplot: LOC Total',
)

In [None]:
# Distribution of the LOC column with the x axis limited to (-10000, 900000).
violinplot(
    df=data,
    column=loc_column,
    title='Violinplot: Média LOC [0, 900000]',
    x_range=(-10000, 900000),
)

#### CBO ANALYSIS

##### General

In [None]:
cbo_mean_column = 'média CBO'
cbo_standard_column = 'desvio padão CBO'

# Max / min / mean of the column named by cbo_mean_column.
max_cbo, min_cbo, mean_cbo = get_values_statistics(data, cbo_mean_column)
print(f'Para a coluna: {cbo_mean_column}')
print('MAX: {:.2f} - MIN: {} - mean: {:.2f}'.format(max_cbo, min_cbo, mean_cbo))

# Max / min / mean of the column named by cbo_standard_column.
max_std, min_std, mean_std = get_values_statistics(data, cbo_standard_column)
print(f'\nPara a coluna: {cbo_standard_column}')
print('MAX: {:.2f} - MIN: {} - mean: {:.2f}'.format(max_std, min_std, mean_std))



In [None]:
# Distribution of the CBO mean column.
violinplot(
    df=data,
    column=cbo_mean_column,
    title='Violinplot: Média CBO',
)

##### CBO x Anos

In [None]:
# CBO mean column (x) against the 'Anos' column (y).
scatter_plot(
    df=data,
    column_x=cbo_mean_column,
    column_y='Anos',
    title='CBO X Anos',
)

##### CBO x Estrelas

In [None]:
# CBO mean column (x) against the 'Estrelas' column (y).
scatter_plot(
    df=data,
    column_x=cbo_mean_column,
    column_y='Estrelas',
    title='CBO X Estrelas',
)

##### CBO x Releases

In [None]:
# CBO mean column (x) against the 'nº Releases' column (y).
scatter_plot(
    df=data,
    column_x=cbo_mean_column,
    column_y='nº Releases',
    title='CBO X nº Releases',
)

##### CBO x LOC 

In [None]:
# CBO mean column (x) against the 'LOC Total' column (y).
scatter_plot(
    df=data,
    column_x=cbo_mean_column,
    column_y='LOC Total',
    title='CBO X LOC',
)

In [None]:
# CBO mean column (x) against 'LOC Total' (y), y axis limited to (-100000, 2500000).
scatter_plot(
    df=data,
    column_x=cbo_mean_column,
    column_y='LOC Total',
    title='CBO X LOC [0, 2500000]',
    y_range=(-100000, 2500000),
)

#### LCOM ANALYSIS

##### GENERAL

In [None]:
lcom_mean_column = 'média LCOM'
lcom_standard_column = 'desvio padão LCOM'

# The LCOM statistics get their own names so they do not overwrite the CBO
# values held in max_cbo / min_cbo / mean_cbo.
max_lcom, min_lcom, mean_lcom = get_values_statistics(df=data, column=lcom_mean_column)
print(f'Para a coluna: {lcom_mean_column}')
print('MAX: {:.2f} - MIN: {} - mean: {:.2f}'.format(max_lcom, min_lcom, mean_lcom))

# Max / min / mean of the column named by lcom_standard_column.
max_std, min_std, mean_std = get_values_statistics(df=data, column=lcom_standard_column)
print(f'\nPara a coluna: {lcom_standard_column}')
print('MAX: {:.2f} - MIN: {} - mean: {:.2f}'.format(max_std, min_std, mean_std))

In [None]:
# Distribution of the LCOM mean column over its full range.
violinplot(
    df=data,
    column=lcom_mean_column,
    title='Violinplot: Média LCOM',
)

In [None]:
# Distribution of the LCOM mean column with the x axis limited to (-1000, 10000).
violinplot(
    df=data,
    column=lcom_mean_column,
    title='Violinplot: Média LCOM [0, 10000]',
    x_range=(-1000, 10000),
)

##### LCOM x ANOS

In [None]:
# LCOM mean column (x) against the 'Anos' column (y).
scatter_plot(
    df=data,
    column_x=lcom_mean_column,
    column_y='Anos',
    title='LCOM X Anos',
)

In [None]:
# LCOM mean column (x) against 'Anos' (y), x axis limited to (-3, 100).
scatter_plot(
    df=data,
    column_x=lcom_mean_column,
    column_y='Anos',
    title='LCOM [0 - 100] X Anos',
    x_range=(-3, 100),
)

##### LCOM X Estrelas

In [None]:
# LCOM mean column (x) against the 'Estrelas' column (y).
scatter_plot(
    df=data,
    column_x=lcom_mean_column,
    column_y='Estrelas',
    title='LCOM X Estrelas',
)

In [None]:
# LCOM mean column (x) against 'Estrelas' (y), x axis limited to (0, 200).
scatter_plot(
    df=data,
    column_x=lcom_mean_column,
    column_y='Estrelas',
    title='LCOM [0, 200] X Estrelas ',
    x_range=(0, 200),
)

##### LCOM X Releases

In [None]:
# LCOM mean column (x) against the 'nº Releases' column (y).
scatter_plot(
    df=data,
    column_x=lcom_mean_column,
    column_y='nº Releases',
    title='LCOM X Releases',
)

In [None]:
# LCOM mean column (x) against 'nº Releases' (y), x axis limited to (-10, 300).
scatter_plot(
    df=data,
    column_x=lcom_mean_column,
    column_y='nº Releases',
    title='LCOM [0 - 300] X nº Releases',
    x_range=(-10, 300),
)

##### LCOM x LOC

In [None]:
# LCOM mean column (x) against the 'LOC Total' column (y).
scatter_plot(
    df=data,
    column_x=lcom_mean_column,
    column_y='LOC Total',
    title='LCOM x LOC',
)

In [None]:
# LCOM mean column (x) against 'LOC Total' (y) with both axes limited.
scatter_plot(
    df=data,
    column_x=lcom_mean_column,
    column_y='LOC Total',
    title='LCOM [0, 200] x LOC [0, 2500000]',
    x_range=(-10, 200),
    y_range=(-100000, 2500000),
)

#### DIT ANALYSIS

##### General

In [None]:
dit_column = 'DIT máximo'

# Max / min / mean of the column named by dit_column.
max_dit, min_dit, mean_dit = get_values_statistics(data, dit_column)
print(f'Para a coluna: {dit_column}')
print('MAX: {} - MIN: {} - mean: {:.2f}'.format(max_dit, min_dit, mean_dit))

In [None]:

# Fixed bin edges for the DIT chart; the last bin is closed at the observed maximum.
dit_bin_edges = [0, 10, 20, 30, 50, 100, 200, 300, 500, 1000, 2000]
dit_upper = int(max_dit)
# Keep only the edges below the maximum so every interval has left < right.
# A hard-coded final (2000, max) tuple is invalid whenever the maximum is 2000 or less.
dit_edges = [edge for edge in dit_bin_edges if edge < dit_upper] + [dit_upper]
intervals = list(zip(dit_edges, dit_edges[1:]))
bar_plot_custom_intervals(data=data, column='DIT máximo', intervals=intervals, title='Gráfico de DIT em intervalos')

In [None]:
# Distribution of the DIT column over its full range (no title).
violinplot(
    df=data,
    column=dit_column,
)

In [None]:
# Distribution of the DIT column with the x axis limited to (-10, 200).
violinplot(
    df=data,
    column=dit_column,
    x_range=(-10, 200),
)

##### DIT x Anos

In [None]:
# DIT column (x) against the 'Anos' column (y).
scatter_plot(
    df=data,
    column_x=dit_column,
    column_y='Anos',
    title='DIT x Anos',
)

In [None]:
# DIT column (x) against 'Anos' (y), x axis limited to (0, 100).
scatter_plot(
    df=data,
    column_x=dit_column,
    column_y='Anos',
    title='DIT [0 - 100] x Anos',
    x_range=(0, 100),
)

##### DIT x Estrelas

In [None]:
# DIT column (x) against the 'Estrelas' column (y).
scatter_plot(
    df=data,
    column_x=dit_column,
    column_y='Estrelas',
    title='DIT x Estrelas',
)

In [None]:
# DIT column (x) against 'Estrelas' (y), x axis limited to (0, 100).
scatter_plot(
    df=data,
    column_x=dit_column,
    column_y='Estrelas',
    title='DIT [0 - 100] x Estrelas',
    x_range=(0, 100),
)

##### DIT x Releases

In [None]:
# DIT column (x) against the 'nº Releases' column (y).
scatter_plot(
    df=data,
    column_x=dit_column,
    column_y='nº Releases',
    title='DIT x nº Releases',
)

In [None]:
# DIT column (x) against 'nº Releases' (y), x axis limited to (-3, 100).
scatter_plot(
    df=data,
    column_x=dit_column,
    column_y='nº Releases',
    title='DIT [0, 100] x nº Releases ',
    x_range=(-3, 100),
)

##### DIT x LOC

In [None]:
# DIT column (x) against the 'LOC Total' column (y).
scatter_plot(
    df=data,
    column_x=dit_column,
    column_y='LOC Total',
    title='DIT x LOC',
)

In [None]:
# DIT column (x) against 'LOC Total' (y) with both axes limited.
scatter_plot(
    df=data,
    column_x=dit_column,
    column_y='LOC Total',
    title='DIT [0, 150] x LOC [0 , 1000000]',
    x_range=(-10, 150),
    y_range=(-10000, 1000000),
)

## ERROR ANALYSIS

In [None]:
# Read the semicolon-separated CSV at ../dataset/repo_with_erros.csv.
errors = pd.read_csv(
    '../dataset/repo_with_erros.csv',
    sep=';',
)

# Row count of the table, then a preview of its first rows.
print(f'Total de erros: {len(errors)} \n')
errors.head()

In [None]:
type_errors = errors['error'].unique()
print('Erros encontrados ao rodar o CK nos repositórios clonados \n')
# Count every error type in one grouped pass instead of re-filtering the
# frame once per type. sort=False keeps first-appearance order (the order
# unique() yields), and dropna=False counts rows whose error is missing,
# which an equality filter would always report as 0.
error_counts = errors.groupby('error', sort=False, dropna=False).size()
for error, count in error_counts.items():
    print(f'error: {error} || quant: {count}')