In [1]:
import pandas as pd
from scipy import stats
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv("dataset/refined.tsv", sep="\t")

In [3]:
df.head(5)

Unnamed: 0,bool_cert,provedor,maior_desafio,tam_equipe,avaliacao,porte,tam_investimento,segmentacao,bool_experiencia,tempo,bool_mtd_scrum,bool_mtd_kanban,bool_mtd_okr,bool_mtd_safe,bool_mtd_classica,bool_src_sql,bool_src_api,bool_src_nosql,bool_src_outros,bool_alt_eqp,bool_alt_bd,bool_alt_sol,bool_alt_custo,bool_alt_tec,bool_falha_gov,bool_falha_dev,bool_falha_aus_info,bool_falha_pln_custo,bool_falha_gst_lid,bool_falha_usu,qt_alteracoes,qt_falhas,bool_terceiro,bool_dev
0,1,AWS,Variedade,"[0,5]",5,"]500,+∞]","[0,10]",B2B,1,"[0,3]",1,1,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,2,0,1,1
1,1,AWS,Velocidade,"]10,20]",4,"]100,500]","[-∞,+∞]",B2B,1,"]3,6]",1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,2,1,0
2,1,Azure,Variedade,"]10,20]",3,"]20,50]","]20,30]",B2ALL,1,"]6,12]",0,0,0,1,0,1,0,1,0,0,1,1,1,0,1,0,1,1,1,0,3,4,0,1
3,0,AWS,Volume,"]5,10]",5,"[0,20]","]10,20]",B2B,1,"]3,6]",1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,2,0,1,1
4,1,Azure,Velocidade,"[0,5]",4,"]100,500]","]50,100]",B2B,1,"]6,12]",0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### p test

In [10]:
def p_value_from_t_test(column: str, metric_column: str) -> float:
    g1 = list(df.query(f"{column} == 1")[metric_column])
    g2 = list(df.query(f"{column} == 0")[metric_column])
    return float("{:.6f}".format(float(stats.ttest_ind(g1,g2)[1]))) # p_value

#### anova for columns with 3 groups or more

In [11]:
def p_value_from_anova(column: str, metric_column: str) -> float:
    return float("{:.6f}".format(float(stats.f_oneway(
        *[  # list comp
            df.query(f"{column} == '{group}'")[{metric_column}] for group in df[column].unique()
        ]
    )[1][0])))

In [12]:
bool_columns = [col for col in df.columns if "bool" in col]
metrics = ["avaliacao", "qt_falhas", "qt_alteracoes"]
p_dataset = []
columns_with_more_than_3_groups = ["maior_desafio", "maior_desafio", "tam_equipe", "porte", "tam_investimento", "tempo", "provedor", "segmentacao"]

# p test
for col in bool_columns:
    p_dataset.append([col] + [p_value_from_t_test(col, metric) for metric in metrics])
    
# anova
for col in columns_with_more_than_3_groups:
    p_dataset.append([col] + [p_value_from_anova(col, metric) for metric in metrics])

In [13]:
df_p_values = pd.DataFrame(data=p_dataset, columns=pd.MultiIndex.from_product([["p_value"], ["column"] + metrics]))

In [14]:
def p_value_style(x):
    try:
        if float(x) < 0.05:
            return "background-color: #FFDEAD"
    except:
        return None

In [9]:
df_p_values["p_value"].sort_values(by="avaliacao").style.applymap(p_value_style)

Unnamed: 0,column,avaliacao,qt_falhas,qt_alteracoes
20,bool_falha_gst_lid,1e-06,0.0,0.005191
2,bool_mtd_scrum,2e-06,1e-05,0.163063
21,bool_falha_usu,2e-06,0.0,0.000166
18,bool_falha_aus_info,4e-06,0.0,0.002015
17,bool_falha_dev,1.1e-05,0.0,0.014144
0,bool_cert,2.4e-05,0.00013,0.167597
1,bool_experiencia,4.7e-05,0.000639,0.197176
6,bool_mtd_classica,5.5e-05,0.003872,0.667511
19,bool_falha_pln_custo,0.000326,0.0,0.050042
26,tam_equipe,0.00043,0.002104,0.01262


In [17]:
g1 = list(df.query(f"bool_cert == 1")["avaliacao"])
g2 = list(df.query(f"bool_cert == 0")["avaliacao"])

stats.ttest_ind(g1,g2,

Ttest_indResult(statistic=4.426563413593539, pvalue=2.4464582444796605e-05)

In [23]:
float(stats.ttest_ind(g2,g1)[1])

2.4464582444796605e-05

In [28]:
g1 = [5,4,3,4,5,4,5,5,4,5,5,4,3,4,5,4,3,4,5,3,4,4,5,4,4,4,2,2,4,4,3,4,4,5,5,4,4,3,5,5,5,5,5,4,4,5,3,5,4,4,4,5,4,4,5,4,4,5,5,5,5,4,5,5,4,5,3]
g2 = [5,4,4,3,3,4,4,3,5,4,1,4,4,4,3,4,3,5,2,2,2,3,4,3,4,3,4,4,3,4,4,2,1,4,4]
stats.ttest_ind(g2,g1,equal_var=False)

Ttest_indResult(statistic=-4.078975466504175, pvalue=0.0001459057337301082)

In [31]:
len(g1) + len(g2)

102

In [27]:
print(df_p_values["p_value"].query("avaliacao < 0.05").sort_values(by="avaliacao")["column"].to_string())

20      bool_falha_gst_lid
21          bool_falha_usu
17          bool_falha_dev
18     bool_falha_aus_info
0                bool_cert
2           bool_mtd_scrum
26              tam_equipe
1         bool_experiencia
16          bool_falha_gov
6        bool_mtd_classica
12             bool_alt_bd
19    bool_falha_pln_custo
11            bool_alt_eqp
3          bool_mtd_kanban
22           bool_terceiro
14          bool_alt_custo
24           maior_desafio
25           maior_desafio
28        tam_investimento
27                   porte
13            bool_alt_sol
