# Exploratory Data Analysis

## Imports and global declarations

In [162]:
import numpy as np
import pandas as pd
import plotly.express as px
from bioinfokit.analys import stat, get_data
import plotly.offline as py_offline


## Read data from raw files

In [99]:
# Load the per-cluster experiment summaries: one CSV for the test arm and one
# for the control arm.
exp_results_test, exp_results_control = (
    pd.read_csv(raw_csv)
    for raw_csv in (
        '../data/raw/exp_results_test.csv',
        '../data/raw/exp_results_control.csv',
    )
)

## Analysis

### Check data

In [100]:
# Test arm, ordered from the lowest to the highest conversion rate.
exp_results_test.sort_values('conversion_rate', ascending=True)

Unnamed: 0,Index,cluster_label,population_cnt,valid_cnt,available_cnt,population_percentage,valid_percentage,available_percentage,exposed_cnt,exposed_percentage,conversion_cnt,conversion_percentage,conversion_rate
0,0,Elite,1999,1507,1025,0.04,0.04,0.06,87,0.03,12,0.02,13.79
1,1,High Quality,9847,7665,4371,0.21,0.21,0.27,606,0.18,126,0.16,20.79
2,2,Average,12538,10309,3592,0.27,0.28,0.22,1143,0.33,244,0.32,21.35
3,3,Low Quality,11446,9077,3820,0.24,0.24,0.23,865,0.25,203,0.27,23.47
4,4,Low Experience,10931,8679,3569,0.23,0.23,0.22,748,0.22,180,0.24,24.06


In [101]:
# Keep only the "Elite" and "High Quality" clusters of the test arm.
exp_results_test.loc[exp_results_test['cluster_label'].isin(["Elite", "High Quality"])]

Unnamed: 0,Index,cluster_label,population_cnt,valid_cnt,available_cnt,population_percentage,valid_percentage,available_percentage,exposed_cnt,exposed_percentage,conversion_cnt,conversion_percentage,conversion_rate
0,0,Elite,1999,1507,1025,0.04,0.04,0.06,87,0.03,12,0.02,13.79
1,1,High Quality,9847,7665,4371,0.21,0.21,0.27,606,0.18,126,0.16,20.79


In [102]:
# Control arm, ordered from the lowest to the highest conversion rate.
exp_results_control.sort_values('conversion_rate', ascending=True)

Unnamed: 0,Index,cluster_label,population_cnt,valid_cnt,available_cnt,population_percentage,valid_percentage,available_percentage,exposed_cnt,exposed_percentage,conversion_cnt,conversion_percentage,conversion_rate
0,0,High Quality,9847,7665,4371,0.21,0.21,0.27,510,0.16,41,0.14,8.04
1,1,Low Quality,11446,9077,3820,0.24,0.24,0.23,884,0.27,79,0.27,8.94
2,2,Average,12538,10309,3592,0.27,0.28,0.22,1058,0.33,97,0.33,9.17
3,3,Low Experience,10931,8679,3569,0.23,0.23,0.22,702,0.22,68,0.23,9.69
4,4,Elite,1999,1507,1025,0.04,0.04,0.06,85,0.03,9,0.03,10.59


### Chi-square test: To compare population and exposed distributions
- The H0 (Null Hypothesis): There is no relationship between population and exposed distribution.
- The H1 (Alternative Hypothesis): There is a relationship between population and exposed distribution.

In [103]:
# Contingency table: one row per cluster, columns = population and exposed counts.
cq_df = exp_results_test.set_index('cluster_label')[["population_cnt", "exposed_cnt"]]
print(cq_df)

# NOTE(review): exposed developers are presumably a subset of the population, so
# the two columns may not be disjoint categories -- confirm that an independence
# test on this table is the intended comparison.
# Run the chi-square test for independence on the table above.
res = stat()
res.chisq(df=cq_df)

# Test summary: Pearson and log-likelihood statistics with Df and p-values.
print(res.summary)

# Expected frequency counts for each cell of the table (this is not a
# Yates-corrected result; it is the expected-count table of the test).
print(res.expected_df)


                population_cnt  exposed_cnt
cluster_label                              
Elite                     1999           87
High Quality              9847          606
Average                  12538         1143
Low Quality              11446          865
Low Experience           10931          748

Chi-squared test for independence

Test              Df    Chi-square      P-value
--------------  ----  ------------  -----------
Pearson            4       94.1192  1.75406e-19
Log-likelihood     4       95.9686  7.09148e-20


Expected frequency counts

      population_cnt    exposed_cnt
--  ----------------  -------------
 0           1942.71        143.29
 1           9734.97        718.032
 2          12741.2         939.768
 3          11465.3         845.661
 4          10876.8         802.248



### Visualize conversion rate

In [170]:
# Conversion rate per developer segment for the test arm, one coloured bar per
# cluster, with the segments shown in a fixed order along the x axis.
fig = px.bar(
    exp_results_test,
    x='cluster_label',
    y='conversion_rate',
    color="cluster_label",
    text_auto='.4s',  # d3-format string used for the value labels on the bars
    template="simple_white",
    category_orders={
        "cluster_label": ["Low Experience", "Low Quality", "Average", "High Quality", "Elite"],
    },
)
fig.update_layout(
    title="Devs Exposed to What's App Update",
    xaxis=dict(title="Developer Quality Segments"),
    yaxis=dict(title="Availability Conversion Rate", ticksuffix="%"),
    font=dict(size=18),
    showlegend=False,  # colours repeat the x-axis categories, so no legend is shown
)
fig.show()

data = [fig]
# To export an embeddable HTML snippet of this figure, uncomment the next line:
#fig.write_html("../reports/figures/whatsapp_percentage_conv.html", full_html=False, include_plotlyjs='cdn')

In [105]:
# Number of exposed developers per cluster in the test arm.
fig = px.bar(
    data_frame=exp_results_test,
    x='cluster_label',
    y='exposed_cnt',
)
fig.show()

In [171]:

# Conversion rate per developer segment for the control arm, one coloured bar
# per cluster, with the segments shown in a fixed order along the x axis.
fig = px.bar(
    exp_results_control,
    x='cluster_label',
    y='conversion_rate',
    color="cluster_label",
    text_auto='.4s',  # d3-format string used for the value labels on the bars
    template="simple_white",
    category_orders={
        "cluster_label": ["Low Experience", "Low Quality", "Average", "High Quality", "Elite"],
    },
)
fig.update_layout(
    title="Devs Exposed to AMP",
    xaxis=dict(title="Developer Quality Segments"),
    yaxis=dict(title="Availability Conversion Rate", ticksuffix="%"),
    font=dict(size=18),
    showlegend=False,  # colours repeat the x-axis categories, so no legend is shown
)
fig.show()

data = [fig]

In [107]:
# Number of exposed developers per cluster in the control arm.
fig = px.bar(
    data_frame=exp_results_control,
    x='cluster_label',
    y='exposed_cnt',
)
fig.show()

## Calculation

In [120]:
def _expected_conversions(results, labels=None):
    """Return the rounded expected number of converting developers.

    Multiplies each cluster's ``valid_cnt`` by its ``conversion_rate`` (stored
    in percent), sums the products and rounds to the nearest integer.

    Parameters
    ----------
    results : pandas.DataFrame
        Frame with ``cluster_label``, ``valid_cnt`` and ``conversion_rate`` columns.
    labels : list of str, optional
        When given, only rows whose ``cluster_label`` is in ``labels`` are used;
        otherwise every row contributes.

    Returns
    -------
    int
        Rounded sum of ``valid_cnt * conversion_rate / 100``.
    """
    if labels is not None:
        results = results[results.cluster_label.isin(labels)]
    return round(results.valid_cnt.dot(results.conversion_rate / 100))


# Cluster subsets used by the scenarios below, written once to avoid
# copy-paste drift between the repeated filters.
quality_labels = ["Elite", "High Quality"]
without_low_exp_and_low_quality = ["Elite", "High Quality", "Average"]
without_low_exp = ["Elite", "High Quality", "Average", "Low Quality"]

# Current pool sizes (counts are read from the test frame).
quality_available = exp_results_test[exp_results_test.cluster_label.isin(quality_labels)].available_cnt.sum()
print(f"Total p2 pool: {exp_results_test.population_cnt.sum()}")
print(f"Total Valid p2 pool: {exp_results_test.valid_cnt.sum()}")
print(f"Total Available p2 pool: {round(exp_results_test.available_cnt.sum())}")
print(f"Total Quality Available p2 pool: {round(quality_available)}")

# Projected additional developers under the test-arm conversion rates.
print(f"Additional Available Developers if Whats App launched: {_expected_conversions(exp_results_test)}")
print(f"Additional Available Developers if Whats App launched all but Low Experience: {_expected_conversions(exp_results_test, without_low_exp)}")
print(f"Additional Available Developers if Whats App launched all but Low Experience & Low Quality: {_expected_conversions(exp_results_test, without_low_exp_and_low_quality)}")

# Same projection using the control-arm rates, plus the quality-only subsets.
print(f"Additional Available Developers if Whats App Not launched: {_expected_conversions(exp_results_control)}")
print(f"Additional Quality Available Developers if Whats App launched: {_expected_conversions(exp_results_test, quality_labels)}")
print(f"Additional Quality Available Developers if Whats App Not launched: {_expected_conversions(exp_results_control, quality_labels)}")

Total p2 pool: 46761
Total Valid p2 pool: 37237
Total Available p2 pool: 16377
Total Quality Available p2 pool: 5396
Additional Available Developers if Whats App launched: 8221
Additional Available Developers if Whats App launched all but Low Experience: 6133
Additional Available Developers if Whats App launched all but Low Experience & Low Quality: 4002
Additional Available Developers if Whats App Not launched: 3374
Additional Quality Available Developers if Whats App launched: 1801
Additional Quality Available Developers if Whats App Not launched: 776


In [112]:
# Projected share of quality ("Elite" + "High Quality") developers in the
# available pool if What's App is NOT launched (control-arm conversion rates).
# Every term is derived from the data frames instead of being copied by hand
# from a printed result, so the ratio cannot go stale when the data changes.
control_quality = exp_results_control[exp_results_control.cluster_label.isin(["Elite", "High Quality"])]
current_quality = exp_results_test[exp_results_test.cluster_label.isin(["Elite", "High Quality"])]

control_available_quality = int(current_quality.available_cnt.sum())       # was hard-coded 5396
control_available_total = int(exp_results_test.available_cnt.sum())        # was hard-coded 16377
# Rounded exactly as in the calculation cell so the hand-copied integers are reproduced.
control_gained_quality = int(round(control_quality.valid_cnt.dot(control_quality.conversion_rate / 100)))         # was 776
control_gained_total = int(round(exp_results_control.valid_cnt.dot(exp_results_control.conversion_rate / 100)))   # was 3374

round((control_available_quality + control_gained_quality) / (control_available_total + control_gained_total), 2)

0.31

In [123]:
# Projected share of quality ("Elite" + "High Quality") developers in the
# available pool if What's App IS launched (test-arm conversion rates).
# Every term is derived from the data frames instead of being copied by hand
# from a printed result, so the ratio cannot go stale when the data changes.
test_quality = exp_results_test[exp_results_test.cluster_label.isin(["Elite", "High Quality"])]

test_available_quality = int(test_quality.available_cnt.sum())          # was hard-coded 5396
test_available_total = int(exp_results_test.available_cnt.sum())        # was hard-coded 16377
# Rounded exactly as in the calculation cell so the hand-copied integers are reproduced.
test_gained_quality = int(round(test_quality.valid_cnt.dot(test_quality.conversion_rate / 100)))            # was 1801
test_gained_total = int(round(exp_results_test.valid_cnt.dot(exp_results_test.conversion_rate / 100)))      # was 8221

round((test_available_quality + test_gained_quality) / (test_available_total + test_gained_total), 4)

0.2926