In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import sklearn as skl

import statsmodels.api as sm
from statsmodels.formula.api import ols

import scipy.stats as stats


In [2]:
df = pd.read_csv("dfanova_new.csv")

new_df = df[["Cluster", "rentabilidad_usd"]]

In [3]:
######################################## ANOVA Analysis ##############################################

# Pivot so that classes are the columns:

new_df = new_df.pivot(columns="Cluster", values="rentabilidad_usd")

new_df.head()


Cluster,0,1,2,3,4,5,6
0,,,,2436.0,,,
1,,,,,31150.0,,
2,-117.0,,,,,,
3,,,105.0,,,,
4,,,,,,,389873.0


In [4]:
series = []

for i in range(7):
    series.append(new_df[i])

In [5]:
# Drop NaNs

series[0] = [x for x in series[0] if pd.isnull(x) == False]
series[1] = [x for x in series[1] if pd.isnull(x) == False]
series[2] = [x for x in series[2] if pd.isnull(x) == False]
series[3] = [x for x in series[3] if pd.isnull(x) == False]
series[4] = [x for x in series[4] if pd.isnull(x) == False]
series[5] = [x for x in series[5] if pd.isnull(x) == False]
series[6] = [x for x in series[6] if pd.isnull(x) == False]


In [6]:
# Check for length:

for i in range(7):
    print(len(series[i]))

628
383
1242
674
1106
1221
909


In [7]:
good_df = pd.DataFrame(series)

good_df.head(7)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1232,1233,1234,1235,1236,1237,1238,1239,1240,1241
0,-117.0,2760.0,227.0,42728.0,249101.0,53463.0,47886.0,137913.0,28726.0,173.0,...,,,,,,,,,,
1,61662.0,344382.0,-56.0,39862.0,62862.0,71472.0,154.0,84435.0,65101.0,342393.0,...,,,,,,,,,,
2,105.0,58285.0,105955.0,425035.0,138310.0,53782.0,232530.0,10393.0,304.0,56120.0,...,90065.0,308735.0,11151.0,488088.0,66.0,14972.0,109.0,38936.0,7236.0,5920.0
3,2436.0,63065.0,42556.0,21834.0,16670.0,-1094.0,26271.0,155819.0,105043.0,42306.0,...,,,,,,,,,,
4,31150.0,15341.0,28368.0,42233.0,50357.0,7375.0,368619.0,9556.0,47846.0,-29970.0,...,,,,,,,,,,
5,7340.0,129827.0,26685.0,31455.0,151271.0,-16022.0,542629.0,90.0,466768.0,99459.0,...,,,,,,,,,,
6,389873.0,33029.0,29367.0,26201.0,719.0,229.0,11417.0,61118.0,1168.0,14816.0,...,,,,,,,,,,


In [8]:
good_df.shape

(7, 1242)

In [9]:
good_df = good_df.transpose()

In [10]:
good_df.shape

(1242, 7)

In [11]:
# Fill null values

for i in range(7):
    mean_value=good_df[i].mean()
    good_df[i].fillna(value=good_df[i].mean(), inplace=True)

In [12]:
print(good_df)

                  0              1         2             3             4  \
0       -117.000000   61662.000000     105.0   2436.000000  31150.000000   
1       2760.000000  344382.000000   58285.0  63065.000000  15341.000000   
2        227.000000     -56.000000  105955.0  42556.000000  28368.000000   
3      42728.000000   39862.000000  425035.0  21834.000000  42233.000000   
4     249101.000000   62862.000000  138310.0  16670.000000  50357.000000   
...             ...            ...       ...           ...           ...   
1237   63091.867834   69257.013055   14972.0  62756.879822  62820.547007   
1238   63091.867834   69257.013055     109.0  62756.879822  62820.547007   
1239   63091.867834   69257.013055   38936.0  62756.879822  62820.547007   
1240   63091.867834   69257.013055    7236.0  62756.879822  62820.547007   
1241   63091.867834   69257.013055    5920.0  62756.879822  62820.547007   

                  5             6  
0       7340.000000  389873.00000  
1     129827.00

In [13]:
# ANOVA analysis

fvalue, pvalue = stats.f_oneway(good_df[0], good_df[1], good_df[2], good_df[3], good_df[4], good_df[5], good_df[6])
print(fvalue, pvalue)

2.4380344747663147 0.023433893112453804


In [14]:
# Figure 23: SALE VALUE

# Replacing with average
# 2.9533474945171285 p = 0.007010181844102753

# Replacing with median
# 17.608762400176733 p = 2.223983671508818e-20


# Figure 24: PROFITABILITY

# Replacing with average
# 2.4380344747663147 0.023433893112453804

# Replacing with median
# 17.66123493384489 1.9146669966000232e-20


In [15]:
# Figure 25: Tukey test results for profitability

from statsmodels.stats.multicomp import pairwise_tukeyhsd

tukey = pairwise_tukeyhsd(endog=df['rentabilidad_usd'],
                          groups=df['Cluster'],
                          alpha=0.05)

print(tukey)

     Multiple Comparison of Means - Tukey HSD, FWER=0.05      
group1 group2   meandiff  p-adj     lower      upper    reject
--------------------------------------------------------------
     0      1   6165.1452    0.9 -16149.1271 28479.4175  False
     0      2   2628.7103    0.9 -14223.8754 19481.2959  False
     0      3    -334.988    0.9 -19423.9398 18753.9638  False
     0      4   -271.3208    0.9 -17468.3537  16925.712  False
     0      5  -3974.3036    0.9 -20875.4888 12926.8816  False
     0      6   8765.4534 0.7483  -9093.7272  26624.634  False
     1      2   -3536.435    0.9 -23652.9551 16580.0852  False
     1      3  -6500.1332    0.9  -28524.048 15523.7815  False
     1      4   -6436.466    0.9 -26842.4132 13969.4811  False
     1      5 -10139.4488 0.7271 -30296.7006 10017.8031  False
     1      6   2600.3082    0.9 -18366.6915 23567.3078  False
     2      3  -2963.6983    0.9 -19429.8975 13502.5009  False
     2      4  -2900.0311    0.9 -17129.7737 11329.7115

In [16]:
# Figure 26: Tukey test for sale price

tukey = pairwise_tukeyhsd(endog=df['venta_usd'],
                          groups=df['Cluster'],
                          alpha=0.05)

print(tukey)

     Multiple Comparison of Means - Tukey HSD, FWER=0.05      
group1 group2   meandiff  p-adj     lower      upper    reject
--------------------------------------------------------------
     0      1   2747.8362    0.9 -26023.8346  31519.507  False
     0      2  -1643.5268    0.9 -23372.9849 20085.9314  False
     0      3  -7662.4617    0.9 -32275.4551 16950.5318  False
     0      4   4252.9501    0.9 -17920.6328 26426.5331  False
     0      5 -10801.9316 0.7399 -32594.0534 10990.1901  False
     0      6   5739.3504    0.9 -17287.9954 28766.6963  False
     1      2  -4391.3629    0.9 -30329.2868  21546.561  False
     1      3 -10410.2979    0.9 -38807.5863 17986.9906  False
     1      4   1505.1139    0.9 -24805.9926 27816.2204  False
     1      5 -13549.7678 0.6954 -39540.2105 12440.6749  False
     1      6   2991.5143    0.9 -24043.0044  30026.033  False
     2      3  -6018.9349    0.9 -27250.1925 15212.3227  False
     2      4   5896.4769    0.9 -12451.1288 24244.0826