In [None]:
import warnings

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Default canvas size (width, height in inches) for matplotlib/seaborn figures.
plt.rcParams.update({'figure.figsize': (20, 6)})
# Keep pandas' PerformanceWarning out of the cell outputs.
warnings.simplefilter('ignore', pd.errors.PerformanceWarning)

In [None]:
def cramers_corrected_stat(confusion_matrix):
    """Bias-corrected Cramér's V for categorical-categorical association.

    Uses the correction from Bergsma, Wicher (2013), "A bias-correction for
    Cramér's V and Tschuprow's T", Journal of the Korean Statistical
    Society 42: 323-328.

    Parameters
    ----------
    confusion_matrix : array-like, 2-D
        Contingency table of observed counts (rows are the levels of one
        variable, columns the levels of the other).

    Returns
    -------
    float
        Non-negative association strength (0 means no association).
        ``nan`` is returned when the statistic is undefined: a table with
        fewer than two rows or columns, at most one observation, or a
        non-positive corrected dimension.
    """
    observed = np.asarray(confusion_matrix)
    r, k = observed.shape
    n = observed.sum()

    # A single row/column or a single observation leaves nothing to associate
    # and would otherwise divide by zero below.
    if r < 2 or k < 2 or n <= 1:
        return np.nan

    # The statistic is defined on the plain Pearson chi-square. SciPy applies
    # Yates' continuity correction to 2x2 tables by default, which would make
    # a perfectly associated binary pair score below 1, so it is disabled.
    chi2 = chi2_contingency(observed, correction=False)[0]
    phi2 = chi2 / n

    # Bias-corrected phi^2 and bias-corrected table dimensions.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denom = min(kcorr - 1, rcorr - 1)
    if denom <= 0:
        # Happens when a dimension is as large as the sample size.
        return np.nan
    return np.sqrt(phi2corr / denom)

def get_association(df, var1, var2):
    """Bias-corrected Cramér's V between two categorical columns of ``df``.

    Builds the ``var1`` x ``var2`` contingency table by counting non-null
    ``id`` values per pair of levels and passes it to
    ``cramers_corrected_stat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'id'``, ``var1`` and ``var2``.
    var1, var2 : str
        Column names; intended to be two different columns.

    Returns
    -------
    float
        Whatever ``cramers_corrected_stat`` returns for the table.
    """
    contingency = (
        df
        .groupby([var1, var2])
        ['id']
        .count()
        # Level pairs that never co-occur are real zero counts. Leaving them
        # as NaN would turn the chi-square, and so the result, into NaN.
        .unstack(fill_value=0)
    )
    return cramers_corrected_stat(contingency.to_numpy())

In [None]:
# Load the car listings and drop the text/URL columns before the analysis.
df = (
    pd.read_csv('data/car_features.csv')
    .drop(columns=['title', 'seller_description', 'other_features', 'link', 'image'])
)

##### Anúncios por montadora/modelo

In [None]:
# Bar chart of listing counts for the 25 most frequent makers;
# 'mercedes-benz' is relabelled as 'mercedes' on the axis.
(
    df['maker']
    .value_counts()
    .head(25)
    .reset_index()
    .assign(maker=lambda t: t['maker'].where(t['maker'] != 'mercedes-benz', 'mercedes'))
    .pipe(lambda top_makers: sns.barplot(top_makers, x='maker', y='count'))
);

In [None]:
# Listing counts per model, restricted to models with at least 100 listings.
plot = sns.barplot(
    df['model'].value_counts().reset_index().loc[lambda t: t['count'] >= 100],
    x='model',
    y='count',
)
plt.xticks(rotation=90);

In [None]:
# Names of the models that have at least 20 listings.
top_listings = (
    df['model']
    .value_counts()
    .reset_index()
    .loc[lambda t: t['count'] >= 20, 'model']
)

# Keep only those models and only listings priced at 500,000 or less.
df = df[df['model'].isin(top_listings) & (df['price'] <= 500000)]

In [None]:
# Price spread per model, with a dashed red reference line at 50,000.
plot = sns.boxplot(data=df, x='model', y='price')
plot.axhline(50000, color='red', ls='--', lw=1)
plt.xticks(rotation=90)
plt.show();

In [None]:
# Price spread per model year, for listings with a known year of 2000 or later.
sns.boxplot(
    data=df.loc[df['year'].notna() & (df['year'] >= 2000)],
    x='year',
    y='price',
)

In [None]:
# Pairwise association (bias-corrected Cramér's V via get_association)
# between the feature columns found from position 14 onwards.
# corr_df = first column of df followed by those feature columns.
# NOTE(review): get_association counts the 'id' column, so df must have one
# (presumably column 0) — confirm.
corr_df = pd.concat([df.iloc[:, [0]], df.iloc[:, 14:]], axis=1)
feature_cols = corr_df.columns[1:]

# Start from the identity: a feature paired with itself gets 1 on the
# diagonal without calling get_association.
corr = pd.DataFrame(
    np.eye(len(feature_cols)),
    index=feature_cols,
    columns=feature_cols,
)

# Cells are written with .at. Writing `corr[row, col] = value` on a DataFrame
# would add a new column named (row, col) instead of setting a cell, leaving
# the matrix itself untouched.
# The measure is symmetric, so each unordered pair is computed once and
# mirrored across the diagonal.
for i, col_i in enumerate(feature_cols):
    for col_j in feature_cols[i + 1:]:
        assoc = get_association(df, col_i, col_j)
        corr.at[col_i, col_j] = assoc
        corr.at[col_j, col_i] = assoc

fig = go.Figure(
    data=go.Heatmap(
        z=corr.values,
        x=corr.columns,
        y=corr.index,
        colorscale='Viridis',
        # The association values are non-negative, so the colour scale
        # starts at 0 rather than -1.
        zmin=0, zmax=1
    )
)

fig.update_layout(
    title='Correlation Heatmap',
    xaxis_title='Features',
    yaxis_title='Features',
    width=900,
    height=900
)

fig.show()

In [None]:
# One price boxplot per feature column (positions 14 onwards), comparing
# listings where the feature equals 1 ('Sim') against all others ('Não').
aux = df.copy()
discrete_vars = df.iloc[:, 14:].columns

# Size the grid from the number of features instead of assuming exactly 27.
# With 27 features this is still a 4x7 grid on a 20x15 canvas.
n_cols = 7
n_rows = max(1, -(-len(discrete_vars) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 3.75 * n_rows))
axes = axes.flatten()

# The price filter does not depend on the feature being plotted, so it is
# evaluated once here rather than on every iteration.
affordable = aux['price'] <= 100000

for ax, var in zip(axes, discrete_vars):
    aux[var] = np.where(aux[var] == 1, 'Sim', 'Não')

    sns.boxplot(
        data=aux.loc[affordable, [var, 'price']],
        x=var,
        y='price',
        ax=ax
    )
    ax.set_title(var)

# Hide every grid cell that did not receive a plot.
for ax in axes[len(discrete_vars):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Models left out of the boxplot below.
# NOTE(review): the criterion used to pick these models is not shown here — confirm.
bad_models = [
    'fusca', 'parati', 'palio weekend', 'uno', 'celta',
    'rampage', 'commander', 'taos', 'l 200 triton', 'hilux sw4',
    'hilux caminhonete', 'montana', 'meriva', 'clio hatch', 'ecosport',
    'palio', 'escort', 'rav 4', 'pajero full', 'hr-v',
    'range rover evoque', 'toro', 'f-250', 'march', 'ranger',
    's10', 'santa fe', 'freelander', 'f-1000', 'renegade',
    'compass', 'corolla cross', 'city hatch', 'captiva sport', 'crv',
    'gol', 'corsa hatch', 'ka hatch', 'corsa sedan', 'voyage',
    'saveiro', 'passat', 'strada', 'opala', 'zafira',
    'idea', 'astra hatch', 'santana', 'omega', 'captur',
    'astra sedan', 'vectra sedan', 'sportage', 'doblo', 'fox',
    'space fox', 'kombi', 'fiorino furgão', 'kwid', 'pajero tr4',
    'tucson', 'vectra hatch', 'duster', 'tracker', 'santa fé',
    'freelander 2', 'freemont', 'outlander', 'amarok', 'siena',
    'golf', 'crossfox', 'agile', 'mobi', 'soul',
]

# Price spread for the remaining models, with a dashed red line at 50,000.
plot = sns.boxplot(data=df[~df['model'].isin(bad_models)], x='model', y='price')
plot.axhline(50000, color='red', ls='--', lw=1)
plt.xticks(rotation=90)
plt.show();