In [None]:
import pandas as pd
# Read the breed ranking table and preview its first rows as the cell output.
rank_df = pd.read_csv('/kaggle/input/dog-breeds/breed_rank.csv')
rank_df.head()

In [None]:
# Read the wide (one row per breed) traits table and preview its first rows.
traits_df = pd.read_csv('/kaggle/input/dog-breeds/breed_traits.csv')
traits_df.head()

In [None]:
# Read the long-format traits table (it has Breed, Trait and Trait_Score columns,
# all of which are used below) and preview its first ten rows.
long_df = pd.read_csv('/kaggle/input/dog-breeds/breed_traits_long.csv')
long_df.head(10)

In [None]:
# Keep every trait except the two coat traits and cast the remaining scores to int.
# NOTE(review): presumably the coat traits are excluded because their Trait_Score
# values are not numeric -- confirm against the data.
long_numerical_df = (
    long_df
    .loc[~long_df['Trait'].isin({'Coat Type', 'Coat Length'})]
    .assign(score=lambda frame: frame['Trait_Score'].astype(int))
)
# Collapse to one row per breed holding the sum of that breed's integer scores.
short_df = (
    long_numerical_df
    .groupby(by='Breed')[['score']]
    .sum()
    .reset_index()
)
short_df.shape

It may be dubious to assume that every trait score points in the same direction (that is, that a 5 is always more desirable than a 1) and that all traits should be weighted equally, but that's what we're going to do here.

In [None]:
# Join the 2020 rank, the summed trait score and the two coat columns on Breed;
# inner joins keep only the breeds present in all three tables.
df = (
    rank_df[['Breed', '2020 Rank']]
    .merge(right=short_df, on='Breed', how='inner')
    .merge(right=traits_df[['Breed', 'Coat Type', 'Coat Length']], on='Breed', how='inner')
    # Flip the ranking (lower is better) into a value where larger is better:
    # the breed with the largest rank number gets popularity 1.
    .assign(popularity=lambda merged: 1 + merged['2020 Rank'].max() - merged['2020 Rank'])
)
df.head()

In [None]:
from plotly.express import scatter
# Summed trait score against popularity, one point per breed, with a single
# OLS trendline fitted across all points rather than one per colour/symbol group.
scatter(
    data_frame=df,
    x='score',
    y='popularity',
    hover_name='Breed',
    symbol='Coat Type',
    color='Coat Length',
    trendline='ols',
    trendline_scope='overall',
)

Our trendline slopes upward, so very broadly we can say that the objective measures described above (the summed trait scores) are positively correlated with popularity. However, the OLS fit has an $R^2$ of only about 0.1, so it may be more accurate to say there's no accounting for taste.

In [None]:
# Build the modelling frame: every column of traits_df except the two coat
# columns, plus the popularity target derived from the 2020 rank. Breed and the
# raw rank are dropped at the end so that only features and the target remain.
data_df = (
    rank_df[['Breed', '2020 Rank']]
    .merge(right=traits_df.drop(columns=['Coat Type', 'Coat Length']), on='Breed', how='inner')
    .assign(popularity=lambda merged: 1 + merged['2020 Rank'].max() - merged['2020 Rank'])
    .drop(columns=['Breed', '2020 Rank'])
)
data_df.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from typing import Optional


def get_score(random_state: int, test_size: float = 0.1, n_estimators: Optional[int] = None) -> float:
    """Fit a random forest on one random train/test split and return its test error.

    Reads the notebook-level ``data_df``: every column except ``popularity`` is
    used as a feature and ``popularity`` is the regression target.

    Args:
        random_state: Seed used for both the shuffled train/test split and the forest.
        test_size: Share of the rows held out for testing, passed straight to
            ``train_test_split``. The default of 0.1 matches the original notebook.
        n_estimators: Number of trees in the forest. ``None`` (the default) keeps
            the notebook's original choice of one tree per held-out row.

    Returns:
        The mean absolute error of the forest's predictions on the held-out rows.
    """
    features = data_df.drop(columns=['popularity'])
    target = data_df['popularity']
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state, shuffle=True,
    )
    if n_estimators is None:
        # NOTE(review): tying the tree count to the test-set size looks arbitrary;
        # it is kept as the default so existing results do not change. Pass
        # n_estimators explicitly to decouple the two.
        n_estimators = len(X_test)
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X=X_train, y=y_train)
    return mean_absolute_error(y_true=y_test, y_pred=model.predict(X=X_test))

# One test-set error per seed, for seeds 0 through 199, plotted in seed order.
scores = list(map(get_score, range(200)))
pd.DataFrame({'score': scores}).plot()

Yeah, regression isn't really suited to a ranking problem, so it's not surprising that we get essentially random results. And, again, there's no accounting for taste. We would also expect popularity to be serially correlated from year to year, yet our model ignores the prior years' rankings altogether.