In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms.traversal.depth_first_search import dfs_tree

In [None]:
!sudo apt-get install -y graphviz graphviz-dev
!pip install pygraphviz

In [None]:
# Load the per-kernel statistics table and preview the first rows.
KERNELS_CSV = '../input/otto-recommender-kernel-statistics/otto-recommender-system_kernels.csv'
df = pd.read_csv(KERNELS_CSV)
df.head()

# Top-20 kernel creators 

In [None]:
# Kernel count per author, most prolific first; show the top 20.
author_counts = df['KernelAuthor'].value_counts()
author_counts.head(20)

# Performance tiers of kernel creators

In [None]:
# Kernel count per value of the author's performance tier.
tier_counts = df['KernelAuthorPerformanceTier'].value_counts()
tier_counts

# Kernel medals

In [None]:
# Kernel count per medal class (rows with a missing medal are not counted).
medal_counts = df['medal'].value_counts()
medal_counts

# Medals per kernel creator (top-20)

In [None]:
# Medal table: one row per author, one column per medal class, ranked by
# gold, then silver, then bronze counts.
MEDAL_ORDER = ['GOLD', 'SILVER', 'BRONZE']
(
    df.groupby('KernelAuthor')['medal']
      .value_counts()
      # fill_value=0 keeps the counts as integers instead of introducing NaN/floats.
      .unstack(fill_value=0)
      # reindex (rather than [[...]]) so a medal class that never occurs in the
      # data becomes an all-zero column instead of raising KeyError.
      .reindex(columns=MEDAL_ORDER, fill_value=0)
      .sort_values(MEDAL_ORDER, ascending=False)
      .head(20)
)

# Kernels vs Language 

In [None]:
# Kernel count per notebook language.
language_counts = df['languageName'].value_counts()
language_counts

# What fraction of kernels are forks?

In [None]:
# Share of kernels that are forks; a missing flag is counted as "not a fork".
is_fork = df['isFork'].fillna(0)
is_fork.mean()

# How many lines are changed in forks?

In [None]:
# Summary statistics of the ForkLinesChanged column.
fork_lines_changed = df['ForkLinesChanged']
fork_lines_changed.describe()

# What fraction of kernels enable the GPU?

In [None]:
# Share of kernels with the GPU enabled; a missing flag is counted as disabled.
gpu_enabled = df['isGpuEnabled'].fillna(0)
gpu_enabled.mean()

# Upvotes vs views 

In [None]:
# Scatter of votes (x) against views (y), one point per kernel.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(df['totalVotes'].values, df['totalViews'].values)
ax.set_xlabel('totalVotes')
ax.set_ylabel('totalViews')
ax.grid(True)
plt.show()

In [None]:
# Scatter of votes (x) against best public score (y), one point per kernel.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(df['totalVotes'].values, df['bestPublicScore'].values)
ax.set_xlabel('totalVotes')
ax.set_ylabel('bestPublicScore')
# Restrict the y-axis to the 0.45-0.6 score band; points outside it are not shown.
ax.set_ylim([0.45, 0.6])
ax.grid(True)
plt.show()

# Most commented kernels 

In [None]:
# Titles of the 20 kernels with the most comments, renumbered from 0.
comments_by_kernel = df[['title', 'totalComments']]
most_commented = comments_by_kernel.sort_values('totalComments', ascending=False)
most_commented.head(20).reset_index(drop=True)

# Fork graph

In [None]:
# Row-wise array of (title, votes, kernel id, parent id) used to build the fork graph.
fork_columns = ['title', 'totalVotes', 'id', 'forkParent']
forks = df[fork_columns].to_numpy()
# Lookup kernel id (column 2) -> vote count (column 1).
votes_dict = dict(zip(forks[:, 2], forks[:, 1]))

In [None]:
# Directed fork graph: one node per kernel, one edge parent -> fork.
G = nx.DiGraph()

# First pass: register every kernel in the table with its title and votes.
for title, votes, k_id, parent_id in forks:
    G.add_node(k_id, title=title, votes=votes)

# Second pass: link each fork to its parent. A parent id that is not in the
# table is created implicitly by add_edge as a node without attributes.
for title, votes, k_id, parent_id in forks:
    # -1 is skipped (presumably the dataset's "not a fork" marker - confirm).
    # A missing (NaN) parent is skipped as well: NaN != -1 is True, so without
    # this guard it would be inserted as a bogus parent node.
    if pd.notna(parent_id) and parent_id != -1:
        G.add_edge(parent_id, k_id)

In [None]:
# Weakly connected groups of kernels, computed on an undirected copy of G.
all_components = list(nx.connected_components(G.to_undirected()))

# Keep only the groups with more than one member...
connected_components = [group for group in all_components if len(group) != 1]

# ...and remove from G every kernel that forms a group on its own.
G.remove_nodes_from(
    member for group in all_components if len(group) == 1 for member in group
)

In [None]:
# Draw the fork graph using Graphviz's "dot" layout.
plt.figure(figsize=(30, 8))
pos = nx.nx_agraph.graphviz_layout(G, prog="dot")
nx.draw(G, pos=pos, node_size=10000)

# Label every node, then turn the labels vertical and white.
text = nx.draw_networkx_labels(G, pos)
for label in text.values():
    label.set(rotation='vertical', color='white')
plt.show()

In [None]:
# For every node: how many nodes are reachable from it, and how many votes
# the node and everything reachable from it collected together.
nodes_subtree_size, nodes_cum_votes = {}, {}
for root in G.nodes():
    reachable = dfs_tree(G, root).nodes()
    # The DFS tree contains the root itself, hence the -1.
    nodes_subtree_size[root] = len(reachable) - 1
    # Nodes without a vote entry in votes_dict contribute 0.
    nodes_cum_votes[root] = sum(votes_dict.get(member, 0) for member in reachable)
        

In [None]:
# Per-kernel fork statistics. .assign() builds a new frame, so no columns are
# written onto a selection of df (which triggers SettingWithCopyWarning).
# Kernels that are not in the graph get 0 for both statistics.
forks = df[['title', 'KernelAuthor', 'id', 'forkParent']].assign(
    TotalForkedKernels=lambda frame: frame['id'].map(nodes_subtree_size).fillna(0),
    CumulatedVotes=lambda frame: frame['id'].map(nodes_cum_votes).fillna(0),
)

# Most forked kernels (top-20)

In [None]:
# The 20 kernels with the largest TotalForkedKernels, renumbered from 0.
by_fork_count = forks.sort_values('TotalForkedKernels', ascending=False)
by_fork_count.head(20).reset_index(drop=True)

# Kernels with the most cumulative votes (top-20)

In [None]:
# The 20 kernels with the largest CumulatedVotes, renumbered from 0.
by_cum_votes = forks.sort_values('CumulatedVotes', ascending=False)
by_cum_votes.head(20).reset_index(drop=True)