In [1]:
# `display` and `HTML` are exposed by IPython.display; importing them from
# IPython.core.display is deprecated and emits a warning on import.
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

  from IPython.core.display import display, HTML


In [2]:
import pandas as pd
import os, sys
from statistics import mean
import plotly.express as px
import plotly.graph_objects as go

In [3]:
# Path of the CSV with the question data (a file path, despite the name).
DATA_DIR = "data/all_questions_data.csv"

# Load the CSV into the working frame used by the rest of the notebook.
data = pd.read_csv(filepath_or_buffer=DATA_DIR)

In [4]:
# Order the questions by 'Question ID' and renumber the index from 0.
# The sorted frame is rebound to `data` rather than sorted in place.
data = data.sort_values(by='Question ID', ignore_index=True)

In [5]:
# Preview the first five rows of the sorted frame.
data.head(n=5)

Unnamed: 0,Question Title,Question Slug,Question ID,Question Text,Topic Tagged text,Topic Tagged ID,Difficulty Level,Success Rate,total submission,total accepted,company tag,Likes,Dislikes,Hints,Similar Questions ID,Similar Questions Text
0,Two Sum,two-sum,1,Given an array of integers nums and an integer...,"Array,Hash Table","VG9waWNUYWdOb2RlOjU=,VG9waWNUYWdOb2RlOjY=",Easy,48.5,13207990,6403821,,31242,988,A really brute force way would be to search fo...,"15,18,167,170,560,653,1083,1798,1830,2116,2133...","3Sum,4Sum,Two Sum II - Input Array Is Sorted,T..."
1,Add Two Numbers,add-two-numbers,2,You are given two non-empty linked lists repre...,"Linked List,Math,Recursion","VG9waWNUYWdOb2RlOjc=,VG9waWNUYWdOb2RlOjg=,VG9w...",Medium,38.5,6987977,2690949,,17799,3682,,436737141544510311774,"Multiply Strings,Add Binary,Sum of Two Integer..."
2,Longest Substring Without Repeating Characters,longest-substring-without-repeating-characters,3,"Given a string s, find the length of the longe...","Hash Table,String,Sliding Window","VG9waWNUYWdOb2RlOjY=,VG9waWNUYWdOb2RlOjEw,VG9w...",Medium,33.0,9621884,3175843,,22941,1027,,159340103418132209,Longest Substring with At Most Two Distinct Ch...
3,Median of Two Sorted Arrays,median-of-two-sorted-arrays,4,Given two sorted arrays nums1 and nums2 of siz...,"Array,Binary Search,Divide and Conquer","VG9waWNUYWdOb2RlOjU=,VG9waWNUYWdOb2RlOjEx,VG9w...",Hard,34.0,3941694,1340565,,15987,1964,,,
4,Longest Palindromic Substring,longest-palindromic-substring,5,"Given a string s, return the longest palindrom...","String,Dynamic Programming","VG9waWNUYWdOb2RlOjEw,VG9waWNUYWdOb2RlOjEz",Medium,31.8,5618701,1784028,,17097,1005,How can we reuse a previously computed palindr...,214266336516647,"Shortest Palindrome,Palindrome Permutation,Pal..."


## For each difficulty level

In [7]:
# Tally the questions per difficulty level.
diff_level = data['Difficulty Level'].value_counts()

# Shape the tally into a two-column frame: level name + number of questions.
dl_df = diff_level.rename_axis('Difficulty Level').reset_index(name='Total Questions')

# Horizontal bar chart, one bar per difficulty level.
fig = px.bar(
    dl_df,
    x='Total Questions',
    y='Difficulty Level',
    orientation='h',
    title="Number of questions for each difficulty level",
)
fig.show()

In [8]:
# Generate questions/topics stats
# Every accumulator is keyed by topic name; a question tagged with several
# topics contributes to each of them.
topics_count = {}       # topic -> number of questions tagged with it
topics_dl_count = {}    # topic -> {'Easy': n, 'Medium': n, 'Hard': n}
topics_l_dl_count = {}  # topic -> {'Likes': total, 'Dislikes': total}
topics_acc = {}         # topic -> list of 'Success Rate' values (one per question)
topics_subs = {}        # topic -> summed 'total submission'

stat_columns = ['Topic Tagged text', 'Difficulty Level', 'Likes', 'Dislikes', 'Success Rate', 'total submission']

for _, que in data[stat_columns].iterrows():

    tagged = que['Topic Tagged text']
    # Skip rows whose tag field is not a string (e.g. a missing value).
    if not isinstance(tagged, str):
        continue

    d_level = que['Difficulty Level']
    likes = que['Likes']
    dislikes = que['Dislikes']
    acc = que['Success Rate']
    subs = que['total submission']

    for topic in tagged.split(','):
        # Topic count
        topics_count[topic] = topics_count.get(topic, 0) + 1

        # Topic with difficulty level.
        # All levels start at zero and the current question is counted under
        # its own level, including the first time a topic is seen. Seeding
        # every level with 1 would inflate the totals and ignore the level of
        # that first question.
        level_counts = topics_dl_count.setdefault(topic, {'Easy': 0, 'Medium': 0, 'Hard': 0})
        level_counts[d_level] = level_counts.get(d_level, 0) + 1

        # Likes/Dislikes
        reactions = topics_l_dl_count.setdefault(topic, {'Likes': 0, 'Dislikes': 0})
        reactions['Likes'] = reactions['Likes'] + likes
        reactions['Dislikes'] = reactions['Dislikes'] + dislikes

        # Topics accuracy: keep the raw values, they are averaged later.
        topics_acc.setdefault(topic, []).append(acc)

        # Topics Submission
        topics_subs[topic] = topics_subs.get(topic, 0) + subs


### 1. Number of Questions from each topic

In [9]:
# Turn the per-topic tally into a two-column frame: topic name + question count.
tc_df = (
    pd.Series(topics_count)
    .rename_axis('Topics')
    .reset_index(name='Total Questions')
)

# Vertical bars coloured by count, with the value written above each bar.
fig = px.bar(
    tc_df,
    x='Topics',
    y='Total Questions',
    color="Total Questions",
    text_auto='.2s',
    title="Number of questions in each topics",
)
fig.update_traces(
    textposition="outside",
    textangle=0,
    textfont_size=12,
    cliponaxis=False,
)
fig.show()

### 1.1 all topics with each difficulty level

In [10]:
# One row per topic, one column per difficulty level.
topic_level_counts = pd.DataFrame(topics_dl_count).T

fig = px.bar(
    topic_level_counts,
    title="Total Questions with each topic with each difficulty level",
)
fig.show()

### 2. Most Liked/Disliked Questions

In [11]:
# Rank by each metric (highest first) and keep only the top row.
# reset_index() moves the original row label into an 'index' column.
liked_que = data.sort_values(by='Likes', ascending=False).head(1).reset_index()
disliked_que = data.sort_values(by='Dislikes', ascending=False).head(1).reset_index()

In [12]:
# Two-row summary table: the most liked and the most disliked question.
# liked_que / disliked_que are single-row frames, so .iloc[0] pulls out the
# scalar cell value for each table cell.
question_titles = [
    liked_que['Question Title'].iloc[0],
    disliked_que['Question Title'].iloc[0],
]
# The "Dislikes" row reports the Dislikes count of the most disliked question;
# reading its 'Likes' column here would show the wrong number under that label.
reaction_counts = [
    liked_que['Likes'].iloc[0],
    disliked_que['Dislikes'].iloc[0],
]

fig = go.Figure(data=[go.Table(
    header=dict(values=['<b>Question</b>', '<b>Likes/Dislikes</b>', '<b>Count</b>']),
    cells=dict(values=[question_titles, ['Likes', 'Dislikes'], reaction_counts]),
)])
fig.update_layout(title="Most Liked/Disliked Questions", height=250)
fig.show()

#### 2.1 Most Liked/Disliked Topics

In [13]:
# Rows are topics, columns are 'Likes' / 'Dislikes'.
l_dl_totals = pd.DataFrame(topics_l_dl_count).T
# reset_index() moves the topic names into an 'index' column.
l_dl_topics_df = l_dl_totals.reset_index()

In [14]:
# Single-row frames holding the topic with the highest total for each metric.
liked_topic = l_dl_topics_df.sort_values(by='Likes', ascending=False).iloc[:1]
disliked_topic = l_dl_topics_df.sort_values(by='Dislikes', ascending=False).iloc[:1]

In [15]:
# One row for the most liked topic and one for the most disliked topic.
# Both single-row frames are rendered so the table matches its title;
# showing only liked_topic would leave disliked_topic unused.
topic_rows = [liked_topic, disliked_topic]

fig = go.Figure(data=[go.Table(
    header=dict(values=['<b>Topic</b>', '<b>Most Liked/Disliked</b>', '<b>Likes</b>', '<b>Dislikes</b>']),
    cells=dict(values=[
        [row['index'].iloc[0] for row in topic_rows],     # topic name
        ['Most Liked', 'Most Disliked'],                  # which ranking the row is from
        [row['Likes'].iloc[0] for row in topic_rows],
        [row['Dislikes'].iloc[0] for row in topic_rows],
    ]),
)])
fig.update_layout(title="Most Liked/Disliked Topic", height=250)
fig.show()

## 3. Topics with lowest/highest success rate

In [16]:
# Average the collected success rates per topic, rounded to two decimals.
mean_topic_acc = {
    topic: round(mean(acc_list), 2)
    for topic, acc_list in topics_acc.items()
}

# Two-column frame (topic, mean accuracy), ordered from lowest to highest mean.
mean_acc_df = (
    pd.Series(mean_topic_acc)
    .rename_axis('Topics')
    .reset_index(name='Mean Acc')
    .sort_values(by='Mean Acc')
)

In [17]:
# mean_acc_df is sorted ascending by 'Mean Acc', so its first row is the topic
# with the lowest mean accuracy and its last row the one with the highest.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two single-row frames.
min_max = pd.concat([mean_acc_df.head(1), mean_acc_df.tail(1)]).reset_index(drop=True)
min_max

Unnamed: 0,Topics,Mean Acc
0,Shell,27.32
1,Database,66.3


In [18]:
# Bubble plot of the two extreme topics; bubble size and label both show the
# mean accuracy. The title spelling ("minimum"/"maximum") is corrected here.
fig = px.scatter(
    min_max,
    x="Topics",
    y="Mean Acc",
    size="Mean Acc",
    color="Topics",
    text="Mean Acc",
    size_max=100,
    template="plotly_white",
    title="Topics with average minimum and maximum accuracy",
)
fig.update_layout(width=600, height=600)
fig.show()

## 5. Most/Least famous questions to solve

In [19]:
# 'total submission' is used as the measure of how famous a question is.
# Rank the questions once, most submissions first.
by_submissions = data.sort_values(by='total submission', ascending=False)

most_fam_que = by_submissions.head(1)    # top of the ranking
least_fam_que = by_submissions.tail(1)   # bottom of the ranking

In [20]:
# Display the full record of the question with the fewest submissions
# (the last row of the descending 'total submission' ranking).
least_fam_que

Unnamed: 0,Question Title,Question Slug,Question ID,Question Text,Topic Tagged text,Topic Tagged ID,Difficulty Level,Success Rate,total submission,total accepted,company tag,Likes,Dislikes,Hints,Similar Questions ID,Similar Questions Text
2234,Number of Times a Driver Was a Passenger,number-of-times-a-driver-was-a-passenger,2376,,,,Medium,82.3,186,153,,6,1,,177917851795,"Hopper Company Queries I,Hopper Company Querie..."


In [21]:
# Column contents for the two-row table: most famous question first.
famous_titles = [most_fam_que['Question Title'], least_fam_que['Question Title']]
famous_labels = ['Most Famous', 'Least Famous']
famous_subs = [most_fam_que['total submission'], least_fam_que['total submission']]

famous_table = go.Table(
    header=dict(values=['<b>Question</b>', '<b>Most/Least Famous</b>', '<b>Total Submission</b>']),
    cells=dict(values=[famous_titles, famous_labels, famous_subs]),
)
fig = go.Figure(data=[famous_table])
fig.update_layout(title="Most/Least famous Questions", height=260)
fig.show()

### 5.1 Most/Least famous Topics

In [22]:
# Two-column frame (topic, summed submissions), most submitted topic first.
subs_df = (
    pd.Series(topics_subs)
    .rename_axis('Topics')
    .reset_index(name='total submission')
    .sort_values(by='total submission', ascending=False)
)

In [23]:
# subs_df is ordered by submissions, descending: first row = most famous topic,
# last row = least famous topic. Both are kept as single-row frames.
most_fam_topic = subs_df.iloc[:1]
least_fam_topic = subs_df.iloc[-1:]

In [24]:
# Two-row table naming the most and the least famous topic.
famous_topic_names = [most_fam_topic['Topics'], least_fam_topic['Topics']]
famous_topic_labels = ['Most Famous', 'Least Famous']

famous_topic_table = go.Table(
    header=dict(values=['<b>Topics</b>', '<b>Most/Least Famous</b>']),
    cells=dict(values=[famous_topic_names, famous_topic_labels]),
)
fig = go.Figure(data=[famous_topic_table])
fig.update_layout(title="Most/Least famous Topics", height=260)
fig.show()

## Top leading topics on Leetcode

In [25]:
# Number of topics taken from each ranking.
TOP_N = 5

# Top topics by number of questions. Taking the N largest rows (instead of a
# fixed question-count threshold) keeps this a true "top 5" as the data changes.
top5_topics = tc_df.sort_values(by=['Total Questions'], ascending=False).head(TOP_N)
# subs_df is already ordered by total submissions, descending.
top5_subs = subs_df.head(TOP_N)
# Top topics by accumulated likes.
top5_l_dl = l_dl_topics_df.sort_values(by=['Likes'], ascending=False).head(TOP_N)

# A topic is "leading" when it appears in all three rankings.
# sorted() gives the tuple a stable order (set iteration order is not fixed).
leading = tuple(sorted(
    set(top5_topics['Topics']) & set(top5_subs['Topics']) & set(top5_l_dl['index'])
))

In [26]:
# Per-difficulty counts restricted to the leading topics, one row per topic,
# ordered by the number of Medium questions (largest first).
level_counts_by_topic = pd.DataFrame(topics_dl_count)
leading_df = (
    level_counts_by_topic[list(leading)]
    .T
    .sort_values(by='Medium', ascending=False)
)

fig = px.bar(leading_df, title="Top leading topics on Leetcode")
# Hide the y axis.
fig.update_yaxes(visible=False)

fig.show()