### Importing Libraries

In [22]:
import pandas as pd
from collections import Counter
import ast
import kaleido
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import re
from io import StringIO
from html.parser import HTMLParser
import concurrent.futures

### Importing Data

In [24]:
# Load the first 1000 rows of the training table and the whole test table.
# The columns listed below are stored as text in the CSV files and are parsed
# back into Python objects with ast.literal_eval (presumably lists, since the
# training columns are exploded later — confirm against the CSV).
train_literal_columns = [
    'cell_type_shuffled',
    'code_markdown_shuffled',
    'cell_order_shuffled',
    'cell_rank_shuffled',
    'cell_order',
]
test_literal_columns = [
    'cell_type_shuffled',
    'code_markdown_shuffled',
]

train_final = pd.read_csv('data/train_final.csv', nrows=1000)
for literal_column in train_literal_columns:
    train_final[literal_column] = train_final[literal_column].apply(ast.literal_eval)

test_final = pd.read_csv('data/test_final.csv')
for literal_column in test_literal_columns:
    test_final[literal_column] = test_final[literal_column].apply(ast.literal_eval)

In [25]:
# Explode the parsed columns together so that every element of the per-row
# sequences gets its own row; the index is then renumbered from 0.
columns_to_explode = [
    'cell_type_shuffled',
    'code_markdown_shuffled',
    'cell_order_shuffled',
    'cell_rank_shuffled',
    'cell_order',
]
train_final_exploded = (
    train_final
    .explode(columns_to_explode)
    .reset_index(drop=True)
)

In [28]:
# Show the exploded rows of a single notebook id, ordered by cell_rank_shuffled.
is_sample_notebook = train_final_exploded['id'] == '00001756c60be8'
train_final_exploded.loc[is_sample_notebook].sort_values(by=['cell_rank_shuffled'])

Unnamed: 0,id,cell_type_shuffled,code_markdown_shuffled,cell_order_shuffled,cell_rank_shuffled,cell_order,ancestor_id,parent_id
0,00001756c60be8,code,# This Python 3 environment comes with many he...,1862f0a6,0,1862f0a6,945aea18,
37,00001756c60be8,markdown,Импортируем необходимые для работы функции и ...,448eb224,1,36002912,945aea18,
1,00001756c60be8,code,import numpy as np\nimport pandas as pd\nimpor...,2a9e43d6,2,448eb224,945aea18,
45,00001756c60be8,markdown,Подключаем предупреждения,7e2f170a,3,76512d50,945aea18,
2,00001756c60be8,code,import warnings\nwarnings.filterwarnings('igno...,038b763d,4,2a9e43d6,945aea18,
36,00001756c60be8,markdown,"Устанавливаем значения, чтобы везде был одина...",77e56113,5,23783525,945aea18,
3,00001756c60be8,code,matplotlib.rcParams.update({'font.size': 14}),2eefe0ef,6,7e2f170a,945aea18,
54,00001756c60be8,markdown,Задаем функцию для подсчета метрик,1ae087ab,7,5bf9ca51,945aea18,
4,00001756c60be8,code,"def evaluate_preds(train_true_values, train_pr...",0beab1cd,8,038b763d,945aea18,
49,00001756c60be8,markdown,Указываем путь к файлам с данными,8ffe0b25,9,8554b284,945aea18,


### Exploratory Data Analysis

In [None]:
# Pie chart of how often each value of `markdown_language` occurs in
# train_final: the `top_n` most frequent values get their own slice and the
# remainder is pooled into a single 'Others' slice.
top_n = 9

# value_counts() returns counts sorted from most to least frequent, so a
# positional split separates the head from the tail. `.iloc` makes the
# positional intent explicit (the index holds labels, not positions).
language_counts = train_final['markdown_language'].value_counts()
top_n_languages = language_counts.iloc[:top_n]
other_languages = language_counts.iloc[top_n:]

language_counts_grouped = top_n_languages.reset_index()
language_counts_grouped.columns = ['markdown_language', 'counts']
language_counts_grouped['markdown_language'] = language_counts_grouped['markdown_language'].str.upper()

# Append the pooled tail at the next free row label so the index stays contiguous.
language_counts_grouped.loc[len(language_counts_grouped)] = ['Others', other_languages.sum()]

fig = px.pie(language_counts_grouped,
             values='counts',
             names='markdown_language',
             width=800,
             height=1000,
             title='Markdown Language Ratios')

# Label slices with their percentage and separate them with a white outline.
fig.update_traces(textinfo='percent',
                  marker=dict(line=dict(color='white', width=3)))

fig.show('svg')

In [None]:
def _count_nonempty_lines(cells):
    """Return the number of non-empty lines in one notebook's cell sources.

    Parameters
    ----------
    cells : str or iterable of str
        Either the already-concatenated text or one string per cell.

    Returns
    -------
    int
        Number of lines that are not the empty string. Whitespace-only
        lines are still counted.

    Notes
    -----
    Cells are joined with a newline. Joining with the empty string would
    fuse the last line of a cell with the first line of the next one
    whenever a cell does not end in a newline, undercounting lines. A plain
    string is used as-is so it is not split into characters.
    """
    text = cells if isinstance(cells, str) else '\n'.join(cells)
    return sum(1 for line in text.split('\n') if line != '')


# One count per row of train_final, in row order. Values are iterated
# directly so the result does not depend on the frame having a 0..n-1 index.
line_counts_codes_list = np.array(
    [_count_nonempty_lines(cells) for cells in train_final['codes']]
)
line_counts_markdowns_list = np.array(
    [_count_nonempty_lines(cells) for cells in train_final['markdowns_cleaned']]
)

In [None]:
# Store the per-row line counts on train_final as two new columns.
line_count_columns = {
    'codes_line_counts': line_counts_codes_list,
    'markdowns_line_counts': line_counts_markdowns_list,
}
for line_count_column, line_count_values in line_count_columns.items():
    train_final[line_count_column] = line_count_values

In [None]:
# Two box plots side by side: line counts of code vs. line counts of markdown.
box_specs = [
    (line_counts_codes_list, 'Number of Lines (Codes)', 'indianred'),
    (line_counts_markdowns_list, 'Number of Lines (Markdowns)', 'lightseagreen'),
]

fig = go.Figure()
for box_values, box_name, box_color in box_specs:
    fig.add_trace(go.Box(y=box_values, name=box_name, marker_color=box_color))

layout_options = dict(
    template='plotly_white',
    width=1000,
    height=600,
    title="Distribution of the Number of Lines",
)
fig.update_layout(**layout_options)

fig.show('svg')

In [None]:
# Fraction of entries in line_counts_markdowns_list that are at most `n`,
# rounded to two decimals.
n = 1
n_line_markdown_ratio = np.round(np.mean(line_counts_markdowns_list <= n), 2)

ratio_message = "The ratio of notebooks that have lower than or equal to {} line(s) in their markdowns: {}"
print(ratio_message.format(n, n_line_markdown_ratio))

In [None]:
# Cumulative bar chart: for each threshold k = 1..25, the fraction of entries
# in line_counts_markdowns_list that are <= k.
x = list(range(1, 26))
y = [np.mean(line_counts_markdowns_list <= max_lines) for max_lines in x]

fig = go.Figure(data=[go.Bar(x=x, y=y)])

fig.update_traces(
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(color='white', width=0.50),
    ),
    opacity=1,
)

fig.update_layout(
    template='plotly_white',
    width=1000,
    height=600,
    title="Cumulative Bar Chart",
    xaxis_title="Number of Lines",
    yaxis_title="Ratio of Notebooks",
)

# Settings shared by both axes: visible axis line and grid.
shared_axis_style = dict(showline=True, showgrid=True, gridwidth=1.5, linewidth=2)

# Request roughly one tick per threshold on the x axis.
fig.update_xaxes(nticks=max(x) + 1, **shared_axis_style)

# The y axis is a ratio, so pin it to [0, 1].
fig.update_yaxes(nticks=11, range=[0, 1], **shared_axis_style)

fig.show('svg')

In [None]:
# Sort the line counts from largest to smallest, then keep the positions
# (Series labels) of the entries with more than 100 lines.
line_counts_codes_sorted = pd.Series(line_counts_codes_list).sort_values(ascending=False)
has_long_code = line_counts_codes_sorted > 100
long_codes_indices = line_counts_codes_sorted[has_long_code].index.tolist()

line_counts_markdowns_sorted = pd.Series(line_counts_markdowns_list).sort_values(ascending=False)
has_long_markdown = line_counts_markdowns_sorted > 100
long_markdowns_indices = line_counts_markdowns_sorted[has_long_markdown].index.tolist()

In [None]:
# Print the first three markdown entries of one of the long-markdown rows,
# separated by newlines.
# NOTE(review): position 398 is hard-coded; this raises IndexError when fewer
# than 399 rows exceed 100 markdown lines (only 1000 rows are loaded) — confirm.
selected_row_label = long_markdowns_indices[398]
selected_markdowns = train_final['markdowns_cleaned'][selected_row_label]

long_markdown = selected_markdowns[:3]
long_markdown_joined = '\n'.join(long_markdown)

print(long_markdown_joined)