### Importing Libraries

In [23]:
import pandas as pd
from collections import Counter
import ast
import kaleido
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import re
from io import StringIO
from html.parser import HTMLParser
import concurrent.futures

### Importing Data

In [24]:
def _read_with_list_columns(csv_path, list_columns):
    """Read a CSV and parse the given columns with ``ast.literal_eval``.

    Parameters
    ----------
    csv_path : str
        Path handed to ``pd.read_csv``.
    list_columns : list of str
        Columns whose cells are Python literals stored as text; each cell is
        converted back to the object it encodes, in the order listed.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the listed columns parsed.
    """
    frame = pd.read_csv(csv_path)
    for column_name in list_columns:
        frame[column_name] = frame[column_name].apply(ast.literal_eval)
    return frame


# Training data: five columns are parsed.
train_final = _read_with_list_columns(
    'data/train_final.csv',
    ['cell_type_shuffled',
     'code_markdown_shuffled',
     'cell_order_shuffled',
     'cell_rank_shuffled',
     'cell_order'],
)

# Test data: only the two columns below are parsed.
test_final = _read_with_list_columns(
    'data/test_final.csv',
    ['cell_type_shuffled', 'code_markdown_shuffled'],
)

In [25]:
# These four list columns are exploded together, producing one row per list
# element; the index is then renumbered from 0.
shuffled_list_columns = [
    'cell_type_shuffled',
    'code_markdown_shuffled',
    'cell_order_shuffled',
    'cell_rank_shuffled',
]
train_final_exploded = train_final.explode(shuffled_list_columns)
train_final_exploded = train_final_exploded.reset_index(drop=True)

In [51]:
# Show 'cell_type_shuffled' for the first row whose id is 0002115f48f982.
train_final.loc[train_final['id'] == '0002115f48f982', 'cell_type_shuffled'].values[0]

['code', 'code', 'code', 'code', 'code', 'code', 'code', 'code', 'markdown']

In [60]:
# Show 'code_markdown_shuffled' for the first row whose id is 0002115f48f982.
train_final.loc[train_final['id'] == '0002115f48f982', 'code_markdown_shuffled'].values[0]

['import numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n%matplotlib inline\nimport os\nprint(os.listdir("../input"))\n',
 "df = pd.read_csv('../input/metadata_train.csv')\ndf.info()",
 'df.head()',
 "#let's check if targets are consistent within the same measurement id\ntargets = df.groupby('id_measurement')[['target','id_measurement']].agg('mean')\ntargets.head()",
 'sns.countplot(x=\'target\',data=targets)\n# it should be only "1" and "0" but we have cases where target is not consitent ',
 "mislabeled = targets.loc[(targets.target <1 ) & (targets.target > 0.3) ,'id_measurement']\nprint(str(mislabeled.shape[0]) + ' measurments most likely mislabeled' )\n",
 '# qc it all',
 '\ndf.loc[df.id_measurement.isin(mislabeled) ,:]',
 'Hi there,\n\nIs it ok that the same measurement have different target labels between signals?\nAccording to data description it should be the same (or no

In [62]:
# Show 'cell_rank_shuffled' for the first row whose id is 0002115f48f982.
train_final.loc[train_final['id'] == '0002115f48f982', 'cell_rank_shuffled'].values[0]

[1, 2, 3, 4, 5, 6, 7, 8, 0]

In [56]:
# Show 'cell_order' for the first row whose id is 0002115f48f982.
train_final.loc[train_final['id'] == '0002115f48f982', 'cell_order'].values[0]

['9ec225f0',
 '18281c6c',
 'e3b6b115',
 '4a044c54',
 '365fe576',
 'a3188e54',
 'b3f6e12d',
 'ee7655ca',
 '84125b7a']

In [59]:
# Show 'cell_order_shuffled' for the first row whose id is 0002115f48f982.
train_final.loc[train_final['id'] == '0002115f48f982', 'cell_order_shuffled'].values[0]

['18281c6c',
 'e3b6b115',
 '4a044c54',
 '365fe576',
 'a3188e54',
 'b3f6e12d',
 'ee7655ca',
 '84125b7a',
 '9ec225f0']

### Exploratory Data Analysis

In [None]:
# Number of languages that get their own slice; every remaining language is
# folded into a single 'Others' slice.
top_n = 9

# value_counts() returns counts sorted in descending order, so the first
# top_n entries are the most frequent languages.
language_counts = train_final['markdown_language'].value_counts()

# Explicit positional slicing (.iloc) rather than relying on how [] treats
# integer slices for this index type.
top_n_languages = language_counts.iloc[:top_n]
other_languages = language_counts.iloc[top_n:]

language_counts_grouped = pd.DataFrame(top_n_languages).reset_index()
# Overwrite the column names so the result does not depend on what
# reset_index() happens to call them.
language_counts_grouped.columns = ['markdown_language', 'counts']
language_counts_grouped['markdown_language'] = language_counts_grouped['markdown_language'].str.upper()

# Append the aggregated remainder only when there is one, so the chart never
# gets a zero-count 'Others' slice; the row is written at the next free
# positional label to keep the index contiguous.
if not other_languages.empty:
    language_counts_grouped.loc[len(language_counts_grouped)] = ['Others', other_languages.sum()]


fig = px.pie(language_counts_grouped,
             values='counts',
             names='markdown_language',
             width=800,
             height=1000,
             title='Markdown Language Ratios')

# Show percentages on the slices and separate them with a white outline.
fig.update_traces(textinfo='percent',
                  marker=dict(line=dict(color='white', width=3)))

fig.show('svg')

In [None]:
def _count_nonempty_lines(cells):
    """Return the number of non-empty lines contained in ``cells``.

    Parameters
    ----------
    cells : str or iterable of str
        A single text, or a collection of texts (one per cell).

    Returns
    -------
    int
        Total count of lines that are not the empty string.

    Notes
    -----
    Every text is split on newline characters on its own. Concatenating the
    texts with an empty separator before splitting would glue the last line of
    one cell to the first line of the next whenever a cell does not end with a
    newline, and so under-count the lines.
    """
    if isinstance(cells, str):
        # A bare string is treated as one text, not as an iterable of characters.
        cells = [cells]
    return sum(1 for cell in cells for line in cell.split('\n') if line != '')


# NOTE(review): assumes every entry of 'codes' and 'markdowns_cleaned' is a list
# of cell-source strings (or one string) — confirm where these columns are built.
# Iterating the column values (instead of looking rows up by the label r) keeps
# this correct even if train_final does not have a default 0..n-1 index.
line_counts_codes_list = np.array(
    [_count_nonempty_lines(cells) for cells in train_final['codes']]
)
line_counts_markdowns_list = np.array(
    [_count_nonempty_lines(cells) for cells in train_final['markdowns_cleaned']]
)

In [None]:
# Store the two line-count arrays as columns of train_final.
for column_name, line_counts in (('codes_line_counts', line_counts_codes_list),
                                 ('markdowns_line_counts', line_counts_markdowns_list)):
    train_final[column_name] = line_counts

In [None]:
# One box per series: (values, legend label, marker colour).
box_specs = [
    (line_counts_codes_list, 'Number of Lines (Codes)', 'indianred'),
    (line_counts_markdowns_list, 'Number of Lines (Markdowns)', 'lightseagreen'),
]

fig = go.Figure(
    data=[go.Box(y=values, name=label, marker_color=colour)
          for values, label, colour in box_specs]
)

layout_options = dict(
    template='plotly_white',
    width=1000,
    height=600,
    title="Distribution of the Number of Lines",
)
fig.update_layout(**layout_options)

# Render as a static SVG image.
fig.show('svg')

In [None]:
n = 1  # line-count threshold

# The mean of a boolean mask is the fraction of True entries, i.e. the share
# of entries with at most n lines; rounded to two decimals for display.
n_line_markdown_ratio = np.round((line_counts_markdowns_list <= n).mean(), 2)

message = "The ratio of notebooks that have lower than or equal to {} line(s) in their markdowns: {}"
print(message.format(n, n_line_markdown_ratio))

In [None]:
# Thresholds 1..25 and, for each, the fraction of entries whose markdown line
# count is at or below that threshold.
x = list(range(1, 26))
y = [(line_counts_markdowns_list <= limit).mean() for limit in x]

# Bar styling is passed straight to the trace constructor.
bars = go.Bar(
    x=x,
    y=y,
    opacity=1,
    marker=dict(color='rgb(158,202,225)',
                line=dict(color='white', width=0.50)),
)
fig = go.Figure(data=[bars])

fig.update_layout(
    template='plotly_white',
    width=1000,
    height=600,
    title="Cumulative Bar Chart",
    xaxis_title="Number of Lines",
    yaxis_title="Ratio of Notebooks",
)

# Settings shared by both axes.
axis_style = dict(showline=True, showgrid=True, gridwidth=1.5, linewidth=2)

fig.update_xaxes(nticks=max(x) + 1, **axis_style)
fig.update_yaxes(nticks=11, range=[0, 1], **axis_style)

fig.show('svg')

In [None]:
# Both count arrays sorted from largest to smallest; the Series index keeps the
# original position of every entry.
line_counts_codes_sorted = pd.Series(line_counts_codes_list).sort_values(ascending=False)
line_counts_markdowns_sorted = pd.Series(line_counts_markdowns_list).sort_values(ascending=False)

# Original positions of the entries with more than 100 lines, longest first.
long_codes_indices = (
    line_counts_codes_sorted
    .loc[lambda counts: counts > 100]
    .index
    .tolist()
)
long_markdowns_indices = (
    line_counts_markdowns_sorted
    .loc[lambda counts: counts > 100]
    .index
    .tolist()
)

In [None]:
# Take entry 398 of long_markdowns_indices, keep the first three elements of
# that row's 'markdowns_cleaned' value, and print them one per line.
selected_row = long_markdowns_indices[398]
long_markdown = train_final['markdowns_cleaned'][selected_row][:3]
long_markdown_joined = '\n'.join(long_markdown)

print(long_markdown_joined)