### Installing Libraries

In [1]:
pip install googletrans==4.0.0rc1

Note: you may need to restart the kernel to use updated packages.


### Importing Libraries

In [2]:
import pandas as pd
from collections import Counter
import ast
import kaleido
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import re
from io import StringIO
from html.parser import HTMLParser
import concurrent.futures
from utils import translate
from multiprocessing import Pool

Using state Massachusetts server backend.


### Importing Data

In [11]:
# Columns stored in the CSV files as stringified Python literals
# (presumably lists, since they are exploded further down -- confirm against the files).
list_literal_columns = ['cell_types_shuffled', 'code_markdowns_shuffled']

train_final = pd.read_csv('train_final.csv')
test_final = pd.read_csv('test_final.csv')

# Turn every stringified literal back into a Python object, for both splits.
for frame in (train_final, test_final):
    for column in list_literal_columns:
        frame[column] = frame[column].apply(ast.literal_eval)

In [106]:
# Random subset of notebooks for quick experiments.
# - random_state pins the draw so Restart & Run All yields the same rows.
# - n is capped at the frame length so a small input does not raise
#   ("Cannot take a larger sample than population").
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42

train_final_sample = (
    train_final
    .sample(n=min(SAMPLE_SIZE, len(train_final)), random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

In [112]:
# Explode the two list columns together and renumber the rows from 0.
exploded_columns = ['cell_types_shuffled', 'code_markdowns_shuffled']
train_final_exploded = train_final.explode(exploded_columns).reset_index(drop=True)

# The translated column starts out as a copy of the original text.
train_final_exploded['code_markdowns_shuffled_translated'] = (
    train_final_exploded['code_markdowns_shuffled'].copy()
)

# Texts of the markdown rows whose markdown_language is not "en", in row order.
nonenglish_markdown_rows = train_final_exploded.query(
    'cell_types_shuffled == "markdown" & markdown_language != "en"'
)
nonenglish_markdowns = nonenglish_markdown_rows['code_markdowns_shuffled'].tolist()

In [120]:
# Display every exploded row (code and markdown alike) whose markdown_language is not "en".
train_final_exploded[train_final_exploded['markdown_language'] != 'en']

Unnamed: 0,id,cell_types_shuffled,code_markdowns_shuffled,cell_shuffled,cell_order,markdown_language,ancestor_id,parent_id,code_markdowns_shuffled_translated
0,00001756c60be8,code,# This Python 3 environment comes with many he...,1862f0a6 2a9e43d6 038b763d 2eefe0ef 0beab1cd 9...,1862f0a6 448eb224 2a9e43d6 7e2f170a 038b763d 7...,ru,945aea18,,# This Python 3 environment comes with many he...
1,00001756c60be8,code,import numpy as np\nimport pandas as pd\nimpor...,1862f0a6 2a9e43d6 038b763d 2eefe0ef 0beab1cd 9...,1862f0a6 448eb224 2a9e43d6 7e2f170a 038b763d 7...,ru,945aea18,,import numpy as np\nimport pandas as pd\nimpor...
2,00001756c60be8,code,import warnings\nwarnings.filterwarnings('igno...,1862f0a6 2a9e43d6 038b763d 2eefe0ef 0beab1cd 9...,1862f0a6 448eb224 2a9e43d6 7e2f170a 038b763d 7...,ru,945aea18,,import warnings\nwarnings.filterwarnings('igno...
3,00001756c60be8,code,matplotlib.rcParams.update({'font.size': 14}),1862f0a6 2a9e43d6 038b763d 2eefe0ef 0beab1cd 9...,1862f0a6 448eb224 2a9e43d6 7e2f170a 038b763d 7...,ru,945aea18,,matplotlib.rcParams.update({'font.size': 14})
4,00001756c60be8,code,"def evaluate_preds(train_true_values, train_pr...",1862f0a6 2a9e43d6 038b763d 2eefe0ef 0beab1cd 9...,1862f0a6 448eb224 2a9e43d6 7e2f170a 038b763d 7...,ru,945aea18,,"def evaluate_preds(train_true_values, train_pr..."
...,...,...,...,...,...,...,...,...,...
6370507,fffc3b44869198,code,test['bookID'],978a5137 faa48f03 28dfb12a eea2e812 64fef97c 4...,978a5137 faa48f03 28dfb12a eea2e812 64fef97c 4...,et,a6aaa8d7,,test['bookID']
6370508,fffc3b44869198,code,"df = pd.DataFrame(np.nan, index=[0,1,2,3], col...",978a5137 faa48f03 28dfb12a eea2e812 64fef97c 4...,978a5137 faa48f03 28dfb12a eea2e812 64fef97c 4...,et,a6aaa8d7,,"df = pd.DataFrame(np.nan, index=[0,1,2,3], col..."
6370509,fffc3b44869198,code,df,978a5137 faa48f03 28dfb12a eea2e812 64fef97c 4...,978a5137 faa48f03 28dfb12a eea2e812 64fef97c 4...,et,a6aaa8d7,,df
6370510,fffc3b44869198,code,"df.to_csv('file_name.csv', index=False)",978a5137 faa48f03 28dfb12a eea2e812 64fef97c 4...,978a5137 faa48f03 28dfb12a eea2e812 64fef97c 4...,et,a6aaa8d7,,"df.to_csv('file_name.csv', index=False)"


In [111]:
%%time

print(translator.translate(sample_nonenglish_markdown[:1000], dest='en').text)

Exploratory and data analysis:
 Form analysis:
 Identification of the Target: The Target is already identified. It is made up of two values ​​0 for non -presence and 1 for risk. The Target is almost balanced with 46 non -risk of heart stop and 54 risk of heart stop
 Number of lines and columns: We have 14 variables in our dataset and 303 patients diagnosed. The Target is the 14th variable
 Types of variables: Dataset is already treated for categorical variables (encoding of category variables). Category variables are all the same identifiable, these are variables: sex, CP, FBS, remaining, exang, slope, ca, thal. The other variables are discreet: AGE, Trestbps, Chol, Thalach and a continuous variable: Oldpeak
 Identification of missing values: no missing data

 Basic analysis:
 Target relationship and continuous quantitative variables: Oldpeak variable (st depression at the level of the electrocar
CPU times: user 32 ms, sys: 3.25 ms, total: 35.3 ms
Wall time: 660 ms


In [55]:
%%time
if __name__ == '__main__':
    with Pool(8) as p:
        translated_markdowns = p.map(translate, nonenglish_markdowns)

Using state Massachusetts server backend.
Using state Massachusetts server backend.
Using state Massachusetts server backend.
Using state Massachusetts server backend.
Using state Massachusetts server backend.
Using state Massachusetts server backend.
Using state Massachusetts server backend.
Using state Massachusetts server backend.


CPU times: user 59.4 ms, sys: 243 ms, total: 303 ms
Wall time: 3min 35s


In [60]:
# Row labels of the non-English markdown rows, selected with the same query string
# that produced nonenglish_markdowns.
nonenglish_markdown_index = train_final_exploded.query(
    'cell_types_shuffled == "markdown" & markdown_language != "en"'
).index

# Overwrite the copied originals with their translations for exactly those rows.
train_final_exploded.loc[
    nonenglish_markdown_index, 'code_markdowns_shuffled_translated'
] = translated_markdowns

In [63]:
# Show one randomly drawn non-English markdown row.
is_nonenglish_markdown = (
    (train_final_exploded['cell_types_shuffled'] == 'markdown')
    & (train_final_exploded['markdown_language'] != 'en')
)
train_final_exploded[is_nonenglish_markdown].sample()

Unnamed: 0,id,cell_types_shuffled,code_markdowns_shuffled,cell_shuffled,cell_order,markdown_language,ancestor_id,parent_id,code_markdowns_shuffled_translated
1718,30d7c3aba9dab5,markdown,Определяем константы\n\n\n\n Локус варианты в...,dff6ca6f 3fe8e7af 6e32d24f b013be74 ce20afa6 4...,dff6ca6f 3fe8e7af 6e32d24f b013be74 1b01e864 c...,ru,9b72661b,,We determine the constants \n\n\n\n Locus Opt...


### Exploratory Data Analysis

In [None]:
# Number of individual languages to show before the rest are lumped into "Others".
top_n = 9

# value_counts sorts by descending frequency, so the first top_n entries are the most common.
language_counts = train_final['markdown_language'].value_counts()

top_n_languages = language_counts.iloc[:top_n]
other_languages = language_counts.iloc[top_n:]

# Build the plotting frame directly from the counts, with upper-cased language codes.
language_counts_grouped = pd.DataFrame({
    'markdown_language': top_n_languages.index.str.upper(),
    'counts': top_n_languages.to_numpy(),
})

# Append the aggregated remainder at the next free row label (the original used
# top_n + 1, which skipped a label). Skip the row when nothing remains, so the
# chart does not get an empty "Others" slice.
if not other_languages.empty:
    language_counts_grouped.loc[len(language_counts_grouped)] = [
        'Others',
        other_languages.sum(),
    ]

fig = px.pie(language_counts_grouped,
             values='counts',
             names='markdown_language',
             width=800,
             height=1000,
             title='Markdown Language Ratios')

# Show percentages only and separate the slices with a white outline.
fig.update_traces(textinfo='percent',
                  marker=dict(line=dict(color='white', width=3)))

fig.show('svg')

In [None]:
# TODO: add widgets to show notebooks before and after translation.

In [None]:
def _count_nonempty_lines(cells):
    """Concatenate ``cells`` with no separator and count the non-empty lines.

    NOTE(review): if ``cells`` is a list of cell strings, joining with ''
    fuses the last line of one cell with the first line of the next, so that
    boundary counts as a single line -- confirm this is intended.
    """
    return sum(1 for line in ''.join(cells).split('\n') if line != '')


# NOTE(review): the 'codes' and 'markdowns_cleaned' columns are not created in any
# cell visible here -- confirm they exist in train_final before running.
n_notebooks = train_final.shape[0]

# One count per notebook, looked up by integer row label 0..n_notebooks-1.
line_counts_codes_list = np.array(
    [_count_nonempty_lines(train_final['codes'][row]) for row in range(n_notebooks)]
)
line_counts_markdowns_list = np.array(
    [_count_nonempty_lines(train_final['markdowns_cleaned'][row]) for row in range(n_notebooks)]
)

In [None]:
# Store the per-notebook line counts as new columns on train_final.
line_count_columns = {
    'codes_line_counts': line_counts_codes_list,
    'markdowns_line_counts': line_counts_markdowns_list,
}
for column_name, line_counts in line_count_columns.items():
    train_final[column_name] = line_counts

In [None]:
# Box plots of the per-notebook line counts: code first, then markdown.
box_traces = [
    go.Box(y=line_counts_codes_list,
           name='Number of Lines (Codes)',
           marker_color='indianred'),
    go.Box(y=line_counts_markdowns_list,
           name='Number of Lines (Markdowns)',
           marker_color='lightseagreen'),
]

fig = go.Figure(data=box_traces)

fig.update_layout(template='plotly_white',
                  width=1000,
                  height=600,
                  title="Distribution of the Number of Lines")

fig.show('svg')

In [None]:
# Share of notebooks whose markdown line count is at most n, rounded to 2 decimals.
n = 1
n_line_markdown_ratio = np.round(np.mean(line_counts_markdowns_list <= n), 2)

print(f"The ratio of notebooks that have lower than or equal to {n} line(s) in their markdowns: {n_line_markdown_ratio}")

In [None]:
# Thresholds 1..25 and, for each, the share of notebooks with at most that many markdown lines.
x = [*range(1, 26)]
y = [np.mean(line_counts_markdowns_list <= threshold) for threshold in x]

fig = go.Figure([
    go.Bar(x=x,
           y=y,
           marker_color='rgb(158,202,225)',
           marker_line_color='white',
           marker_line_width=0.50,
           opacity=1)
])

fig.update_layout(template='plotly_white',
                  width=1000,
                  height=600,
                  title="Cumulative Bar Chart",
                  xaxis_title="Number of Lines",
                  yaxis_title="Ratio of Notebooks")

# Axis settings shared by both axes.
axis_style = dict(showline=True, showgrid=True, gridwidth=1.5, linewidth=2)

fig.update_xaxes(nticks=max(x) + 1, **axis_style)
fig.update_yaxes(nticks=11, range=[0, 1], **axis_style)

fig.show('svg')

In [None]:
# Line counts sorted from longest to shortest; the index keeps each notebook's original position.
line_counts_codes_sorted = pd.Series(line_counts_codes_list).sort_values(ascending=False)
line_counts_markdowns_sorted = pd.Series(line_counts_markdowns_list).sort_values(ascending=False)

# Positions of notebooks with more than 100 lines, still in descending order of length.
is_long_code = (line_counts_codes_sorted > 100).to_numpy()
long_codes_indices = line_counts_codes_sorted.index[is_long_code].tolist()

is_long_markdown = (line_counts_markdowns_sorted > 100).to_numpy()
long_markdowns_indices = line_counts_markdowns_sorted.index[is_long_markdown].tolist()

In [None]:
# Preview the first three markdown entries of one long notebook.
# Position 398 is hard-coded; the lookup raises IndexError when fewer than
# 399 notebooks exceed the length threshold.
example_notebook_index = long_markdowns_indices[398]
long_markdown = train_final['markdowns_cleaned'][example_notebook_index][:3]
long_markdown_joined = '\n'.join(long_markdown)

print(long_markdown_joined)

In [None]:
def translate(texts):
    """Translate every element of ``texts`` with ``ts.google``.

    Args:
        texts: iterable of values, each passed on its own to ``ts.google``.

    Returns:
        list with one ``ts.google`` result per input element, in input order.

    NOTE(review): this definition shadows the ``translate`` imported from
    ``utils`` in the import cell, and ``ts`` is not imported in any cell
    visible here (presumably the ``translators`` package -- confirm).
    """
    translated_texts = []
    for text in texts:
        translated_texts.append(ts.google(text))
    return translated_texts

In [None]:
# nonenglish_index = train_final['language']!='en'

In [None]:
# train_final['markdowns_translated'] = None

In [None]:
# train_final.loc[~nonenglish_index, 'markdowns_translated'] = train_final['markdowns_cleaned'][~nonenglish_index].values

In [None]:
# from tqdm import tqdm
# tqdm.pandas()
# train_final.loc[nonenglish_index, 'markdowns_translated'] = train_final['markdowns_cleaned'][nonenglish_index].progress_apply(translate).values

In [None]:
# Print each original text followed by its translation, with separator lines between them.
# NOTE(review): `markdowns` is not assigned in any cell visible here -- confirm it is
# the list that `translated_markdowns` was produced from.
for position, original_text in enumerate(markdowns):
    print(original_text,
          '',
          '---------',
          '',
          translated_markdowns[position],
          '',
          '#########',
          '',
          sep='\n')

In [None]:
# ANCESTRY (?)