### Importing Libraries

In [1]:
import pandas as pd
from collections import Counter
import ast
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

### Importing Data

In [2]:
# Load the prepared training table from the current working directory.
train_final = pd.read_csv('train_final.csv')

# 'codes' and 'markdowns' are stored in the CSV as Python-literal strings;
# ast.literal_eval parses them back into Python objects without executing code.
for list_col in ('codes', 'markdowns'):
    train_final[list_col] = train_final[list_col].apply(ast.literal_eval)

# Give rows with a missing language an explicit 'N/A' label
# (value_counts() would otherwise drop them, since it skips NaN by default).
train_final['language'] = train_final['language'].fillna('N/A')

### Exploratory Data Analysis

In [6]:
# Number of individual languages to show; the rest are pooled into 'Others'.
top_n = 9

# value_counts() sorts by frequency (descending), so the first top_n entries
# are the most common languages.
language_counts = train_final['language'].value_counts()

# Slice by position explicitly: the index holds language labels, not integers.
top_n_languages = language_counts.iloc[:top_n]
other_languages = language_counts.iloc[top_n:]

# Build the plotting table in one step (upper-cased top-N labels followed by a
# pooled 'Others' row) so the row index is contiguous.
# 'Others' is always included, with a count of 0 when nothing is left over.
language_counts_grouped = pd.DataFrame({
    'language': [*top_n_languages.index.str.upper(), 'Others'],
    'counts': [*top_n_languages.to_numpy(), other_languages.sum()],
})

fig = px.pie(language_counts_grouped,
             values='counts', names='language',
             title='Markdown Language Ratios')

# Show only the percentage on each slice; white borders separate adjacent slices.
fig.update_traces(textinfo='percent',
                  marker=dict(line=dict(color='white', width=3)))

fig.show()

In [28]:
def _nonempty_line_count(cells):
    """Return the number of non-empty lines in the concatenation of ``cells``.

    The strings are joined with no separator before being split on newlines,
    so a cell that does not end in a newline shares a line with the start of
    the next cell.
    """
    return sum(1 for line in ''.join(cells).split('\n') if line != '')


# One count per row of train_final, in row order.
line_counts_codes_list = np.array(
    [_nonempty_line_count(cells) for cells in train_final['codes']]
)
line_counts_markdowns_list = np.array(
    [_nonempty_line_count(cells) for cells in train_final['markdowns']]
)

In [29]:
# Side-by-side box plots of the per-row line counts for code and markdown.
fig = go.Figure()
fig.add_trace(go.Box(y=line_counts_codes_list,
                     name='Number of Lines (Codes)',
                     marker_color='indianred'))

fig.add_trace(go.Box(y=line_counts_markdowns_list,
                     name='Number of Lines (Markdowns)',
                     marker_color='lightseagreen'))

# A title and y-axis label let the figure stand on its own when skimmed.
fig.update_layout(template='plotly_white',
                  title='Line Counts per Row: Codes vs. Markdowns',
                  yaxis_title='Number of non-empty lines')

fig.show()

In [67]:
# Positions of rows whose line counts exceed the cut-offs
# (more than 440 code lines, more than 400 markdown lines).
# NOTE(review): the cut-offs look hand-picked, presumably from the box plot — confirm.
is_long_code = line_counts_codes_list > 440
is_long_markdown = line_counts_markdowns_list > 400

long_codes_indices = np.flatnonzero(is_long_code)
long_markdowns_indices = np.flatnonzero(is_long_markdown)

In [73]:
# Print one row's markdown in full, with its cells concatenated without a separator.
# NOTE(review): 817 is a hard-coded row label, presumably taken from
# long_markdowns_indices — confirm.
markdown_cells_817 = train_final.loc[817, 'markdowns']
print(''.join(markdown_cells_817))

<div style="color:white;
           display:fill;
           border-radius:15px;
           font-size:110%;
           font-family:cursive;
           letter-spacing:0.5px;
           background-color:#8fa2ff;
           color:Black;
           font-family:cursive;
            padding:5px 5px 5px 5px;
           ">
<h1 style="text-align:center;font-weight: bold">Data Preprocessing</h1></div>

<div style="color:white;
           display:fill;
           border-radius:15px;
           font-size:110%;
           font-family:cursive;
           letter-spacing:0.5px;
           background-color:#8fa2ff;
           color:Black;
           font-family:cursive;
            padding:5px 5px 5px 5px;
           ">
<h2 style="text-align:center;font-weight: bold">Correlation Plot</h2></div>

### Above plots shows the distribution of New Leagues among Different Salaries<div style="color:white;
           display:fill;
           border-radius:10px;
           font-size:110%;
           font-family:c

In [64]:
# np.argmax returns a position within the counts array, so pick the row by
# position (.iloc) rather than treating that position as an index label.
longest_markdown_pos = int(np.argmax(line_counts_markdowns_list))
long_markdown = train_final['markdowns'].iloc[longest_markdown_pos]

# Concatenate the row's markdown cells with no separator between them.
long_markdown_joined = ''.join(long_markdown)

print(long_markdown_joined)

# Training Model and Prediction
First, we will train a model based on preprocessed values of training data set.
Second, let's predict test values based on the trained model.target, 48df886f9, 0deb4b6a8, 34b15f335, a8cb14b00, 2f0771a37, 30347e683, d08d1fbe3, 6ee66e115, 20aa07010, dc5a8f1d8, 11d86fa6a, 77c9823f2, 8d6c2a0b2, 4681de4fd, adf119b9a, cff75dd09, 96f83a237, b8a716ebf, 6c7a4567c, 4fcfd2b4d, f3b9c0b95, 71cebf11c, d966ac62c, 68b647452, c88d108c9, ff7b471cd, 0d866c3d7, bc3f77679, bd8f989f1, 0eff5bf95, 22ed6dba3, 92b13ebba, 233c7c17c, 2cb4d123e, 87ffda550, 822e49b95, 316b978cd, d04e16aed, 5d5c5ce6d, ec863cb52, 11ad148bd, ea18d720e, 408d86ce9, 69d8b4020, b0868a049, 80b14398e, ecdef52b2, c36bcacc5, 151d318cd, ab0cddb90, 2d6bd8275, dfdf4b580, c87722678, a8c320153, a04684f1f, fa977f17b, feed9d437, 645b47cde, 7298ca1ef, d80abf8bc, 654dd8a3b, 7fc39583c, 8c94b6675, e421c414e, 964335fdf, 5a86cabd0, 6cb207ac9, c1bc828da, c4d38135f, 968b01ca3, 26b423c42, 0656586a4, b4ced4b7a, 0b8e10df6, 603e8

In [65]:
# Keep only space-delimited tokens shorter than 20 characters, then print the
# survivors re-joined with single spaces.
short_tokens = (token for token in long_markdown_joined.split(' ') if len(token) < 20)
print(' '.join(short_tokens))

# Training Model and Prediction
First, we will train a model based on preprocessed values of training data set.
Second, let's predict test values based on the trained model.target, 48df886f9, 0deb4b6a8, 34b15f335, a8cb14b00, 2f0771a37, 30347e683, d08d1fbe3, 6ee66e115, 20aa07010, dc5a8f1d8, 11d86fa6a, 77c9823f2, 8d6c2a0b2, 4681de4fd, adf119b9a, cff75dd09, 96f83a237, b8a716ebf, 6c7a4567c, 4fcfd2b4d, f3b9c0b95, 71cebf11c, d966ac62c, 68b647452, c88d108c9, ff7b471cd, 0d866c3d7, bc3f77679, bd8f989f1, 0eff5bf95, 22ed6dba3, 92b13ebba, 233c7c17c, 2cb4d123e, 87ffda550, 822e49b95, 316b978cd, d04e16aed, 5d5c5ce6d, ec863cb52, 11ad148bd, ea18d720e, 408d86ce9, 69d8b4020, b0868a049, 80b14398e, ecdef52b2, c36bcacc5, 151d318cd, ab0cddb90, 2d6bd8275, dfdf4b580, c87722678, a8c320153, a04684f1f, fa977f17b, feed9d437, 645b47cde, 7298ca1ef, d80abf8bc, 654dd8a3b, 7fc39583c, 8c94b6675, e421c414e, 964335fdf, 5a86cabd0, 6cb207ac9, c1bc828da, c4d38135f, 968b01ca3, 26b423c42, 0656586a4, b4ced4b7a, 0b8e10df6, 603e8

In [66]:
# Bare expression: the notebook shows the string's repr, so escape sequences
# such as \n stay visible instead of being rendered as line breaks.
long_markdown_joined

'# Training Model and Prediction\nFirst, we will train a model based on preprocessed values of training data set.\nSecond, let\'s predict test values based on the trained model.target, 48df886f9, 0deb4b6a8, 34b15f335, a8cb14b00, 2f0771a37, 30347e683, d08d1fbe3, 6ee66e115, 20aa07010, dc5a8f1d8, 11d86fa6a, 77c9823f2, 8d6c2a0b2, 4681de4fd, adf119b9a, cff75dd09, 96f83a237, b8a716ebf, 6c7a4567c, 4fcfd2b4d, f3b9c0b95, 71cebf11c, d966ac62c, 68b647452, c88d108c9, ff7b471cd, 0d866c3d7, bc3f77679, bd8f989f1, 0eff5bf95, 22ed6dba3, 92b13ebba, 233c7c17c, 2cb4d123e, 87ffda550, 822e49b95, 316b978cd, d04e16aed, 5d5c5ce6d, ec863cb52, 11ad148bd, ea18d720e, 408d86ce9, 69d8b4020, b0868a049, 80b14398e, ecdef52b2, c36bcacc5, 151d318cd, ab0cddb90, 2d6bd8275, dfdf4b580, c87722678, a8c320153, a04684f1f, fa977f17b, feed9d437, 645b47cde, 7298ca1ef, d80abf8bc, 654dd8a3b, 7fc39583c, 8c94b6675, e421c414e, 964335fdf, 5a86cabd0, 6cb207ac9, c1bc828da, c4d38135f, 968b01ca3, 26b423c42, 0656586a4, b4ced4b7a, 0b8e10df6, 6

In [14]:
# Number of elements in long_markdown (presumably one per markdown cell — confirm
# against how train_final.csv was built).
len(long_markdown)

13