In [3]:
import pandas as pd

# Load the Diffbot export and label every posting with its job title.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
sre = pd.read_csv(r"C:\Users\luke0\Downloads\diffbot-export.csv").assign(
    title="Site Reliability Engineer"
)

# Stack the per-title frames (currently just one) under a fresh 0..n-1 index,
# then keep only the postings that actually have body text.
df = pd.concat([sre], ignore_index=True)
df = df[df["text"].notna()]

In [None]:
from yarl import URL

def get_page_source(url: str):
    """Return the host part of a page URL.

    Args:
        url: The URL to parse. ``None`` and float ``NaN`` (how pandas
            represents a missing cell) are tolerated so the function can be
            applied to a column that has gaps.

    Returns:
        ``URL(url).host`` for a real value, or ``None`` when ``url`` is
        ``None`` or ``NaN``.
    """
    # `url != url` is only true for NaN. Without this guard URL() is handed a
    # non-string and raises, aborting the whole column-wise apply.
    if url is None or (isinstance(url, float) and url != url):
        return None
    return URL(url).host

# Derive each posting's host from its page URL, one value per row.
df["page_host"] = df["pageUrl"].map(get_page_source)

In [None]:
import plotly.express as px

# Count postings per host and keep the 20 most frequent. The host and count
# columns are named explicitly: the default names produced by value_counts()
# differ between pandas versions, which made the former y="page_host" lookup
# version-dependent.
top_pages = (
    df["page_host"]
    .value_counts()
    .head(20)
    .rename_axis("page host")
    .reset_index(name="frequency")
)

# The counts are already aggregated, so draw one bar per host; the axis
# titles come straight from the column names.
fig = px.bar(
    top_pages,
    x="page host",
    y="frequency",
    title="Top 20 page hosts by number of job postings",
).update_xaxes(
    categoryorder="total descending",
)

fig

In [None]:
import texthero as hero

# Run the posting bodies through texthero's `clean` and store the result
# back in the same column.
df["text"] = hero.clean(df["text"])

# Merge all cleaned postings into one space-separated string.
text = " ".join(df["text"].tolist())

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

%matplotlib inline


def make_wordcloud(new_text):
    """Draw a word cloud for ``new_text`` and return the figure.

    Args:
        new_text: The text handed to ``WordCloud.generate``.

    Returns:
        The matplotlib figure the cloud was drawn on. The figure is shown
        with ``plt.show()`` before it is returned.
    """
    cloud_options = dict(
        width=800,
        height=800,
        min_font_size=10,
        background_color="black",
        colormap="Set2",
        collocation_threshold=3,
    )
    cloud = WordCloud(**cloud_options).generate(new_text)

    # Square canvas with the axes hidden and no padding, so the cloud image
    # fills the whole figure.
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")
    fig.tight_layout(pad=0)

    plt.show()

    return fig

# Word cloud over the combined text; keep the returned figure for reuse.
text_cloud = make_wordcloud(new_text=text)

In [None]:
import scattertext as st

analyze_col = "requirements"

# Keep only the rows that have a value in the analysed column, restricted to
# the columns needed below. One .loc selection plus an explicit copy gives an
# independent frame, so adding "parse" below does not trigger pandas'
# SettingWithCopyWarning as the former df[...][...] chain did.
filtered_df = df.loc[
    df[analyze_col].notna(), ["title", analyze_col, "page_host"]
].copy()

# Tokenize text
filtered_df["parse"] = filtered_df[analyze_col].apply(st.whitespace_nlp_with_sentences)

# Build a corpus categorised by job title from the parsed documents, reduce
# it to unigrams and compact it with AssociationCompactor(2000).
corpus = (
    st.CorpusFromParsedDocuments(filtered_df, category_col="title", parsed_col="parse")
    .build()
    .get_unigram_corpus()
    .compact(st.AssociationCompactor(2000))
)

In [None]:
import nltk
# Fetch the tagger data used by nltk.pos_tag in is_noun below.
# NOTE(review): newer NLTK releases may look for a differently named resource
# ("averaged_perceptron_tagger_eng") - confirm against the installed version.
nltk.download("averaged_perceptron_tagger")

# get DataFrame with terms and their frequency
term_freq_df = corpus.get_term_freq_df()

# Get scaled F-scores of each term in each category.
# Category names must match values of the "title" column the corpus was built
# from. In this notebook that is only "Site Reliability Engineer"; the former
# "data scientist" label is not a category of this corpus. A score column is
# added only for categories that are present, so "Data Engineer Score" is
# filled in once such postings are part of the corpus.
score_columns = {
    "SRE": "Site Reliability Engineer",
    "Data Engineer Score": "data engineer",
}
available_categories = set(corpus.get_categories())
for score_col, category in score_columns.items():
    if category in available_categories:
        term_freq_df[score_col] = corpus.get_scaled_f_scores(category)

# Remove terms that are not nouns
def is_noun(word: str):
    """Report whether NLTK tags ``word`` as a noun.

    The word is tagged on its own, as a one-token list, so no sentence
    context is involved.

    Returns:
        True when the part-of-speech tag starts with "NN", else False.
    """
    tagged = nltk.pos_tag([word])
    tag = tagged[0][1]
    return tag.startswith("NN")

# Boolean row selection: keep only the terms that is_noun accepts.
term_freq_df = term_freq_df.loc[[is_noun(term) for term in term_freq_df.index]]

In [None]:
# Top 30 terms by scaled F-score. The score column defined in this notebook
# is "SRE"; no "Data Scientist Score" column is ever created, so sorting by
# that name raised a KeyError.
term_freq_df.sort_values(by="SRE", ascending=False).index[:30]