In [2]:
import pandas as pd
import numpy as np
import re
import time

# Import SnowballStemmer to remove plural words
from nltk.stem.snowball import SnowballStemmer

# Import translation API from our function dictionary
from dictionary import translate

In [3]:
stemmer = SnowballStemmer("english")

In [4]:
# Prepare all raw data
raw = pd.read_csv('raw_data.csv') # Contains all text crawl from data science website
simple_words = pd.read_csv('simple_words.csv') # Simple words which used to clean raw data

In [5]:
# Prepare dataframe of simple words for filter later
simple_words = simple_words.head(3000) # Only need 3000 most popular words as simple words
simple_words.dropna(inplace = True)
simple_words["word"] = simple_words["word"].apply(lambda x: x.upper())
simple_words["singular"] = simple_words["word"].apply(lambda x: stemmer.stem(x).upper()) # Remove plural from word => become singular. So we can use it to clean raw data
simple_words

Unnamed: 0,word,count,singular
0,THE,23135851162,THE
1,OF,13151942776,OF
2,AND,12997637966,AND
3,TO,12136980858,TO
4,A,9081174698,A
...,...,...,...
2995,SEXO,25849932,SEXO
2996,AP,25843002,AP
2997,PATTERNS,25809816,PATTERN
2998,BOXES,25787744,BOX


In [6]:
paragraphs = pd.Series(raw["paragraph"]) # All the paragraphs from raw data. This will be used to extract sentence contains data science words.
paragraphs

0       There are many facets to working in Data Scien...
1       Data preparation is particularly important. Da...
2       I then joined PageGroup. There, I worked as an...
3       Now I am a lead big data and machine learning ...
4       JustGiving is still a start-up at heart, so th...
                              ...                        
5395    In the comics, Dr. Stephen Strange, MD, become...
5396    Black boxes are his first choice for everythin...
5397    Vision is an android created by. His role as a...
5398    The real important point, here, is that Vision...
5399    J.A.R.V.I.S. They have decided to test the wat...
Name: paragraph, Length: 5400, dtype: object

In [7]:
# function that filter word with length > 3 and not in simple words
def filter_word(w):
    stemmed_word = stemmer.stem(w).upper()
    
    if len(w) > 3 and w.upper() not in simple_words["word"].values and w.upper() not in simple_words["singular"].values and stemmed_word not in simple_words["singular"].values and stemmed_word not in simple_words["word"].values:
        return True
    return False

In [8]:
# function to clean paragraph
def clean_paragraph(para):
    # loop the string
    arr = re.split('[\W+\d+]', para)
    # transform each word in arr above: filter to get words with length > 3 then uppercase all words
    new_arr = list(map(lambda y: y.upper(), (filter(filter_word, arr))))
    # remove duplicate words
    new_arr = list(dict.fromkeys(new_arr))
    return new_arr

In [9]:
raw["paragraph_arr"] = raw["paragraph"].apply(clean_paragraph)

In [10]:
raw = raw.drop(columns=['paragraph'])
raw

Unnamed: 0,title,paragraph_arr
0,"How To Work In Data Science, AI, Big Data - KD...","[FACETS, DEPEND, PURSUE, SUCCEED]"
1,"How To Work In Data Science, AI, Big Data - KD...","[SCIENTISTS, TYPICALLY, SHAPED, HAPPIER]"
2,"How To Work In Data Science, AI, Big Data - KD...","[PAGEGROUP, ARCHITECT, TRANSFORMATION, ANALYTICS]"
3,"How To Work In Data Science, AI, Big Data - KD...","[JUSTGIVING, RAISE, ACQUIRED]"
4,"How To Work In Data Science, AI, Big Data - KD...","[JUSTGIVING, TYPICAL, CAPTURE, PIPELINE, INVES..."
...,...,...
5395,Data Avengers… Assemble! - KDnuggets,"[STEPHEN, STRANGE, SORCERER, SUPREME, GUARDIAN..."
5396,Data Avengers… Assemble! - KDnuggets,"[IRIS, DATASET, NEURAL, ENSEMBLE, PREFERS, STA..."
5397,Data Avengers… Assemble! - KDnuggets,"[ANDROID, AVENGER, AUTOMATED, HYBRID, BAYESIAN..."
5398,Data Avengers… Assemble! - KDnuggets,"[SUPPLANTED, COMPLEMENTARY, FLESHY, SCIENTISTS..."


In [11]:
lst_col = 'paragraph_arr'
raw = pd.DataFrame({
    col:np.repeat(raw[col].values, raw[lst_col].str.len())
    for col in raw.columns.difference([lst_col])
}).assign(**{lst_col:np.concatenate(raw[lst_col].values)})[raw.columns.tolist()]
raw

Unnamed: 0,title,paragraph_arr
0,"How To Work In Data Science, AI, Big Data - KD...",FACETS
1,"How To Work In Data Science, AI, Big Data - KD...",DEPEND
2,"How To Work In Data Science, AI, Big Data - KD...",PURSUE
3,"How To Work In Data Science, AI, Big Data - KD...",SUCCEED
4,"How To Work In Data Science, AI, Big Data - KD...",SCIENTISTS
...,...,...
35386,Data Avengers… Assemble! - KDnuggets,SCIENTISTS
35387,Data Avengers… Assemble! - KDnuggets,UNEMPLOYED
35388,Data Avengers… Assemble! - KDnuggets,WINK
35389,Data Avengers… Assemble! - KDnuggets,COGNITIVE


In [12]:
raw['freq'] = 1

In [13]:
final = raw.groupby('paragraph_arr').agg({
    'freq': 'sum',
    'title': "nunique",
})

In [14]:
final["rank"] = final["freq"] * final["title"]
final = final.sort_values(by=['rank'], ascending=False).reset_index().head(1300)

In [15]:
def get_meaning(w):
    meaning, example = translate(w)
    return pd.Series([meaning, example])

In [16]:
final[["meaning", "example"]] = final["paragraph_arr"].apply(get_meaning)

tps://api.dictionaryapi.dev/api/v2/entries/en_US/SUPERIEURE
https://api.dictionaryapi.dev/api/v2/entries/en_US/LECTURES
https://api.dictionaryapi.dev/api/v2/entries/en_US/POSSESS
https://api.dictionaryapi.dev/api/v2/entries/en_US/HASN
https://api.dictionaryapi.dev/api/v2/entries/en_US/SUPERIOR
https://api.dictionaryapi.dev/api/v2/entries/en_US/REPEATEDLY
https://api.dictionaryapi.dev/api/v2/entries/en_US/INSPIRING
https://api.dictionaryapi.dev/api/v2/entries/en_US/LEAP
https://api.dictionaryapi.dev/api/v2/entries/en_US/CLEVER
https://api.dictionaryapi.dev/api/v2/entries/en_US/DIVING
https://api.dictionaryapi.dev/api/v2/entries/en_US/AUDITORY
https://api.dictionaryapi.dev/api/v2/entries/en_US/UNLOCK
https://api.dictionaryapi.dev/api/v2/entries/en_US/MERGE
https://api.dictionaryapi.dev/api/v2/entries/en_US/DECREASING
https://api.dictionaryapi.dev/api/v2/entries/en_US/ADAPTIVE
https://api.dictionaryapi.dev/api/v2/entries/en_US/BIODIVERSITY
https://api.dictionaryapi.dev/api/v2/entries/en_U

In [23]:
final = final[final["meaning"] != ""]
final = final[final["example"] != ""]

In [24]:
for word in final['paragraph_arr']:
    for paragraph in paragraphs:
        arr = paragraph.split(".")
        for sentence in arr:
            if word in sentence.upper():
                final.loc[lambda final: final['paragraph_arr'] == word,["sentence"]] = sentence
            break

In [25]:
final = final.reset_index(drop=True)

In [26]:
conditions = [
    final.index <= round(len(final) / 4),
    final.index <= round(2 * len(final) / 4),
    final.index <= round(3 * len(final) / 4),
    final.index <= len(final) - 1
]
choices = ['1', '2', '3', '4']
final['rating'] = np.select(conditions, choices)

In [27]:
final

Unnamed: 0,paragraph_arr,freq,title,rank,meaning,example,sentence,rating
0,SCIENTISTS,491,271,133061,A person who is studying or has expert knowled...,a research scientist,Don't torture your data scientists by witholdi...,1
1,PYTHON,286,163,46618,A large heavy-bodied nonvenomous snake occurri...,Anacondas are related to boa constrictors and ...,Munge data? Run some analysis? Whip up a class...,1
2,SOLVE,156,118,18408,"Find an answer to, explanation for, or means o...",the policy could solve the town's housing crisis,More certain am I of this than I am that Marve...,1
3,ALGORITHM,140,99,13860,A process or set of rules to be followed in ca...,a basic algorithm for division,More certain am I of this than I am that Marve...,1
4,PREDICTIVE,134,88,11792,Relating to or having the effect of predicting...,predictive accuracy,"Predictive analytics? Maybe the closest fit, b...",1
...,...,...,...,...,...,...,...,...
1092,DEMONSTRATING,6,5,30,Give a practical exhibition and explanation of...,computerized design methods will be demonstrated,If you are trying to find your first path into...,4
1093,PROCEED,6,5,30,Begin or continue a course of action.,we can proceed with our investigation,A reference to the term “Data Science” was mad...,4
1094,INGESTION,6,5,30,"The process of taking food, drink, or another ...",vomiting after ingestion of contaminated food,The last decade has seen many areas of researc...,4
1095,EXAMINING,6,5,30,Inspect (someone or something) thoroughly in o...,a doctor examined me and said I might need a c...,They do so by examining how good or bad the ex...,4


In [28]:
final.to_csv('output.csv', index=False)