In [1]:
import pandas as pd
import numpy as np
import re
import time

from dictionary import translate

In [2]:
# Prepare all raw data
raw = pd.read_csv('raw_data.csv')
simple_words = pd.read_csv('simple_words.csv')

In [3]:
# Prepare dataframe of simple words for filter later
simple_words = simple_words.head(3000)
simple_words.dropna(inplace = True)
simple_words["word"] = simple_words["word"].apply(lambda x: x.upper())
simple_words["word_s"] = simple_words["word"].apply(lambda x: x.upper() + "S")
simple_words["word_ed"] = simple_words["word"].apply(lambda x: x.upper() + "D" if x[-1].upper() == "E" else x.upper() + "ED")
simple_words

Unnamed: 0,word,count,word_s,word_ed
0,THE,23135851162,THES,THED
1,OF,13151942776,OFS,OFED
2,AND,12997637966,ANDS,ANDED
3,TO,12136980858,TOS,TOED
4,A,9081174698,AS,AED
...,...,...,...,...
2995,SEXO,25849932,SEXOS,SEXOED
2996,AP,25843002,APS,APED
2997,PATTERNS,25809816,PATTERNSS,PATTERNSED
2998,BOXES,25787744,BOXESS,BOXESED


In [5]:
sentence = pd.Series(raw["paragraph"])
sentence

0       The manufacturing business faces huge transfor...
1       The amount of data to be stored and processed ...
2       Predictive analytics is the analysis of presen...
3       Both these prediction models are aimed at fore...
4       Preventive maintenance is usually applied to t...
                              ...                        
5395    More than 70% of data science project’s effort...
5396    I only visited 3 garages in my example, but yo...
5397    Thinking again of my car problem, perhaps I ha...
5398    A combiner algorithm is then trained to make u...
5399    An easy mistake for data science newcomers to ...
Name: paragraph, Length: 5400, dtype: object

In [6]:
# function that filter word with length > 3 and not in simple words
def filter_word(w):
    if len(w) > 3 and w.upper() not in simple_words["word"].values and w.upper() not in simple_words["word_s"].values and w.upper() not in simple_words["word_ed"].values:
        return True
    return False

In [7]:
# function to clean paragraph
def clean_paragraph(para):
    # loop the string
    arr = re.split('[\W+\d+]', para)
    # transform each word in arr above: filter to get words with length > 3 then uppercase all words
    new_arr = list(map(lambda y: y.upper(), (filter(filter_word, arr))))
    # remove duplicate words
    new_arr = list(dict.fromkeys(new_arr))
    return new_arr

In [8]:
raw["paragraph_arr"] = raw["paragraph"].apply(clean_paragraph)

In [9]:
raw = raw.drop(columns=['paragraph'])
raw

Unnamed: 0,title,paragraph_arr
0,Top 8 Data Science Use Cases in Manufacturing ...,"[TRANSFORMATIONS, NOWADAYS, RAPID, BROAD, SEEK..."
1,Top 8 Data Science Use Cases in Manufacturing ...,"[AUTOMATE, EXECUTION]"
2,Top 8 Data Science Use Cases in Manufacturing ...,"[PREDICTIVE, ANALYTICS, PROBLEMATIC, DEEPLY, F..."
3,Top 8 Data Science Use Cases in Manufacturing ...,"[PREDICTION, FORECASTING, HAPPENING, NUMEROUS,..."
4,Top 8 Data Science Use Cases in Manufacturing ...,"[PREVENTIVE, LESSEN, LIKELIHOOD, FAILING, USAG..."
...,...,...
5395,Practical Data Science: Building Minimum Viabl...,"[CONSIST, JUNK, MODELING, EXPERIMENTING, COMMU..."
5396,Data Science Basics: An Introduction to Ensemb...,"[GARAGES, IMAGINE, HUNDREDS, BAGGING, BAGGED, ..."
5397,Data Science Basics: An Introduction to Ensemb...,"[GARAGE, NUMEROUS, DIAGNOSIS, SUPPOSE, INTERAC..."
5398,Data Science Basics: An Introduction to Ensemb...,"[COMBINER, ALGORITHM, PREDICTIONS, ALGORITHMS,..."


In [10]:
lst_col = 'paragraph_arr'
raw = pd.DataFrame({
    col:np.repeat(raw[col].values, raw[lst_col].str.len())
    for col in raw.columns.difference([lst_col])
}).assign(**{lst_col:np.concatenate(raw[lst_col].values)})[raw.columns.tolist()]
raw

Unnamed: 0,title,paragraph_arr
0,Top 8 Data Science Use Cases in Manufacturing ...,TRANSFORMATIONS
1,Top 8 Data Science Use Cases in Manufacturing ...,NOWADAYS
2,Top 8 Data Science Use Cases in Manufacturing ...,RAPID
3,Top 8 Data Science Use Cases in Manufacturing ...,BROAD
4,Top 8 Data Science Use Cases in Manufacturing ...,SEEK
...,...,...
45971,Data Science Basics: An Introduction to Ensemb...,ACCOMPLISH
45972,Data Science Basics: An Introduction to Ensemb...,TACKLE
45973,Data Science Basics: An Introduction to Ensemb...,SOLVABLE
45974,Data Science Basics: An Introduction to Ensemb...,ARTIFICIAL


In [11]:
raw['freq'] = 1

In [14]:
final = raw.groupby('paragraph_arr').agg({
    'freq': 'sum',
    'title': "nunique",
}).sort_values(by=['title'], ascending=False).reset_index().head(300)

In [15]:
final["rank"] = final["freq"] * final["title"]

In [18]:
final.sort_values(by=['rank'], ascending=False).tail(20)

Unnamed: 0,paragraph_arr,freq,title,rank
248,DIVE,23,23,529
235,SOLVED,23,23,529
249,CONSIDERING,23,23,529
282,CALCULATE,24,22,528
271,RESULTING,24,22,528
267,RECOGNIZE,24,22,528
275,ADDITIONALLY,24,22,528
262,INTEGRATE,24,22,528
285,PLENTY,24,22,528
279,CONSTANT,24,22,528


In [19]:
def get_meaning(w):
    time.sleep(0)
    meaning, example = translate(w)
    return pd.Series([meaning, example])

In [21]:
final[["meaning", "example"]] = final["paragraph_arr"].apply(get_meaning)

https://api.dictionaryapi.dev/api/v2/entries/en_US/SCIENTISTS
https://api.dictionaryapi.dev/api/v2/entries/en_US/SCIENTIST
https://api.dictionaryapi.dev/api/v2/entries/en_US/ANALYTICS
https://api.dictionaryapi.dev/api/v2/entries/en_US/PYTHON
https://api.dictionaryapi.dev/api/v2/entries/en_US/ALGORITHMS
https://api.dictionaryapi.dev/api/v2/entries/en_US/INSIGHTS
https://api.dictionaryapi.dev/api/v2/entries/en_US/SOLVE
https://api.dictionaryapi.dev/api/v2/entries/en_US/ALGORITHM
https://api.dictionaryapi.dev/api/v2/entries/en_US/DATASET
https://api.dictionaryapi.dev/api/v2/entries/en_US/DOESN
https://api.dictionaryapi.dev/api/v2/entries/en_US/STATISTICAL
https://api.dictionaryapi.dev/api/v2/entries/en_US/PREDICTIVE
https://api.dictionaryapi.dev/api/v2/entries/en_US/EXPERTISE
https://api.dictionaryapi.dev/api/v2/entries/en_US/PREDICT
https://api.dictionaryapi.dev/api/v2/entries/en_US/VISUALIZATION
https://api.dictionaryapi.dev/api/v2/entries/en_US/MODELING
https://api.dictionaryapi.dev/ap

In [24]:
final = final[final["meaning"] != ""]

In [25]:
final

Unnamed: 0,paragraph_arr,freq,title,rank,meaning,example
0,SCIENTISTS,491,271,133061,A person who is studying or has expert knowled...,a research scientist
1,SCIENTIST,430,235,101050,A person who is studying or has expert knowled...,
3,PYTHON,286,163,46618,A large heavy-bodied nonvenomous snake occurri...,Anacondas are related to boa constrictors and ...
6,SOLVE,156,118,18408,"Find an answer to, explanation for, or means o...",the policy could solve the town's housing crisis
7,ALGORITHM,140,99,13860,A process or set of rules to be followed in ca...,a basic algorithm for division
...,...,...,...,...,...,...
295,SPLIT,29,21,609,"Break or cause to break forcibly into parts, e...",the ice cracked and heaved and split
296,DIVERSE,21,21,441,Showing a great deal of variety; very different.,"subjects as diverse as architecture, language ..."
297,BASELINE,25,21,525,A minimum or starting point used for comparisons.,An alternative to creating external baselines ...
298,POSSIBILITIES,21,21,441,A thing that may happen or be the case.,relegation remains a distinct possibility


In [27]:
for word in final['paragraph_arr']:
    for sent in sentence:
        if word in sent.upper():
            final.loc[lambda final: final['paragraph_arr'] == word,["sentence"]] = sent
            break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [28]:
final

Unnamed: 0,paragraph_arr,freq,title,rank,meaning,example,sentence
0,SCIENTISTS,491,271,133061,A person who is studying or has expert knowled...,a research scientist,Data preparation is particularly important. Da...
1,SCIENTIST,430,235,101050,A person who is studying or has expert knowled...,,Planning research projects is hard because the...
3,PYTHON,286,163,46618,A large heavy-bodied nonvenomous snake occurri...,Anacondas are related to boa constrictors and ...,The open source frameworks and programming lan...
6,SOLVE,156,118,18408,"Find an answer to, explanation for, or means o...",the policy could solve the town's housing crisis,Failing fast is maybe my most important point ...
7,ALGORITHM,140,99,13860,A process or set of rules to be followed in ca...,a basic algorithm for division,AI-powered technologies and computer vision ap...
...,...,...,...,...,...,...,...
295,SPLIT,29,21,609,"Break or cause to break forcibly into parts, e...",the ice cracked and heaved and split,This will create a visualisation of your data ...
296,DIVERSE,21,21,441,Showing a great deal of variety; very different.,"subjects as diverse as architecture, language ...",No matter where you are in your data-driven jo...
297,BASELINE,25,21,525,A minimum or starting point used for comparisons.,An alternative to creating external baselines ...,What is a good performance is a pretty hard qu...
298,POSSIBILITIES,21,21,441,A thing that may happen or be the case.,relegation remains a distinct possibility,Data science has brought new marvelous opportu...
