In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read both CSV inputs from the working directory: the article paragraphs
# (`raw`) and the word-frequency list (`simple_words`).
raw, simple_words = (
    pd.read_csv(csv_name) for csv_name in ('raw_data.csv', 'simple_words.csv')
)

In [3]:
# Build the lookup table of "simple" words used by the word filter later.
# Take the first 3000 rows, drop rows with any missing value, and work on an
# explicit copy so the column assignments below do not write into a slice
# (which triggers pandas' SettingWithCopyWarning).
simple_words = simple_words.head(3000).dropna().copy()

# Upper-case the base word, then derive a naive plural ("+S") and a naive past
# tense ("+D" when the word already ends in "E", otherwise "+ED").
simple_words["word"] = simple_words["word"].str.upper()
simple_words["word_s"] = simple_words["word"] + "S"
ends_with_e = simple_words["word"].str.endswith("E")
simple_words["word_ed"] = (simple_words["word"] + "D").where(
    ends_with_e, simple_words["word"] + "ED"
)
simple_words

Unnamed: 0,word,count,word_s,word_ed
0,THE,23135851162,THES,THED
1,OF,13151942776,OFS,OFED
2,AND,12997637966,ANDS,ANDED
3,TO,12136980858,TOS,TOED
4,A,9081174698,AS,AED
...,...,...,...,...
2995,SEXO,25849932,SEXOS,SEXOED
2996,AP,25843002,APS,APED
2997,PATTERNS,25809816,PATTERNSS,PATTERNSED
2998,BOXES,25787744,BOXESS,BOXESED


In [4]:
# Peek at the first three rows of the raw article table.
raw.iloc[:3]

Unnamed: 0,title,paragraph
0,Top 8 Data Science Use Cases in Manufacturing ...,The manufacturing business faces huge transfor...
1,Top 8 Data Science Use Cases in Manufacturing ...,The amount of data to be stored and processed ...
2,Top 8 Data Science Use Cases in Manufacturing ...,Predictive analytics is the analysis of presen...


In [5]:
# Show one full paragraph (fourth row) to see what the raw text looks like.
raw["paragraph"].iat[3]

'Both these prediction models are aimed at forecasting the moment when the equipment fails to perform the task. As a result, the secondary goal may be achieved \xa0- to prevent these failures from happening or at least to reduce their number. This becomes possible due to the numerous predictive techniques'

In [6]:
# Word filter: keep words longer than 3 characters that are not simple words.
def _simple_word_forms():
    """Return every simple-word form (base, "+S", "+D/ED") as a single set.

    The set is built from the notebook-level ``simple_words`` frame on first
    use and cached on the function object, so each later lookup is one hash
    probe instead of three linear scans. Re-run this cell to rebuild the
    cache if ``simple_words`` is changed afterwards.
    """
    forms = getattr(_simple_word_forms, "_forms", None)
    if forms is None:
        forms = set()
        for column in ("word", "word_s", "word_ed"):
            forms.update(simple_words[column])
        _simple_word_forms._forms = forms
    return forms


def filter_word(w):
    """Tell whether ``w`` should be kept as an interesting word.

    Parameters
    ----------
    w : str
        A single token (any letter case).

    Returns
    -------
    bool
        True when ``w`` has more than 3 characters and its upper-cased form
        is not found in the ``word``, ``word_s`` or ``word_ed`` columns of
        ``simple_words``; False otherwise.
    """
    # Length check first: short tokens never need the simple-word lookup.
    if len(w) <= 3:
        return False
    return w.upper() not in _simple_word_forms()

In [7]:
# Turn one paragraph into its list of distinct, upper-cased interesting words.
def clean_paragraph(para):
    """Split a paragraph into unique upper-cased words that pass ``filter_word``.

    Parameters
    ----------
    para : str
        Paragraph text. Any non-string value (for example the NaN pandas uses
        for a missing cell) is treated as an empty paragraph.

    Returns
    -------
    list of str
        Upper-cased words in order of first appearance, without duplicates.
    """
    # A missing cell is a float NaN, which re.split cannot handle.
    if not isinstance(para, str):
        return []
    # Raw string so "\W" and "\d" are regex classes rather than (invalid)
    # string escapes. Split on runs of non-word characters and digits, and
    # discard the empty pieces produced at the edges of the text.
    tokens = [tok for tok in re.split(r'[\W\d]+', para) if tok]
    # Keep only the words the filter accepts, upper-cased.
    kept = [tok.upper() for tok in tokens if filter_word(tok)]
    # dict.fromkeys removes duplicates while preserving first-seen order.
    return list(dict.fromkeys(kept))

In [8]:
# Replace each paragraph by its list of distinct interesting words.
raw["paragraph_arr"] = raw["paragraph"].map(clean_paragraph)

In [9]:
# The original text is no longer needed once the word lists exist.
raw = raw.drop('paragraph', axis=1)
raw

Unnamed: 0,title,paragraph_arr
0,Top 8 Data Science Use Cases in Manufacturing ...,"[TRANSFORMATIONS, NOWADAYS, RAPID, BROAD, SEEK..."
1,Top 8 Data Science Use Cases in Manufacturing ...,"[AUTOMATE, EXECUTION]"
2,Top 8 Data Science Use Cases in Manufacturing ...,"[PREDICTIVE, ANALYTICS, PROBLEMATIC, DEEPLY, F..."
3,Top 8 Data Science Use Cases in Manufacturing ...,"[PREDICTION, FORECASTING, HAPPENING, NUMEROUS,..."
4,Top 8 Data Science Use Cases in Manufacturing ...,"[PREVENTIVE, LESSEN, LIKELIHOOD, FAILING, USAG..."
...,...,...
5395,Practical Data Science: Building Minimum Viabl...,"[CONSIST, JUNK, MODELING, EXPERIMENTING, COMMU..."
5396,Data Science Basics: An Introduction to Ensemb...,"[GARAGES, IMAGINE, HUNDREDS, BAGGING, BAGGED, ..."
5397,Data Science Basics: An Introduction to Ensemb...,"[GARAGE, NUMEROUS, DIAGNOSIS, SUPPOSE, INTERAC..."
5398,Data Science Basics: An Introduction to Ensemb...,"[COMBINER, ALGORITHM, PREDICTIONS, ALGORITHMS,..."


In [10]:
# Reshape to one row per (paragraph, word): every other column is repeated
# for each word in that paragraph's list.
lst_col = 'paragraph_arr'
raw = (
    raw.explode(lst_col)
    # explode() emits a NaN row for an empty list; drop those rows so that
    # paragraphs without any kept word contribute nothing.
    .dropna(subset=[lst_col])
    # Fresh 0..n-1 index instead of the repeated paragraph index.
    .reset_index(drop=True)
)
raw

Unnamed: 0,title,paragraph_arr
0,Top 8 Data Science Use Cases in Manufacturing ...,TRANSFORMATIONS
1,Top 8 Data Science Use Cases in Manufacturing ...,NOWADAYS
2,Top 8 Data Science Use Cases in Manufacturing ...,RAPID
3,Top 8 Data Science Use Cases in Manufacturing ...,BROAD
4,Top 8 Data Science Use Cases in Manufacturing ...,SEEK
...,...,...
45971,Data Science Basics: An Introduction to Ensemb...,ACCOMPLISH
45972,Data Science Basics: An Introduction to Ensemb...,TACKLE
45973,Data Science Basics: An Introduction to Ensemb...,SOLVABLE
45974,Data Science Basics: An Introduction to Ensemb...,ARTIFICIAL


In [11]:
# Every (paragraph, word) row counts as one occurrence; summed per word later.
raw = raw.assign(freq=1)

In [14]:
# Per word: total number of rows it appears in (`freq`) and the number of
# distinct article titles it appears under (`title`).
word_stats = raw.groupby('paragraph_arr').agg({'freq': 'sum', 'title': "nunique"})

# Keep the 50 words found under the most distinct titles.
final = word_stats.sort_values(by=['title'], ascending=False).head(50)

In [16]:
# Score each word as occurrences x distinct titles. `final` comes from a
# .head() slice, so build a new frame with assign() rather than setting a
# column on the slice (which triggers SettingWithCopyWarning).
final = final.assign(rank=final["freq"] * final["title"])

In [18]:
# Show the top words ordered by the number of distinct titles.
# NOTE(review): `rank` is computed but the ordering uses `title` - confirm
# that sorting by `title` (not `rank`) is what is intended here.
final.sort_values('title', ascending=False).iloc[:50]

Unnamed: 0_level_0,freq,title,rank
paragraph_arr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SCIENTISTS,491,271,133061
SCIENTIST,430,235,101050
ANALYTICS,268,171,45828
PYTHON,286,163,46618
ALGORITHMS,220,158,34760
INSIGHTS,163,125,20375
SOLVE,156,118,18408
ALGORITHM,140,99,13860
DATASET,170,95,16150
DOESN,121,95,11495
