In [118]:
import pandas as pd

# Load the data
materials = pd.read_csv('./data/Courses and Learning Material.csv')
lpi = pd.read_csv('./data/Leaning Pathway Index.csv')

# Display the first few rows of the dataset
materials.head()

Unnamed: 0,Tab name,Source,Course Level,Duration,Prerequisites,Prework,Course / Learning material,Course / Learning material Link,Type (Free or Paid)
0,CLMML00,Google Developers,Beginners,70 mins,No,No,Introduction to Machine Learning,https://developers.google.com/machine-learning...,Free
1,CLMML01,Google Developers,Beginners to Intermediate,,"Yes, a handful",Yes,Machine Learning Crash Course (Foundation),https://developers.google.com/machine-learning...,Free
2,CLMML02,Google Developers,Beginners to Intermediate,45 mins,No,No,Problem Framing (ML related),https://developers.google.com/machine-learning...,Free
3,CLMML03,Google Developers,Beginners to Intermediate,,No,No,Data Preparation and Feature Engineering in ML,https://developers.google.com/machine-learning...,Free
4,CLMML04,Google Developers,Beginners to Intermediate,,Yes,No,Testing and Debugging,https://developers.google.com/machine-learning...,Free


# Clean up Course Level

In [119]:
lpi["Course Level"].value_counts()

Course Level
Intermediate to Advanced     899
Beginner                     244
Intermediate                  99
Beginners to Intermediate     97
Beginner to Intermediate      39
Beginners                      4
Name: count, dtype: int64

In [120]:
level_dict = {"Beginners":"Beginner","Beginners to Intermediate":"Beginner to Intermediate"}
lpi["Course Level"] = lpi["Course Level"].replace(level_dict)
lpi["Course Level"].value_counts()

Course Level
Intermediate to Advanced    899
Beginner                    248
Beginner to Intermediate    136
Intermediate                 99
Name: count, dtype: int64

# Clean up keywords

## Find out keywords that need to be corrected

In [121]:
keyword = lpi.copy()
keyword["Keywords / Tags / Skills / Interests / Categories"].value_counts()

Keywords / Tags / Skills / Interests / Categories
Autoencoders, Model Initialization, Dropout Techniques, Attention Mechanisms, Super-resolution    67
Machine Learning/Deployment/Cloud, Lab, AI, task,Qwik Start, Command Line, Challenge              24
Machine learning/Cloud, ML Pipeline, TensorFlow, Pipeline, tfx                                    23
ML system/Data distribution/System failure/ Data validation/Tensorflow                            20
ML System/Training/Prediction/Keras                                                               19
                                                                                                  ..
data engineer, Big Data, Machine Learning, Google Cloud, storage                                   1
data engineer, Big Data, Machine Learning, Google Cloud, lab, BigQuery                             1
data engineer, Big Data, Machine Learning, Google Cloud, quiz                                      1
data engineer, streaming data, Big data, 

In [122]:
# I will revert the col name in the end.
keyword.rename(columns={"Keywords / Tags / Skills / Interests / Categories":"Keywords"}, inplace=True)
keyword['Keywords'] = keyword['Keywords'].str.replace('/', ',')
keyword.head()

Unnamed: 0,Course / Learning material,Source,Course Level,Type (Free or Paid),Module,Duration,Module / Sub-module \nDifficulty level,Keywords,Links
0,Introduction to Machine Learning,Google Developers,Beginner,Free,Introduction to Machine Learning,20 mins,Easy,machine learning,https://developers.google.com/machine-learning...
1,Introduction to Machine Learning,Google Developers,Beginner,Free,What is Machine Learning,20 mins,Easy,machine learning,https://developers.google.com/machine-learning...
2,Introduction to Machine Learning,Google Developers,Beginner,Free,Supervised Learning,20 mins,Easy to Medium,supervised learning,https://developers.google.com/machine-learning...
3,Introduction to Machine Learning,Google Developers,Beginner,Free,Test your understanding,10 mins,Easy,machine learning test,https://developers.google.com/machine-learning...
4,Machine Learning Crash Course (Foundation),Google Developers,Beginner to Intermediate,Free,Introduction to ML,3 mins,Easy,machine learning,https://developers.google.com/machine-learning...


In [123]:
# check if all '/' are gone
keyword.query("Keywords.str.contains('/', regex=False)").head(2)

Unnamed: 0,Course / Learning material,Source,Course Level,Type (Free or Paid),Module,Duration,Module / Sub-module \nDifficulty level,Keywords,Links


In [124]:
# Split and explode the Keywords
keyword['Keywords'] = keyword['Keywords'].str.lower().str.split(',').apply(lambda x: [i.strip() for i in x])
keyword = keyword.explode('Keywords')

In [125]:
from nltk.stem import WordNetLemmatizer
import nltk
import numpy as np

# Download WordNet
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Apply lemmatization
keyword['Keywords_lemmatized'] = keyword['Keywords'].apply(lambda x: lemmatizer.lemmatize(x))
keyword['Keywords_lemmatized'].replace("", np.nan, inplace=True)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lorentzyeung/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [126]:
# check if all space is gone
keyword[keyword['Keywords_lemmatized']==""]["Keywords_lemmatized"]

Series([], Name: Keywords_lemmatized, dtype: object)

In [127]:
cleaned_up = set(keyword.Keywords_lemmatized.unique())
before = set(keyword.Keywords.unique())
changed = before - cleaned_up
changed

{'',
 'algorithms',
 'bugs',
 'categories',
 'causes',
 'classifications',
 'components',
 'crosses',
 'dependencies',
 'developers',
 'epochs',
 'ethics',
 'examples',
 'features',
 'guidelines',
 'inputs',
 'keras',
 'labels',
 'logs',
 'metrics',
 'models',
 'nodes',
 'outliers',
 'outputs',
 'phases',
 'pipelines',
 'predictions',
 'preferences',
 'problems',
 'qualities',
 'queries',
 'requirements',
 'samples',
 'sets',
 'sinks',
 'slices',
 'sources',
 'stars',
 'statistics',
 'steps',
 'subsets',
 'tables',
 'technologies',
 'tests',
 'triggers',
 'types',
 'users',
 'values',
 'watermarks'}

In [128]:
# checking each key in the dict and how it should be changed
keyword[keyword['Keywords'].str.contains("llm's")][["Keywords", "Keywords_lemmatized"]]

Unnamed: 0,Keywords,Keywords_lemmatized
746,llm's,llm's
747,llm's,llm's
748,llm's,llm's
749,llm's,llm's
750,llm's,llm's
751,llm's,llm's
752,llm's,llm's
753,llm's,llm's
754,llm's,llm's
755,llm's,llm's


In [129]:
# creating a dict to use in map function to clean up other keywords related columns
mapping_dict = {
 'artificial intelligence':'ai', # my preference
 'bigquery':'big query', # my preference
 'bigdata':'big data', # my preference
 'collaborative-filtering':'collaborative filtering', # my preference
 'cnns':'cnn', # my preference
 'content-based-filtering':'content based filtering', # my preference
 'convolutional neural networks (cnns)':'cnn', # my preference
 'databses':'database', # spelling
 'datatest':'datatset', # spelling
 'drop out':'dropout', # my preference
 'gaussain':'gaussian', # spelling
 'generalisation':'generalization', # my preference
 'inferencing':'inference', # my preference
 'intrepretation':'interpretation', # spelling
 'licence':'license', # spelling
 "llm's":'llm', # my preference
 'machine learning':'ml', # my preference
 'regrression':'regression', # my preference
 'monitoring':'monitor', # my preference
 'natural language processing (nlp)':'nlp', # my preference
 'natural language processing':'nlp', # my preference
 'pipleline':'pipeline', # my preference
 'rdds':'rdd', # my preference
 'systems':'system', # my preference
 'recurrent neural network':'rnn', # my preference
 'regularisation':'regularization', # my preference
 'regulisation':'regularization', # spelling
 'severless':'serverless', # spelling
 'explainations':'explanation', # spelling
 'ssampling':'sampling', # spelling
 'statis':'statistic', # spelling
 'stochastic gradient descent':'sgd', # my preference
 'super-resolution':'super resolution', # my preference
 'tensor flow':'tensorflow', # my preference
 'tune':'tunning', # my preference
 'vetex':'vertex', # spelling
 'visualisation':'visualization', # my preference




 'algorithms':'algorithm',
 'bugs':'bug',
 'categories':'category',
 'causes':'cause',
 'classifications':'classification',
 'components':"component",
 'crosses':'cross',
 'dependencies':'dependency',
 'developers':'developer',
 'epochs':'epoch',
 'ethics':'ethic',
 'examples':'example',
 'embeddings':'embedding', # lemmatization didn't handle
 'features':'feature',
 'guidelines':'guideline',
 'hyperparameter.':'hyperparameter', # spelling
 'hyperparameters':'hyperparameter', # lemmatization didn't handle
 'hyperparamters':'hyperparameter', # spelling
 'inputs':'input',
 'keras':'kera',
 'labels':'label',
 'logs':'log',
 'metrics':'metric',
 'models':'models',
 'networks':'network', # lemmatization didn't handle
 'nodes':'node',
 'outliers':'outliers',
 'outputs':'output',
 'phases':'phase',
 'pipelines':'pipeline',
 'predictions':'prediction',
 'predicition':'prediction', # spelling
 'preferences':'preference',
 'problems':'problem',
 'qualities':'quality',
 'queries':'query',
 'requirements':'requirement',
 'samples':'sample',
 'sets':'set', # be careful with this, there are partitioned datasets, querying datasets, subsets, datasets, sets
 'sinks':'sink',
 'slices':'slice',
 'sources':'source',
 'stars':'star',
 'statistics':'statistic',
 'steps':'step',
 'subsets':'subset',
 'tables':'table',
 'technologies':'technology',
 'tests':'test',
 'triggers':'trigger',
 'types':'types',
 'users':'user',
 'values':'value',
 'watermarks':'watermark'
 }

In [130]:
keyword.head(1)

Unnamed: 0,Course / Learning material,Source,Course Level,Type (Free or Paid),Module,Duration,Module / Sub-module \nDifficulty level,Keywords,Links,Keywords_lemmatized
0,Introduction to Machine Learning,Google Developers,Beginner,Free,Introduction to Machine Learning,20 mins,Easy,machine learning,https://developers.google.com/machine-learning...,machine learning


In [131]:
# Apply the replacements
keyword["Keywords2"] = keyword.Keywords

for k, v in mapping_dict.items():
    keyword["Keywords2"] = keyword["Keywords2"].str.replace(k, v)

# Check the results
keyword[keyword['Keywords'].str.contains("machine learning|artificial intelligence")][["Keywords", "Keywords_lemmatized", "Keywords2"]]


Unnamed: 0,Keywords,Keywords_lemmatized,Keywords2
0,machine learning,machine learning,ml
1,machine learning,machine learning,ml
3,machine learning test,machine learning test,ml test
4,machine learning,machine learning,ml
7,machine learning,machine learning,ml
...,...,...,...
1377,machine learning,machine learning,ml
1378,machine learning,machine learning,ml
1379,machine learning,machine learning,ml
1380,machine learning,machine learning,ml


In [132]:
keyword.head(1)

Unnamed: 0,Course / Learning material,Source,Course Level,Type (Free or Paid),Module,Duration,Module / Sub-module \nDifficulty level,Keywords,Links,Keywords_lemmatized,Keywords2
0,Introduction to Machine Learning,Google Developers,Beginner,Free,Introduction to Machine Learning,20 mins,Easy,machine learning,https://developers.google.com/machine-learning...,machine learning,ml


In [133]:
keyword.Keywords2.unique()

array(['ml', 'supervised learning', 'ml test', 'problem statement',
       'ml terminologies', 'linear regression', 'training', 'loss',
       'reducing loss', 'iterative approach', 'gradient descent',
       'learning rate', 'optimising', 'stochastic', 'practical',
       'tensorflow', 'toolkit', 'generalization', 'overfitting', 'test',
       'dataset', 'splitting', 'data', 'validation set', 'partition',
       'representation', 'feature engineering', 'quality', 'good feature',
       'cleaning', 'feature', 'cross', 'encoding', 'nonlinearity',
       'one-hot vectors', 'regularization', 'simplicity', 'l2', 'lambda',
       'logistic regression', 'probability', 'classification',
       'threshold', 'confusion matrix', 'accuracy', 'precision', 'recall',
       'roc', 'roc curve', 'auc', 'prediction', 'bias', 'sparcity', 'l1',
       'neural network', 'softmax', 'embedding',
       'collaborative filtering', 'categorical data',
       'low dimensional space', 'production', 'ml system', 

In [134]:
keyword.rename(columns={"Keywords":"Keywords / Tags / Skills / Interests / Categories"}, inplace=True)


In [135]:
keyword.head()


Unnamed: 0,Course / Learning material,Source,Course Level,Type (Free or Paid),Module,Duration,Module / Sub-module \nDifficulty level,Keywords / Tags / Skills / Interests / Categories,Links,Keywords_lemmatized,Keywords2
0,Introduction to Machine Learning,Google Developers,Beginner,Free,Introduction to Machine Learning,20 mins,Easy,machine learning,https://developers.google.com/machine-learning...,machine learning,ml
1,Introduction to Machine Learning,Google Developers,Beginner,Free,What is Machine Learning,20 mins,Easy,machine learning,https://developers.google.com/machine-learning...,machine learning,ml
2,Introduction to Machine Learning,Google Developers,Beginner,Free,Supervised Learning,20 mins,Easy to Medium,supervised learning,https://developers.google.com/machine-learning...,supervised learning,supervised learning
3,Introduction to Machine Learning,Google Developers,Beginner,Free,Test your understanding,10 mins,Easy,machine learning test,https://developers.google.com/machine-learning...,machine learning test,ml test
4,Machine Learning Crash Course (Foundation),Google Developers,Beginner to Intermediate,Free,Introduction to ML,3 mins,Easy,machine learning,https://developers.google.com/machine-learning...,machine learning,ml


## Execute on the original dataframe

In [136]:
lpi2 = lpi.copy()

In [137]:
# I will revert the col name in the end.
lpi2.rename(columns={"Keywords / Tags / Skills / Interests / Categories":"Keywords"}, inplace=True)
lpi2['Keywords'] = lpi2['Keywords'].str.replace('/', ',')
lpi2.tail(5)

Unnamed: 0,Course / Learning material,Source,Course Level,Type (Free or Paid),Module,Duration,Module / Sub-module \nDifficulty level,Keywords,Links
1377,Machine Learning Engineer Learning Path,Google Cloud Skill Boost,Intermediate to Advanced,Free during mentorship period,Build and Deploy Machine Learning Solutions on...,90 miniutes,Medium to Hard,"Machine Learning,Deployment,Cloud, Vertex AI, ...",https://www.cloudskillsboost.google/course_ses...
1378,Machine Learning Engineer Learning Path,Google Cloud Skill Boost,Intermediate to Advanced,Free during mentorship period,Build and Deploy Machine Learning Solutions on...,90 minutes,Medium to Hard,"Machine Learning,Deployment,Cloud, Vertex AI, ...",https://www.cloudskillsboost.google/course_ses...
1379,Machine Learning Engineer Learning Path,Google Cloud Skill Boost,Intermediate to Advanced,Free during mentorship period,Build and Deploy Machine Learning Solutions on...,90 minutes,Medium to Hard,"Machine Learning,Deployment,Cloud, Vertex AI, ...",https://www.cloudskillsboost.google/course_ses...
1380,Machine Learning Engineer Learning Path,Google Cloud Skill Boost,Intermediate to Advanced,Free during mentorship period,Build and Deploy Machine Learning Solutions on...,90 minutes,Medium to Hard,"Machine Learning,Deployment,Cloud, Vertex AI, ...",https://www.cloudskillsboost.google/course_ses...
1381,Machine Learning Engineer Learning Path,Google Cloud Skill Boost,Intermediate to Advanced,Free during mentorship period,Build and Deploy Machine Learning Solutions on...,120 minutes,Medium to Hard,"Machine Learning,Deployment,Cloud, Vertex AI, ...",Link Not Available


In [138]:
# Apply the replacements
lpi2["Keywords2"] = lpi2["Keywords"]

for k, v in mapping_dict.items():
    lpi2["Keywords2"] = lpi2["Keywords2"].str.replace(k, v)

# Check the results
lpi2[lpi2['Keywords'].str.contains("machine learning|artificial intelligence")][["Keywords", "Keywords2"]]


Unnamed: 0,Keywords,Keywords2
0,machine learning,ml
1,machine learning,ml
3,machine learning test,ml test
4,machine learning,ml
7,machine learning,ml
8,"machine learning, linear regrression","ml, linear regression"
9,"machine learning, training, loss","ml, training, loss"
85,"glossary, machine learning","glossary, ml"
539,"data, samples, label, machine learning","data, sample, label, ml"
550,"data, label, split, training, testing, validat...","data, label, split, training, testing, validat..."


In [139]:
lpi2.drop(columns="Keywords", axis=1, inplace=True)
lpi2.rename(columns={"Keywords2" : "Keywords / Tags / Skills / Interests / Categories"}, inplace=True)
lpi = lpi2.copy()

In [140]:
lpi[lpi["Keywords / Tags / Skills / Interests / Categories"].str.contains("ml|set")]


Unnamed: 0,Course / Learning material,Source,Course Level,Type (Free or Paid),Module,Duration,Module / Sub-module \nDifficulty level,Links,Keywords / Tags / Skills / Interests / Categories
0,Introduction to Machine Learning,Google Developers,Beginner,Free,Introduction to Machine Learning,20 mins,Easy,https://developers.google.com/machine-learning...,ml
1,Introduction to Machine Learning,Google Developers,Beginner,Free,What is Machine Learning,20 mins,Easy,https://developers.google.com/machine-learning...,ml
3,Introduction to Machine Learning,Google Developers,Beginner,Free,Test your understanding,10 mins,Easy,https://developers.google.com/machine-learning...,ml test
4,Machine Learning Crash Course (Foundation),Google Developers,Beginner to Intermediate,Free,Introduction to ML,3 mins,Easy,https://developers.google.com/machine-learning...,ml
6,Machine Learning Crash Course (Foundation),Google Developers,Beginner to Intermediate,Free,Framing - Key ML Terminology,15 mins,Easy,https://developers.google.com/machine-learning...,ml terminologies
7,Machine Learning Crash Course (Foundation),Google Developers,Beginner to Intermediate,Free,Descending into ML - Video Lecture,,Easy to Medium,https://developers.google.com/machine-learning...,ml
8,Machine Learning Crash Course (Foundation),Google Developers,Beginner to Intermediate,Free,Descending into ML - Linear Regression,,Easy to Medium,https://developers.google.com/machine-learning...,"ml, linear regression"
9,Machine Learning Crash Course (Foundation),Google Developers,Beginner to Intermediate,Free,Descending into ML - Training and Loss,,Easy to Medium,https://developers.google.com/machine-learning...,"ml, training, loss"
21,Machine Learning Crash Course (Foundation),Google Developers,Beginner to Intermediate,Free,Training and Test Sets - Video Lecture,,Easy,https://developers.google.com/machine-learning...,"training, test, dataset"
22,Machine Learning Crash Course (Foundation),Google Developers,Beginner to Intermediate,Free,Training and Test Sets - Splitting Data,,Easy,https://developers.google.com/machine-learning...,"training, test, dataset, splitting, data"


# cleaning up units in duration

In [141]:
lpi.Duration.value_counts()

Duration
1 min          121
2 minutes      113
5 min           91
2 min           63
1 minute        61
              ... 
1mins            1
4:34mins         1
2hrs             1
24 secs          1
90 miniutes      1
Name: count, Length: 185, dtype: int64

In [142]:
import pandas as pd
import re
import numpy as np

# Function to extract string units
def extract_unit(text):
    if isinstance(text, str):  # Check if the input is a string
        # Extract the unit (min, hour, sec, etc.)
        unit = re.search(r'[a-zA-Z]+', text).group().lower()
        return unit
    else:
        return None  # Return None if the input is not a string

# Apply the function to create a new column for units
lpi['Unit'] = lpi['Duration'].apply(extract_unit)

# Count the occurrences of each distinct unit
unit_counts = lpi['Unit'].value_counts()

print(unit_counts)

Unit
min         594
minutes     518
mins        115
minute       61
hours        19
secs          4
to            3
minutess      1
miinutes      1
hr            1
mnis          1
sec           1
hrs           1
miniutes      1
Name: count, dtype: int64


In [143]:
unit_counts.index

Index(['min', 'minutes', 'mins', 'minute', 'hours', 'secs', 'to', 'minutess',
       'miinutes', 'hr', 'mnis', 'sec', 'hrs', 'miniutes'],
      dtype='object', name='Unit')

In [144]:
time_map = {}
for i in unit_counts.index:
    time_map[i] = "abc"

time_map

{'min': 'abc',
 'minutes': 'abc',
 'mins': 'abc',
 'minute': 'abc',
 'hours': 'abc',
 'secs': 'abc',
 'to': 'abc',
 'minutess': 'abc',
 'miinutes': 'abc',
 'hr': 'abc',
 'mnis': 'abc',
 'sec': 'abc',
 'hrs': 'abc',
 'miniutes': 'abc'}

In [145]:
# just copy and paste the above and change the values to your liking like below
##### remember to hastag "to"

time_map = {'min': 'minutes',
 'minutes': 'minutes',
 'mins': 'minutes',
 'minute': 'minutes',
 'hours': 'hours',
 'secs': 'seconds',
 # 'to': 'to', ##### <----------
 'minutess': 'minutes',
 'miinutes': 'minutes',
 'hr': 'hours',
 'mnis': 'minutes',
 'sec': 'seconds',
 'hrs': 'hours',
 'miniutes': 'minutes'}


In [146]:
def duration_cleanup(text):
    if isinstance(text, str):  # Check if the input is a string
        if "to" in text:  # Skip if the string contains "to"
            return text  # You can return the original text or None, as you prefer
        # Extract the numerical value
        num = float(re.search(r'\d+', text).group())
        # Extract the unit (min, hour, sec, etc.)
        unit = time_map.get(re.search(r'[a-zA-Z]+', text).group().lower(), "unknown")
        return str(num) + " " + unit
    else:
        return None  # Return None if the input is not a string

In [147]:
# Apply the function to create a new column
lpi['Duration2'] = lpi['Duration'].apply(duration_cleanup)


In [148]:
lpi.query('Unit.str.contains("miinutes|to|mnis|miniutes|sec|hrs", case=False, na=False)').head(20)

Unnamed: 0,Course / Learning material,Source,Course Level,Type (Free or Paid),Module,Duration,Module / Sub-module \nDifficulty level,Links,Keywords / Tags / Skills / Interests / Categories,Unit,Duration2
405,Data Engineer - Serverless Data Processing wit...,Google Cloud Skill Boost,Intermediate,Free during mentorship period,Beam Portability: Quiz - Beam Portability,5 miinutes,Easy to Medium,https://www.cloudskillsboost.google/course_ses...,"Cloud, Dataflow, Severless, Data Processing, D...",miinutes,5.0 minutes
425,Data Engineer - Serverless Data Processing wit...,Google Cloud Skill Boost,Intermediate,Free during mentorship period,Serverless Data Processing with Dataflow: Deve...,35 to 36 hours,Medium to Hard,https://www.cloudskillsboost.google/course_tem...,"Cloud, Dataflow, Severless, Data Processing, D...",to,35 to 36 hours
510,Data Engineer - Quest: Perform Foundational Da...,Google Cloud Skill Boost,Beginner,Free during mentorship period,"Perform Foundational Data, ML, and AI Tasks in...",7 to 8 hours,Easy to Medium,https://www.cloudskillsboost.google/course_tem...,"Machine Learning,Deployment,Cloud, Lab, AI, ta...",to,7 to 8 hours
532,Data Engineer - Quest: Engineer Data in Google...,Google Cloud Skill Boost,Beginner,Free during mentorship period,Engineer Data in Google Cloud,5 to 6 hours,Easy to Medium,https://www.cloudskillsboost.google/course_tem...,"data engineer, data preparation, data wranglin...",to,5 to 6 hours
574,Testing and Debugging,Google Developers,Beginner to Intermediate,Free,Testing in production: Check Your Understanding:,5 mnis,Medium,https://developers.google.com/machine-learning...,"data, behavior, model, quality, dataset, predi...",mnis,5.0 minutes
662,Data Engineer - Preparing for the Google Cloud...,Google Cloud Skill Boost,Beginner,Free during mentorship period,Building and Operationalizing Data Processing ...,19sec,Medium,https://www.cloudskillsboost.google/course_ses...,"data, SQL, big query, apache airflow, pipeline...",sec,19.0 seconds
668,Data Engineer - Preparing for the Google Cloud...,Google Cloud Skill Boost,Beginner,Free during mentorship period,Operationalizing Machine Learning Models: Mach...,33secs,Medium,https://www.cloudskillsboost.google/course_ses...,"data, ml, big query, dataflow, tensorflow, Clo...",secs,33.0 seconds
681,Data Engineer - Preparing for the Google Cloud...,Google Cloud Skill Boost,Beginner,Free during mentorship period,Operationalizing Machine Learning Models: Pric...,43 secs,Medium,https://www.cloudskillsboost.google/course_ses...,"data, big query, query validator, pricing calc...",secs,43.0 seconds
684,Data Engineer - Preparing for the Google Cloud...,Google Cloud Skill Boost,Beginner,Free during mentorship period,Operationalizing Machine Learning Models: Chal...,24 secs,Medium,https://www.cloudskillsboost.google/course_ses...,"data, bigtable, cloud storage, big query, SQL,...",secs,24.0 seconds
685,Data Engineer - Preparing for the Google Cloud...,Google Cloud Skill Boost,Beginner,Free during mentorship period,Operationalizing Machine Learning Models: PDE ...,2hrs,Medium,https://www.cloudskillsboost.google/course_ses...,"data, bigtable, cloud storage, big query, SQL,...",hrs,2.0 hours


In [149]:
# Apply the function to create a new column for units
lpi[lpi['Duration2'].str.contains("unknown", na=False)]

Unnamed: 0,Course / Learning material,Source,Course Level,Type (Free or Paid),Module,Duration,Module / Sub-module \nDifficulty level,Links,Keywords / Tags / Skills / Interests / Categories,Unit,Duration2


In [150]:
# Apply the function to create a new column for units
lpi['Duration2'].apply(extract_unit).value_counts()

Duration2
minutes    1292
hours        21
seconds       5
to            3
Name: count, dtype: int64

In [151]:
lpi.head()

Unnamed: 0,Course / Learning material,Source,Course Level,Type (Free or Paid),Module,Duration,Module / Sub-module \nDifficulty level,Links,Keywords / Tags / Skills / Interests / Categories,Unit,Duration2
0,Introduction to Machine Learning,Google Developers,Beginner,Free,Introduction to Machine Learning,20 mins,Easy,https://developers.google.com/machine-learning...,ml,mins,20.0 minutes
1,Introduction to Machine Learning,Google Developers,Beginner,Free,What is Machine Learning,20 mins,Easy,https://developers.google.com/machine-learning...,ml,mins,20.0 minutes
2,Introduction to Machine Learning,Google Developers,Beginner,Free,Supervised Learning,20 mins,Easy to Medium,https://developers.google.com/machine-learning...,supervised learning,mins,20.0 minutes
3,Introduction to Machine Learning,Google Developers,Beginner,Free,Test your understanding,10 mins,Easy,https://developers.google.com/machine-learning...,ml test,mins,10.0 minutes
4,Machine Learning Crash Course (Foundation),Google Developers,Beginner to Intermediate,Free,Introduction to ML,3 mins,Easy,https://developers.google.com/machine-learning...,ml,mins,3.0 minutes


In [152]:
lpi.drop(columns=["Duration", "Unit"], axis=1, inplace=True)
lpi.rename(columns={"Duration2" : "Duration"}, inplace=True)
lpi.head()

Unnamed: 0,Course / Learning material,Source,Course Level,Type (Free or Paid),Module,Module / Sub-module \nDifficulty level,Links,Keywords / Tags / Skills / Interests / Categories,Duration
0,Introduction to Machine Learning,Google Developers,Beginner,Free,Introduction to Machine Learning,Easy,https://developers.google.com/machine-learning...,ml,20.0 minutes
1,Introduction to Machine Learning,Google Developers,Beginner,Free,What is Machine Learning,Easy,https://developers.google.com/machine-learning...,ml,20.0 minutes
2,Introduction to Machine Learning,Google Developers,Beginner,Free,Supervised Learning,Easy to Medium,https://developers.google.com/machine-learning...,supervised learning,20.0 minutes
3,Introduction to Machine Learning,Google Developers,Beginner,Free,Test your understanding,Easy,https://developers.google.com/machine-learning...,ml test,10.0 minutes
4,Machine Learning Crash Course (Foundation),Google Developers,Beginner to Intermediate,Free,Introduction to ML,Easy,https://developers.google.com/machine-learning...,ml,3.0 minutes


# viola! all cleaned up!