In [32]:
# %pip install langdetect


import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
# %pip install deep-translator
from deep_translator import GoogleTranslator
from langdetect import DetectorFactory
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Load the job listings CSV into `df_jobs`; the cells below read its
# 'job_details' and 'job_title' columns.
df_jobs = pd.read_csv('jobstreet_techjobs_Jul2025.csv')

def is_en_or_id(text):
    """Return True when `text` is detected as English ('en') or Indonesian ('id').

    Missing values (None / NaN) and blank strings return False without calling
    the detector. Any error raised during detection is also reported as False.
    """
    # A missing cell would otherwise be stringified to 'None' / 'nan' and fed
    # to the detector as if it were real text.
    if text is None or (isinstance(text, float) and text != text):
        return False

    text = str(text)
    if not text.strip():
        return False

    try:
        return detect(text) in ('en', 'id')
    except Exception:
        # Deliberately best-effort, but narrower than a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate and a long-running
        # filter can be interrupted.
        return False

# langdetect is non-deterministic by default; fixing the seed makes the set of
# rows kept by the language filter reproducible from run to run.
DetectorFactory.seed = 0

# Keep only the listings whose description is detected as English or Indonesian.
df_jobs = df_jobs[df_jobs['job_details'].apply(is_en_or_id)].reset_index(drop=True)

# General stopwords (English + Indonesian) plus custom, domain-specific ones.
stop_words = set(stopwords.words('english')).union(set(stopwords.words('indonesian')))

# All entries are lower-case because the text is lower-cased before filtering.
# NOTE(review): the multi-word entries ('job description', 'apply now', ...)
# cannot match a token-level check such as `word not in all_stopwords`, which
# only ever compares single tokens. They are kept as-is; confirm whether they
# should be split into single words or removed as phrases before tokenising.
custom_stopwords = {
    'pt', 'cv', 'tbk', 'persero', 'dkk', 'dll', 'dst',
    'required', 'requirement', 'requirements',
    'qualifications', 'qualification', 'kualifikasi', 'job description',
    'primary', 'main',
    'responsibility', 'responsibilities',
    'benefits', 'benefit', 'tanggung jawab',
    'gaji pokok', 'gaji bulanan', 'gaji per bulan',
    'gaji per tahun', 'gaji per jam', 'gaji per hari',
    'uang makan', 'uang transport', 'uang lembur', 'bonus',
    'tunjangan hari raya', 'thr', 'seeking', 'join', 'tugas', 'welcome',
    'apply', 'apply now', 'apply here', 'apply today', 'apply online',
    'utama',
}
all_stopwords = stop_words.union(custom_stopwords)

# Text normalisation.
# Compiled once instead of on every call. Only real URLs are matched: the
# scheme must be followed by '://' (or the text must start with 'www.'), so
# ordinary words such as 'http', 'https' or 'httpclient' are left alone.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def normalize_text(text):
    """Lower-case `text` and reduce it to space-separated runs of a-z letters.

    Steps: lower-case, drop URLs, turn every character that is not a-z or
    whitespace (digits, punctuation, emoji, non-ASCII letters) into a space,
    then collapse whitespace. Replacing with a space rather than deleting keeps
    words separated only by punctuation apart ("sql/python" -> "sql python").

    Missing values (None / NaN) yield an empty string; other non-string input
    is converted with `str()` first.
    """
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = _URL_PATTERN.sub(' ', text)
    # Emoji need no dedicated pattern: they are not in a-z, so this step
    # already removes them.
    text = re.sub(r'[^a-z\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# The English/Indonesian filter has already been applied to `df_jobs` right
# after loading, so it is not repeated here: a second langdetect pass over
# every description is slow and, because the detector is not deterministic
# unless seeded, could drop a different set of rows than the first pass.
df_jobs['job_details'] = df_jobs['job_details'].apply(normalize_text)

# Collect every non-stopword token across all normalised job descriptions.
all_words = []
for text in df_jobs['job_details']:
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in all_stopwords]
    all_words.extend(filtered_tokens)


titles = df_jobs['job_title'].astype(str).tolist()

# Translate each distinct title once with a single translator instance and map
# the results back onto the full list: every call is a network round-trip, so
# repeated titles should not be re-sent.
_title_translator = GoogleTranslator(source='auto', target='en')
_translation_by_title = {
    # Fall back to the original title if the service returns nothing, so the
    # vectorizer never receives None or an empty document.
    unique_title: (_title_translator.translate(unique_title) or unique_title)
    for unique_title in dict.fromkeys(titles)  # de-duplicate, keep first-seen order
}
translated_titles = [_translation_by_title[job_title] for job_title in titles]


# Count 2- to 4-word phrases in the translated job titles.
vectorizer = CountVectorizer(ngram_range=(2, 4), stop_words=list(all_stopwords), lowercase=True)

X = vectorizer.fit_transform(translated_titles)

# Column totals: one count per phrase in the vocabulary.
sum_words = X.sum(axis=0)

# Pair each phrase with its total, then keep the 90 most frequent. The sort is
# stable, so phrases with equal counts stay in vocabulary iteration order.
phrases_freq = [
    (term, sum_words[0, column])
    for term, column in vectorizer.vocabulary_.items()
]
phrases_freq.sort(key=lambda pair: pair[1], reverse=True)
phrases_freq = phrases_freq[:90]

for phrase, freq in phrases_freq:
    print(f"{phrase}: {freq}")

technical support: 10
support engineer: 10
d3 data: 5
data analyst: 5
analyst production: 5
production staff: 5
staff code: 5
code prod: 5
prod da: 5
d3 data analyst: 5
data analyst production: 5
analyst production staff: 5
production staff code: 5
staff code prod: 5
code prod da: 5
d3 data analyst production: 5
data analyst production staff: 5
analyst production staff code: 5
production staff code prod: 5
staff code prod da: 5
business analyst: 5
analyst level: 5
business analyst level: 5
helpdesk engineer: 5
engineer technical: 5
helpdesk engineer technical: 5
engineer technical support: 5
technical support engineer: 5
helpdesk engineer technical support: 5
engineer technical support engineer: 5
operation edp: 5
edp support: 5
support staff: 5
operation edp support: 5
edp support staff: 5
operation edp support staff: 5
staff product: 5
product strategy: 5
staff product strategy: 5
customer care: 5
care experience: 5
experience associate: 5
customer care experience: 5
care experience 



In [26]:
# Load the course details CSV.
df_course = pd.read_csv('dicoding_course_details.csv')

# Strip digits from 'topic_tags', then collapse the empty slots they leave
# behind. The pattern consumes a whole run of commas at once; replacing only
# `,\s*,` pairs would leave ',,' behind for three or more commas in a row,
# because regex matches do not overlap.
df_course['topic_tags'] = (
    df_course['topic_tags']
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r',(?:\s*,)+', ',', regex=True)
    .str.strip(', ')
)

# Save the DataFrame after removing numbers from 'topic_tags'.
# NOTE(review): this overwrites the same file that was just read, so the raw
# data is not recoverable from disk afterwards. Kept as-is in case other cells
# re-read this path; consider writing to a separate "clean" file instead.
df_course.to_csv('dicoding_course_details.csv', index=False)


print("Unique values in 'topic_tags':")
print(df_course['topic_tags'].unique())

print("\nUnique values in 'learning_path':")
print(df_course['learning_path'].unique())


Unique values in 'topic_tag':
['Data' 'Full Stack, Front End' 'Computer Science, Softskill'
 'Data, Programming Language' 'AI, Data' 'Full Stack'
 'Machine Learning, Data' 'Programming Language' 'AI, Machine Learning'
 'Full Stack, Cloud Computing' 'Android, Machine Learning' 'Softskill'
 'Android' 'Full Stack, Back End' 'AI' 'Machine Learning, Flutter'
 'Full Stack, DevOps' 'Computer Science, DevOps' 'Cloud Computing, DevOps'
 'Cloud Computing' 'UI/UX' 'Flutter, Mobile Development'
 'Android, Mobile Development' 'iOS, Android' 'Android, Flutter' 'iOS'
 'Cloud Computing, AWS']

Unique values in 'learning_path':
['Data Scientist' nan 'Back-End  JavaScript' 'DevOps Engineer' 'React'
 'Android' 'Google Cloud Professional' 'Multi-Platform App'
 'Machine Learning Engineer' 'Machine Learning Engineer, Data Scientist'
 'Android, iOS, Multi-Platform App' 'iOS' 'Front-End Web, React'
 'React, Front-End Web' 'Front-End Web' 'Back-End, JavaScript'
 'Google Cloud Professional, React, Front-End Web

In [23]:
import pandas as pd
import numpy as np
import re # Import regex module

# NOTE: this cell rebinds `df_course` to a small hand-written sample frame; it
# does not read the course CSV. The sample deliberately contains messy
# 'learning_path' values (stray commas, trailing newlines, the literal text
# 'nan', a real NaN, and several paths fused into one string) so the cleaning
# code that follows can be exercised on known inputs.
data = {
    'learning_path': [
        'Data Scientist',
        'Data Scientist nan Back-End JavaScript DevOps Engineer React', # several known paths fused with spaces, plus a stray 'nan'
        'Android \n',
        ', , iOS \n',
        'Front-End Web \n',
        ' , React \n',
        'Google Cloud Professional\n',
        np.nan,
        'Front-End Web',
        'React',
        'Data Scientist , , Multi-Platform App',
        'Front-End Web , React ,',
        'AWS',
        'nan',
        'Full Stack Developer Backend', # multi-word entry that is not in the known-paths list
        'UI/UX Designer Frontend'
    ],
    # 16 values, one per 'learning_path' entry above.
    'some_other_column': range(1, 17)
}
df_course = pd.DataFrame(data)

# Show the raw values before any cleaning.
print("Original unique values in 'learning_path':")
print(df_course['learning_path'].unique())
print("-" * 50)

# Learning-path names that must be kept intact rather than split on spaces.
# Each name is listed once; 'Machine Learning Engineer' is included because it
# occurs in the real 'learning_path' column and would otherwise be broken into
# three separate words.
known_paths = [
    'Data Scientist', 'JavaScript', 'DevOps Engineer', 'React',
    'Android', 'iOS', 'Front-End Web', 'Google Cloud Professional',
    'Multi-Platform App', 'Back-End', 'Machine Learning Engineer',
]

# Alternation of the escaped names, longest first so a longer name is tried
# before any shorter one, wrapped in word boundaries so a name is not matched
# inside a longer word.
known_paths_pattern = r'\b(?:' + '|'.join(re.escape(p) for p in sorted(known_paths, key=len, reverse=True)) + r')\b'

# Normalise 'learning_path' to plain strings: missing values become '', and
# cells that are exactly the text 'nan' are blanked as well.
df_course['learning_path'] = (
    df_course['learning_path']
    .fillna('')
    .astype(str)
    .replace('nan', '', regex=False)
)

def advanced_split_and_clean(text, pattern=None):
    """Split one raw 'learning_path' cell into a list of cleaned entries.

    Known paths are pulled out first via `pattern`; whatever text is left is
    split on whitespace, commas and slashes.

    Parameters
    ----------
    text : object
        Raw cell value. Anything that is not a `str` yields an empty list.
    pattern : str, optional
        Regular expression (as a string) matching the known paths. Defaults to
        the module-level `known_paths_pattern`. Matching is case-insensitive.

    Returns
    -------
    list of str
        Known paths in order of appearance (spelled as they occur in `text`),
        followed by the leftover words. Leftover words shorter than two
        characters and the word 'nan' (any case) are dropped.
    """
    if not isinstance(text, str):
        return []
    if pattern is None:
        pattern = known_paths_pattern

    text = text.strip()

    # Known phrases, exactly as they appear in the text.
    extracted_items = re.findall(pattern, text, flags=re.IGNORECASE)

    # Blank out the very spans that matched. Removing "the first occurrence of
    # the matched string" instead is wrong: that search has no word
    # boundaries, so for 'Reactive React' it would cut 'React' out of
    # 'Reactive' and leave the real match behind to be counted twice.
    remaining_text = re.sub(pattern, ' ', text, flags=re.IGNORECASE)

    # Whatever is left is split on whitespace, commas and slashes.
    extracted_items.extend(
        word
        for word in re.split(r'[\s,/]+', remaining_text)
        if len(word) > 1 and word.lower() != 'nan'
    )

    # Final tidy-up: single spaces inside an item, no stray commas/spaces at
    # the edges, and no empty or 'nan' items.
    cleaned_final_items = []
    for item in extracted_items:
        item = re.sub(r'\s+', ' ', item).strip(', ')
        if item and item.lower() != 'nan':
            cleaned_final_items.append(item)

    return cleaned_final_items

# Turn every raw cell into a list of cleaned learning-path entries.
df_course['learning_path'] = df_course['learning_path'].apply(advanced_split_and_clean)

# One row per entry, followed by a second, defensive clean-up pass.
s = (
    df_course['learning_path']
    .explode()
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)               # runs of whitespace -> one space
    .str.replace(r'[\n\r]+', '', regex=True)            # leftover newline / carriage-return chars
    .str.replace(r'^\s*,\s*|\s*,\s*$', '', regex=True)  # edge commas with surrounding spaces
    .str.replace(r'^,+|,+$', '', regex=True)            # edge commas without spaces
    .str.strip(', ')
)

# Discard empty strings first, then any remaining NaN rows.
s = s[s != ''].dropna()

# Distinct entries, forced to `str` so that sorting cannot hit mixed types.
cleaned_unique_learning_paths = list(map(str, s.unique()))

print("Cleaned unique values in 'learning_path':")
for item in sorted(cleaned_unique_learning_paths):
    print(f"'{item}'")

print("-" * 50)
print("\nOriginal DataFrame 'learning_path' after initial string and list conversion (before explode):")
print(df_course['learning_path'])

Original unique values in 'learning_path':
['Data Scientist'
 'Data Scientist nan Back-End JavaScript DevOps Engineer React'
 'Android \n' ', , iOS \n' 'Front-End Web \n' ' , React \n'
 'Google Cloud Professional\n' nan 'Front-End Web' 'React'
 'Data Scientist , , Multi-Platform App' 'Front-End Web , React ,' 'AWS'
 'nan' 'Full Stack Developer Backend' 'UI/UX Designer Frontend']
--------------------------------------------------
Cleaned unique values in 'learning_path':
'AWS'
'Android'
'Back-End'
'Backend'
'Data Scientist'
'Designer'
'DevOps Engineer'
'Developer'
'Front-End Web'
'Frontend'
'Full'
'Google Cloud Professional'
'JavaScript'
'Multi-Platform App'
'React'
'Stack'
'UI'
'UX'
'iOS'
--------------------------------------------------

Original DataFrame 'learning_path' after initial string and list conversion (before explode):
0                                      [Data Scientist]
1     [Data Scientist, Back-End, JavaScript, DevOps ...
2                                           