In [None]:
# Environment setup: install BERTopic, then the CUDA 11 builds of cuDF, dask-cuDF,
# cuML and cuGraph from the NVIDIA package index, and finally upgrade CuPy.
# NOTE(review): no versions are pinned, so a later rerun may resolve different releases.
!pip install bertopic
!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com
# NOTE(review): the extra wheel location below ends in "aarch64" — confirm it matches the runtime's CPU architecture.
!pip install --upgrade cupy-cuda11x -f https://pip.cupy.dev/aarch64

In [None]:
import pandas as pd
import glob
import os
from google.colab import files
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Upload the file manually
import io

uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; expected the preprocessed jobs CSV.")

# Parse the uploaded bytes instead of opening a hard-coded path. When a file with
# the same name already exists, Colab stores the new upload under a different name
# (e.g. "Preprocessed_jobs (1).csv"), so reading "Preprocessed_jobs.csv" from disk
# would silently load the older copy rather than the file that was just uploaded.
uploaded_bytes = next(iter(uploaded.values()))
df = pd.read_csv(io.BytesIO(uploaded_bytes))

Saving Preprocessed_jobs.csv to Preprocessed_jobs (1).csv


In [None]:
# Shared NLTK resources for text cleaning: a WordNet lemmatizer and the
# English stopword list held as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def standardize_text(text):
    """Tokenize ``text``, drop English stopwords and lemmatize the remaining tokens.

    Args:
        text: The description to clean. Any non-string value (for example a
            missing cell that pandas reads as NaN) is treated as empty text.

    Returns:
        The lemmatized, stopword-free tokens joined by single spaces, or ""
        when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN / None; the tokenizer only accepts strings.
    if not isinstance(text, str):
        return ""

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize. The stopword list is lowercase, so the
    # membership test lowercases the token; otherwise "The" or "AND" would survive.
    cleaned_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.lower() not in stop_words
    ]
    return " ".join(cleaned_tokens)

# Work on a copy so the uploaded frame `df` keeps its original descriptions,
# then overwrite the description column of the copy with its standardized form.
df_clean = df.copy()
standardized_descriptions = df_clean['Cleaned Job Description'].apply(standardize_text)
df_clean['Cleaned Job Description'] = standardized_descriptions

In [None]:
# Extracting specific phrases
def extract_specific_phrases(text_list, keywords, ngram_range=(1, 3), n_features=10000, n_top_phrases=50):
    """
    Extract specific n-grams related to provided keywords using TF-IDF.

    Args:
        text_list: Iterable of document strings.
        keywords: Iterable of keyword strings. An n-gram is kept when it
            contains any keyword as a substring (so "processing" also keeps
            "preprocessing"). Keywords are lowercased before matching.
        ngram_range: (min_n, max_n) n-gram sizes passed to the vectorizer.
        n_features: Vocabulary cap passed to the vectorizer as ``max_features``.
        n_top_phrases: Maximum number of phrases to return.

    Returns:
        Up to ``n_top_phrases`` matching n-grams, ordered by descending mean
        TF-IDF score over all documents. Returns [] when there are no
        documents or no keywords.
    """
    documents = list(text_list)
    # The vectorizer lowercases its vocabulary by default, so a keyword written
    # with capitals could never be found inside a term unless it is lowercased too.
    lowered_keywords = [keyword.lower() for keyword in keywords]
    if not documents or not lowered_keywords:
        # Nothing to score or nothing to match; avoid fitting on empty input.
        return []

    # TF-IDF vectorizer focusing on n-grams
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=n_features, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(documents)
    terms = vectorizer.get_feature_names_out()

    # Mean TF-IDF score per term (one value per vocabulary column)
    scores = tfidf_matrix.mean(axis=0).tolist()[0]

    # Keep only terms containing a keyword, pairing each with its score in a
    # single pass instead of re-scanning a list of relevant terms for every term.
    term_scores = {
        term: score
        for term, score in zip(terms, scores)
        if any(keyword in term for keyword in lowered_keywords)
    }

    # Highest-scoring phrases first; ties keep vocabulary order (stable sort)
    sorted_terms = sorted(term_scores.items(), key=lambda item: item[1], reverse=True)
    return [term for term, _ in sorted_terms[:n_top_phrases]]

# Keywords that decide which n-grams count as domain-relevant
domain_keywords = [
    "data science",
    "data analysis",
    "generative ai",
    "llm",
    "machine learning",
    "cloud",
    "engineering",
    "processing",
    "data visualization",
    "data optimization",
]

# Pull the 100 best-scoring uni- to tri-grams that mention one of those keywords
clean_refined_phrases = extract_specific_phrases(
    text_list=df_clean['Cleaned Job Description'].tolist(),
    keywords=domain_keywords,
    ngram_range=(1, 3),
    n_top_phrases=100,
)

# Show the extracted phrases
clean_refined_phrases

['machine learning',
 'engineering',
 'generative ai',
 'llm',
 'data science',
 'cloud',
 'processing',
 'language processing',
 'natural language processing',
 'data analysis',
 'machine learning model',
 'software engineering',
 'model llm',
 'language model llm',
 'data engineering',
 'engineering team',
 'science engineering',
 'machine learning engineer',
 'data visualization',
 'cloud platform',
 'data processing',
 'engineering related',
 'computer engineering',
 'ai machine learning',
 'generative ai solution',
 'language processing nlp',
 'processing nlp',
 'computer science engineering',
 'generative ai model',
 'machine learning algorithm',
 'electrical engineering',
 'prompt engineering',
 'experience machine learning',
 'cloud computing',
 'experience generative ai',
 'generative ai technology',
 'machine learning deep',
 'engineering related field',
 'engineering computer',
 'engineering experience',
 'preprocessing',
 'science computer engineering',
 'cloudbased',
 'mac

In [None]:
# Reference set used for the analysis: maps each competence category to the list of skills/terms that belong to it
reference_set = {
    "Programming": [
        "Python", "R", "SQL", "Java", "C++", "JavaScript", "C#", "MATLAB",
        "Perl", "Ruby", "Go", "Scala", "Swift", "HTML", "CSS", "Dart",
        "Kotlin", "Shell Scripting", "Rust", "TypeScript", "Bash", "Fortran",
        "Lua", "VBScript", "Julia", "Assembly", "F#", "Delphi", "Objective-C",
        "COBOL"
    ],
    "Mathematics & Statistics": [
        "Probability", "Linear Algebra", "Hypothesis Testing",
        "Descriptive Analytics", "Statistical Modeling", "Bayesian Statistics",
        "Monte Carlo Simulation", "Optimization Techniques", "Game Theory",
        "Markov Chains", "Time Series Analysis", "Stochastic Processes",
        "Cluster Analysis", "Principal Component Analysis (PCA)",
        "Dimensionality Reduction", "Numerical Analysis", "Regression Analysis",
        "Variance Analysis", "Matrix Decomposition", "Graph Theory",
        "Probability Distributions", "Sampling Methods", "ANOVA",
        "Non-parametric Statistics", "Factor Analysis", "Spatial Statistics",
        "Quantitative Methods", "Predictive Models", "Statistical Inference",
        "Mathematical Programming"
    ],
    "Machine Learning": [
        "Logistic Regression", "Random Forests", "Neural Networks",
        "Supervised Learning", "Unsupervised Learning", "Reinforcement Learning",
        "Model Deployment", "Predictive Analytics", "Prescriptive Analytics",
        "Support Vector Machines (SVM)", "Gradient Boosting", "XGBoost",
        "CatBoost", "LightGBM", "K-Nearest Neighbors", "Clustering Techniques",
        "AutoML", "Model Optimization", "Deep Reinforcement Learning",
        "GANs (Generative Adversarial Networks)", "Transfer Learning",
        "Hyperparameter Tuning", "Semi-supervised Learning",
        "Anomaly Detection", "Online Learning", "Active Learning",
        "Feature Selection", "Time Series Forecasting", "Bagging",
        "Ensemble Methods"
    ],
    "Data Manipulation": [
        "Data Cleaning", "Feature Engineering", "Data Wrangling",
        "Data Integration", "Data Transformation", "Dimensionality Reduction",
        "Data Sampling", "Data Normalization", "Data Imputation",
        "Pivot Tables", "Data Merging", "Data Aggregation", "Outlier Detection",
        "Reshaping Data", "Slicing and Indexing", "Handling Missing Data",
        "Data Encoding", "One-Hot Encoding", "Binning", "Data Scaling",
        "Data Partitioning", "ETL Processes", "Data Augmentation",
        "Feature Scaling", "Cross Validation", "Data Reduction",
        "Data Balancing", "Data Resampling", "Data Annotation",
        "Data Schema Design"
    ],
    "Data Management": [
        "Data Governance", "Metadata Management", "FAIR Data Principles",
        "Data Provenance", "Data Warehousing", "Data Lakes",
        "Data Curation", "Data Quality Assurance", "Data Stewardship",
        "Master Data Management", "Data Lineage", "ETL Pipelines",
        "Data Versioning", "Backup and Recovery", "Access Control",
        "Data Partitioning", "Schema Evolution", "Distributed Databases",
        "Data Archiving", "Data Replication", "Stream Processing",
        "Data Security", "Data Auditing", "Data Masking",
        "Data Sensitivity Management", "Data Dictionary",
        "Hierarchical Databases", "Relational Databases",
        "Object-Oriented Databases", "Graph Databases"
    ],
    "Generative AI": [
        "Transformer Architectures", "Fine-tuning", "Prompt Engineering",
        "Tokenization", "LLM Applications", "GPT Models", "BERT Models",
        "Zero-Shot Learning", "Few-Shot Learning", "Masked Language Modeling",
        "Seq2Seq Models", "Text-to-Image Generation", "GANs for Images",
        "Diffusion Models", "Natural Language Generation", "Speech Synthesis",
        "Image Captioning", "Audio Generation", "Pretrained Models",
        "Text Summarization", "Knowledge Distillation", "Model Pruning",
        "Language Understanding", "Data Augmentation for AI",
        "Contrastive Learning", "Episodic Memory Models", "Autoregressive Models",
        "Bidirectional Models", "Language Modeling", "Text Classification",
        "Multimodal AI"
    ],
    "Visualization & Storytelling": [
        "Data Visualization", "Dashboard Design", "Data Storytelling",
        "Interactive Visualizations", "Heatmaps", "Choropleth Maps",
        "Scatter Plots", "Bar Charts", "Line Charts", "Pie Charts",
        "Treemaps", "Histograms", "Box Plots", "Waterfall Charts",
        "Bubble Charts", "Word Clouds", "Network Graphs", "Geospatial Visuals",
        "3D Visualizations", "Annotated Charts", "Drill-Down Visuals",
        "Real-Time Dashboards", "Storyboarding", "Infographics",
        "Time-Series Visualizations", "Cross-Filtering", "Multi-View Charts",
        "Visual Encodings", "Interactive Dashboards", "Custom Visuals"
    ],
    "Ethics & Compliance": [
        "Bias Mitigation", "Data Privacy", "Ethical AI", "GDPR Compliance",
        "Responsible AI", "IPR Protection", "Algorithmic Fairness",
        "Transparency in AI", "Explainable AI", "Adversarial Robustness",
        "Cybersecurity", "Data Security", "Informed Consent",
        "AI Regulations", "Data Ownership", "AI Policy Making",
        "Ethical Decision Making", "Sustainable AI",
        "Ethics in Machine Learning", "Fairness Metrics",
        "Ethical Use of Data", "Data Sharing Agreements",
        "Legal Compliance", "Open Data Principles",
        "Bias Testing Tools", "AI Auditing", "Unbiased Datasets",
        "Stakeholder Analysis", "Disparate Impact Analysis",
        "Moral Responsibility"
    ],
    "Business Acumen": [
        "Understanding Business Problems", "Domain Knowledge",
        "Strategic Thinking", "Decision Making",
        "Business Process Management", "Business Analytics",
        "Market Analysis", "Financial Forecasting", "Competitor Analysis",
        "Risk Management", "Customer Insights", "Operational Efficiency",
        "Cost-Benefit Analysis", "Business Continuity Planning",
        "SWOT Analysis", "Stakeholder Mapping", "Value Proposition Design",
        "Revenue Modeling", "Pricing Strategies", "Sales Forecasting",
        "Data-Driven Strategy", "ROI Analysis", "Supply Chain Optimization",
        "Change Management", "Customer Journey Mapping",
        "Business Case Development", "Lean Management", "KPI Management",
        "Balanced Scorecard"
    ],
    "Communication Skills": [
        "Presenting Findings", "Technical Writing", "Stakeholder Engagement",
        "Collaborative Skills", "Cross-disciplinary Communication",
        "Public Speaking", "Negotiation Skills", "Team Leadership",
        "Listening Skills", "Conflict Resolution", "Feedback Giving",
        "Visual Storytelling", "Crisis Communication",
        "Documentation Skills", "Knowledge Sharing", "Empathy in Communication",
        "Persuasion Skills", "Non-Verbal Communication", "Active Listening",
        "Pitching Ideas", "Workshop Facilitation", "Meeting Moderation",
        "Inclusive Communication", "Cultural Sensitivity", "Interpersonal Skills",
        "Brainstorming Techniques", "Storyboarding", "Customer Communication",
        "Clarifying Questions", "Audience Analysis"
    ],
    "Software Engineering": [
        "Version Control", "Testing and Debugging", "Agile Methodologies",
        "API Development", "Big Data Systems Engineering",
        "Microservices Architecture", "Continuous Integration",
        "Continuous Deployment", "Infrastructure as Code",
        "Object-Oriented Programming", "Functional Programming",
        "Design Patterns", "Code Reviews", "Unit Testing",
        "Integration Testing", "Performance Testing",
        "Security Testing", "CI/CD Pipelines", "Software Documentation",
        "Code Optimization", "Embedded Systems", "Software Refactoring",
        "Scalability Engineering", "Software Architecture",
        "Event-Driven Programming", "Multi-threading",
        "Asynchronous Programming", "Concurrency", "Distributed Systems",
        "Cloud-Native Development"
    ],
    "Cloud Computing": [
        "AWS", "Google Cloud Platform", "Microsoft Azure", "Cloud Infrastructure",
        "Data Pipeline Development", "Distributed Systems",
        "Serverless Computing", "Hybrid Cloud", "Private Cloud",
        "Public Cloud", "Cloud Storage", "Infrastructure as a Service (IaaS)",
        "Platform as a Service (PaaS)", "Software as a Service (SaaS)",
        "Cloud Security", "Virtual Machines", "Kubernetes",
        "Docker Containers", "Load Balancing", "Networking in Cloud",
        "Cloud Monitoring", "Cloud Automation", "Cloud Backup Solutions",
        "Disaster Recovery", "Edge Computing", "Cloud Migration",
        "Identity and Access Management (IAM)", "Multi-Cloud Strategies",
        "Resource Scaling", "Cloud Optimization"
    ],
    "Research Methods & Project Management": [
        "Research Hypothesis Formulation", "Experiment Design",
        "Project Planning", "Team Collaboration", "Research Reproducibility",
        "Open Science Principles", "Scientific Data Lifecycle Management",
        "Risk Assessment", "Stakeholder Management", "Milestone Tracking",
        "Deliverable Planning", "Agile Project Management",
        "Waterfall Project Management", "SCRUM Framework",
        "Kanban Boards", "Gantt Charts", "Critical Path Method",
        "Resource Allocation", "Project Reporting", "Task Prioritization",
        "Budgeting", "Progress Monitoring", "Project Scope Management",
        "Change Requests", "Project Documentation", "Project Closure",
        "Project Handover", "Iterative Development", "Time Management",
        "Team Retrospectives"
    ],
    "Big Data Technologies": [
        "Hadoop", "Spark", "Kafka", "MapReduce", "TensorFlow", "Scikit-learn",
        "Data Stream Processing", "High-Performance Networks", "Pig",
        "Hive", "Flume", "HBase", "Cassandra", "Flink",
        "Dask", "NiFi", "BigQuery", "Dataflow", "Storm",
        "ElasticSearch", "Snowflake", "Redshift", "Presto", "Delta Lake",
        "Vertica", "Kudu", "Mesos", "Impala", "Zookeeper",
        "Bigtable", "Oozie", "Sqoop", "AWS Glue", "Google Data Studio"
    ],
    "Domain-Specific Knowledge": [
        "Healthcare Analytics", "Finance Analytics", "E-commerce Analytics",
        "Government Data Systems", "Open Data Utilization", "IoT Analytics",
        "Social Media Analytics", "Marketing Analytics", "Energy Analytics",
        "Telecommunications Analytics", "Retail Analytics",
        "Supply Chain Analytics", "Education Analytics", "Transportation Analytics",
        "Climate Data Analysis", "Agriculture Analytics", "Real Estate Analytics",
        "Geospatial Data Analysis", "Cybersecurity Analytics",
        "Sports Analytics", "Entertainment Analytics", "Automotive Analytics",
        "Travel Analytics", "Manufacturing Analytics", "Legal Analytics",
        "Insurance Analytics", "Gaming Analytics", "Aviation Analytics",
        "Military Data Analysis", "Smart City Data Analysis"
    ]
}

In [None]:
# Flatten the expanded reference set into a single list of competences for analysis.
# Some skills are listed under more than one category (e.g. "Data Partitioning"),
# so de-duplicate while keeping first-seen order; otherwise every description
# mentioning such a skill would be counted twice.
expanded_competences = list(dict.fromkeys(
    skill.lower() for skills in reference_set.values() for skill in skills
))

# Compile one pattern per competence up front instead of once per description.
# Lookarounds replace \b because \b needs a word character on one side of the
# boundary, so names ending in punctuation ("c++", "c#", "... (pca)") could never
# match. For names that start and end with a word character the two are equivalent.
skill_patterns = {
    skill: re.compile(rf'(?<!\w){re.escape(skill)}(?!\w)', re.IGNORECASE)
    for skill in expanded_competences
}

# Initialize a Counter to track skill mentions using the expanded reference set
expanded_skill_counter = Counter()

# Count, for every competence, the number of non-missing descriptions mentioning it
for description in df["Cleaned Job Description"].dropna():
    for skill, pattern in skill_patterns.items():
        if pattern.search(description):
            expanded_skill_counter[skill] += 1

# Convert the results to a DataFrame for better visualization
expanded_competence_df = pd.DataFrame(expanded_skill_counter.items(), columns=["Competence", "Frequency"])
expanded_competence_df = expanded_competence_df.sort_values(by="Frequency", ascending=False)

In [None]:
expanded_competence_df

Unnamed: 0,Competence,Frequency
0,python,277
3,tensorflow,120
25,sql,109
7,aws,104
45,r,66
...,...,...
152,lightgbm,1
153,data normalization,1
155,data lineage,1
157,performance testing,1


Update the reference set with Word2Vec: for each competence in the reference set, find similar words in the job descriptions.

In [None]:
# Tokenize every non-missing cleaned description so Word2Vec can consume it
job_descriptions = df_clean["Cleaned Job Description"].dropna().tolist()
tokenized_descriptions = list(map(simple_preprocess, job_descriptions))

# Fit Word2Vec on the tokenized descriptions: 100-dimensional vectors,
# context window of 5, words seen fewer than 2 times ignored, 10 epochs, 4 workers
w2v_model = Word2Vec(
    sentences=tokenized_descriptions,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=10,
)

# Look up the nearest neighbours of a word in the trained Word2Vec model
def find_similar_words(word, topn=5):
    """Return up to ``topn`` words most similar to ``word``; [] if the lookup raises KeyError."""
    try:
        neighbours = w2v_model.wv.most_similar(word, topn=topn)
    except KeyError:
        # The model does not know this word
        return []
    return [neighbour for neighbour, _similarity in neighbours]

# Generate similar words for each competence in the reference set.
# A dict is used as an ordered set: the reference competences come first and the
# newly found neighbours follow in discovery order. Going through a plain set
# can reorder the strings from one kernel session to the next, which made the
# preview below (and everything built from this list) change between runs.
expanded_competences_with_synonyms = dict.fromkeys(expanded_competences)
for skill in expanded_competences:
    similar_words = find_similar_words(skill, topn=3)  # Find 3 most similar words
    expanded_competences_with_synonyms.update(dict.fromkeys(similar_words))

# Convert the ordered keys back to a list for further processing
expanded_competences_with_synonyms = list(expanded_competences_with_synonyms)

# High-level categories are the keys of reference_set; derive them from it so
# this set cannot drift from the dictionary when a category is added or renamed.
high_level_categories = set(reference_set)

# Lowercase the category names once instead of once per candidate term
lowercase_categories = {category.lower() for category in high_level_categories}

# Filter out high-level categories from the expanded set
filtered_expanded_competences = [
    term for term in expanded_competences_with_synonyms
    if term.lower() not in lowercase_categories
]

# Display a subset of the filtered expanded competences
filtered_expanded_competences[:50]  # Display first 50 terms for review

['infographics',
 'etl processes',
 'unbiased datasets',
 'autoregressive models',
 'multi-cloud strategies',
 'algebra',
 'css',
 'language modeling',
 'html',
 'redshift',
 'microservices architecture',
 'telecommunications analytics',
 'stakeholder management',
 'graph',
 'linear',
 'bar charts',
 'bubble charts',
 'rust',
 'dart',
 'word clouds',
 'edge computing',
 'testing and debugging',
 'access control',
 'business process management',
 'software documentation',
 'bias testing tools',
 'supply chain optimization',
 'regression analysis',
 'data security',
 'transparency in ai',
 'bert models',
 'graph theory',
 'value proposition design',
 'multimodal ai',
 'deepspeed',
 'box plots',
 'data stream processing',
 'database',
 'anomaly detection',
 'relational databases',
 'data sensitivity management',
 'data pipeline development',
 'data schema design',
 'vbscript',
 'roi analysis',
 'perl',
 'histograms',
 'model optimization',
 'fusion',
 'zero-shot learning']

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def categorize_phrases(reference_set, filtered_expanded_competences, clean_refined_phrases):
    """Assign new phrases to the most similar category of the reference set.

    Category names and candidate phrases are embedded together with TF-IDF, and
    each phrase joins the category whose name has the highest cosine similarity
    to it.

    Args:
        reference_set: Dict mapping category name -> list of skills.
        filtered_expanded_competences: Iterable of candidate phrases.
        clean_refined_phrases: Iterable of further candidate phrases.

    Returns:
        A new dict with the same categories. Every skill list is a copy of the
        input list followed by the phrases assigned to that category; the input
        dict and its lists are left untouched.

    Notes:
        * A phrase already present in any category (compared case-insensitively)
          is skipped, so "css" is not added next to "CSS".
        * A phrase that shares no token with any category name has similarity 0
          everywhere; it is left out rather than defaulting to the first category.
    """
    categories = list(reference_set)

    # dict.copy() is shallow: appending to its lists would also grow the caller's
    # reference_set (and keep growing it on every rerun). Copy each list instead.
    updated_reference_set = {
        category: list(skills) for category, skills in reference_set.items()
    }

    # Lowercased skills already known anywhere in the reference set; also used to
    # drop repeated candidates.
    seen = {skill.lower() for skills in reference_set.values() for skill in skills}

    candidates = []
    for phrase in list(filtered_expanded_competences) + list(clean_refined_phrases):
        key = phrase.lower()
        if key not in seen:
            seen.add(key)
            candidates.append(phrase)

    if not categories or not candidates:
        return updated_reference_set

    # Fit one vocabulary over category names and candidates so both share a space
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(categories + candidates)

    # One call yields a (n_candidates, n_categories) similarity table
    n_categories = len(categories)
    similarities = cosine_similarity(tfidf_matrix[n_categories:], tfidf_matrix[:n_categories])

    for phrase, phrase_similarities in zip(candidates, similarities):
        best_index = phrase_similarities.argmax()
        # argmax of an all-zero row is 0, which carries no evidence for category 0
        if phrase_similarities[best_index] <= 0:
            continue
        updated_reference_set[categories[best_index]].append(phrase)

    return updated_reference_set

In [None]:
# Categorize the expanded competences and refined phrases against reference_set
# (all three inputs are defined in earlier cells), then display the result
updated_reference_set = categorize_phrases(
    reference_set=reference_set,
    filtered_expanded_competences=filtered_expanded_competences,
    clean_refined_phrases=clean_refined_phrases,
)
updated_reference_set

{'Programming': ['Python',
  'R',
  'SQL',
  'Java',
  'C++',
  'JavaScript',
  'C#',
  'MATLAB',
  'Perl',
  'Ruby',
  'Go',
  'Scala',
  'Swift',
  'HTML',
  'CSS',
  'Dart',
  'Kotlin',
  'Shell Scripting',
  'Rust',
  'TypeScript',
  'Bash',
  'Fortran',
  'Lua',
  'VBScript',
  'Julia',
  'Assembly',
  'F#',
  'Delphi',
  'Objective-C',
  'COBOL',
  'infographics',
  'etl processes',
  'unbiased datasets',
  'autoregressive models',
  'algebra',
  'css',
  'language modeling',
  'html',
  'redshift',
  'microservices architecture',
  'telecommunications analytics',
  'graph',
  'linear',
  'bar charts',
  'bubble charts',
  'rust',
  'dart',
  'word clouds',
  'testing and debugging',
  'access control',
  'bias testing tools',
  'supply chain optimization',
  'regression analysis',
  'bert models',
  'graph theory',
  'value proposition design',
  'deepspeed',
  'box plots',
  'database',
  'anomaly detection',
  'relational databases',
  'vbscript',
  'roi analysis',
  'perl',
 

In [None]:
# download the results of updated_reference_set
import pandas as pd
from google.colab import files

# Build one row per category; the Skills cell holds that category's whole list
category_rows = [(category, skills) for category, skills in updated_reference_set.items()]
updated_reference_df = pd.DataFrame(category_rows, columns=['Category', 'Skills'])

# Write the table to disk without the index column
updated_reference_df.to_csv('updated_reference_set.csv', index=False)

# Hand the written file to the browser as a download
files.download('updated_reference_set.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>