In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_karkidi_jobs(keyword="data science", pages=1):
    headers = {'User-Agent': 'Mozilla/5.0'}
    base_url = "https://www.karkidi.com/Find-Jobs/{page}/all/India?search={query}"
    jobs_list = []

    for page in range(1, pages + 1):
        url = base_url.format(page=page, query=keyword.replace(' ', '%20'))
        print(f"Scraping page: {page}")
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")

        job_blocks = soup.find_all("div", class_="ads-details")
        for job in job_blocks:
            try:
                title = job.find("h4").get_text(strip=True)
                company = job.find("a", href=lambda x: x and "Employer-Profile" in x).get_text(strip=True)
                location = job.find("p").get_text(strip=True)
                experience = job.find("p", class_="emp-exp").get_text(strip=True)
                key_skills_tag = job.find("span", string="Key Skills")
                skills = key_skills_tag.find_next("p").get_text(strip=True) if key_skills_tag else ""
                summary_tag = job.find("span", string="Summary")
                summary = summary_tag.find_next("p").get_text(strip=True) if summary_tag else ""

                jobs_list.append({
                    "Title": title,
                    "Company": company,
                    "Location": location,
                    "Experience": experience,
                    "Summary": summary,
                    "Skills": skills
                })
            except Exception as e:
                print(f"Error parsing job block: {e}")
                continue

        time.sleep(1)  # Be nice to the server

    return pd.DataFrame(jobs_list)

# Example use:
if __name__ == "__main__":
    df_jobs = scrape_karkidi_jobs(keyword="data science", pages=2)
    print(df_jobs.head())
    # Add this line to save the DataFrame to a CSV file
    df_jobs.to_csv("karkidi_jobs.csv", index=False)

Scraping page: 1
Scraping page: 2
                                               Title         Company  \
0          Machine Learning Physical Design Engineer          Google   
1  Staff Software Engineer - Monetization, Poe (R...     Quora, Inc.   
2  Staff Backend Engineer - Bot Creator Ecosystem...     Quora, Inc.   
3  Senior Backend Engineer - Bot Creator Ecosyste...     Quora, Inc.   
4                         Data Scientist Lead - AIML  JPMorgan Chase   

                      Location Experience  \
0  Bengaluru, Karnataka, India   4-6 year   
1                        India  8-10 year   
2                        India  8-10 year   
3                        India   6-8 year   
4  Bengaluru, Karnataka, India   6-8 year   

                                             Summary  \
0  Minimum qualifications:Bachelor's degree in El...   
1  About Quora:Quora’s mission is to grow and sha...   
2  About Quora:Quora’s mission is to grow and sha...   
3  About Quora:Quora’s mission is to g

In [6]:
# Convert skills to lowercase, strip spaces
def clean_skills(skill_str):
    if pd.isnull(skill_str):
        return ""
    # Ensure skill_str is a string before splitting
    if isinstance(skill_str, str):
        skills = [skill.strip().lower() for skill in skill_str.split(',')]
        return ' '.join(skills)  # join with space for vectorization
    else:
        return "" # Handle non-string input gracefully

# Use df_jobs instead of df
df_jobs['cleaned_skills'] = df_jobs['Skills'].apply(clean_skills)

# You can now display or save the updated DataFrame
print(df_jobs.head())

                                               Title         Company  \
0          Machine Learning Physical Design Engineer          Google   
1  Staff Software Engineer - Monetization, Poe (R...     Quora, Inc.   
2  Staff Backend Engineer - Bot Creator Ecosystem...     Quora, Inc.   
3  Senior Backend Engineer - Bot Creator Ecosyste...     Quora, Inc.   
4                         Data Scientist Lead - AIML  JPMorgan Chase   

                      Location Experience  \
0  Bengaluru, Karnataka, India   4-6 year   
1                        India  8-10 year   
2                        India  8-10 year   
3                        India   6-8 year   
4  Bengaluru, Karnataka, India   6-8 year   

                                             Summary  \
0  Minimum qualifications:Bachelor's degree in El...   
1  About Quora:Quora’s mission is to grow and sha...   
2  About Quora:Quora’s mission is to grow and sha...   
3  About Quora:Quora’s mission is to grow and sha...   
4  We have an op

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Transform cleaned skill text into TF-IDF matrix
# Use df_jobs instead of df
X = vectorizer.fit_transform(df_jobs['cleaned_skills'])

# Optional: view feature names (skills)
print(vectorizer.get_feature_names_out())

['aartificial' 'algorithms' 'amazon' 'analysis' 'analytical' 'and'
 'apache' 'api' 'aws' 'azure' 'bigquery' 'cloud' 'communication' 'cycle'
 'data' 'database' 'design' 'effective' 'employee' 'gcp' 'google'
 'graphql' 'hadoop' 'intelligence' 'js' 'k8s' 'kubernetes' 'language'
 'large' 'leadership' 'learning' 'life' 'llms' 'machine' 'mlops' 'models'
 'natural' 'next' 'nlp' 'optimization' 'platform' 'problem' 'processing'
 'programming' 'python' 'react' 'redshift' 'science' 'skill' 'skills'
 'solving' 'sql' 'structuring' 'teamwork' 'techniques' 'tools'
 'typescript']


In [9]:
import joblib

# Save vectorized features and vectorizer for later reuse
joblib.dump(vectorizer, 'skills_vectorizer.pkl')
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out()).to_csv("skills_tfidf_matrix.csv", index=False)


In [11]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Assume `X` is the TF-IDF matrix from step 2
# Try different cluster numbers to find the best one
best_k = 0
best_score = -1

for k in range(2, 11):  # Try cluster sizes from 2 to 10
    kmeans = KMeans(n_clusters=k, random_state=42)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Assume `X` is the TF-IDF matrix from step 2
# Try different cluster numbers to find the best one
best_k = 0
best_score = -1

for k in range(2, 11):  # Try cluster sizes from 2 to 10
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X)
    score = silhouette_score(X, labels)
    print(f"k={k}, silhouette score={score}")

    if score > best_score:
        best_score = score
        best_k = k

print(f"Best k: {best_k} with silhouette score: {best_score}")

k=2, silhouette score=0.2307726709693918
k=3, silhouette score=0.3250572727174708
k=4, silhouette score=0.4347476118666573
k=5, silhouette score=0.5520180165735169
k=6, silhouette score=0.6793479939977749
k=7, silhouette score=0.8025064335380818
k=8, silhouette score=0.9384450192702246
k=9, silhouette score=1.0
k=10, silhouette score=1.0
Best k: 9 with silhouette score: 1.0


  return fit_method(estimator, *args, **kwargs)


In [13]:
# Final KMeans model
kmeans = KMeans(n_clusters=best_k, random_state=42)
# Use df_jobs instead of df to add the cluster column
df_jobs['cluster'] = kmeans.fit_predict(X)

In [15]:
# Check most common skills per cluster
for i in range(best_k):
    print(f"\n--- Cluster {i} ---")
    # Use df_jobs instead of df
    cluster_jobs = df_jobs[df_jobs['cluster'] == i]
    all_skills = ' '.join(cluster_jobs['cleaned_skills'])
    from collections import Counter
    skill_counts = Counter(all_skills.split())
    top_skills = skill_counts.most_common(10)
    for skill, count in top_skills:
        print(f"{skill}: {count}")


--- Cluster 0 ---
language: 4
aartificial: 2
intelligence: 2
large: 2
models: 2
-: 2
llms: 2
machine: 2
learning: 2
techniques: 2

--- Cluster 1 ---
aartificial: 4
intelligence: 4
api: 4
data: 4
science: 4
techniques: 4
design: 4
effective: 4
communication: 4
skills: 4

--- Cluster 2 ---
aartificial: 2
intelligence: 2
aws: 2
azure: 2
google: 2
cloud: 2
platform: 2
(gcp): 2
kubernetes-k8s: 2
large: 2

--- Cluster 3 ---
design: 2
leadership: 2
skill: 2
machine: 2
learning: 2
techniques: 2

--- Cluster 4 ---
google: 4
programming: 4
amazon: 2
redshift: 2
apache: 2
hadoop: 2
data: 2
science: 2
techniques: 2
bigquery: 2

--- Cluster 5 ---
aartificial: 2
intelligence: 2
algorithms: 2
data: 2
structuring: 2
design: 2
machine: 2
learning: 2
techniques: 2

--- Cluster 6 ---
algorithms: 2
employee: 2
life: 2
cycle: 2
kubernetes-k8s: 2
large: 2
language: 2
models: 2
-: 2
llms: 2

--- Cluster 7 ---
aartificial: 2
intelligence: 2
analytical: 2
and: 2
problem: 2
solving: 2
api: 2
data: 2
analysis: 

In [17]:
import joblib

# Save the clustering model
joblib.dump(kmeans, 'karkidi_cluster_model.pkl')

# Save the labeled data
# Use df_jobs instead of df to save the correct DataFrame
df_jobs.to_csv("karkidi_clustered_jobs.csv", index=False)

In [18]:
import joblib

# Load the saved TF-IDF vectorizer and KMeans clustering model
vectorizer = joblib.load('skills_vectorizer.pkl')
kmeans = joblib.load('karkidi_cluster_model.pkl')


In [20]:
# Reuse the cleaning function from earlier
import pandas as pd # Import pandas if not already imported

def clean_skills(skill_str):
    if pd.isnull(skill_str):
        return ""
    # Ensure skill_str is a string before splitting
    if isinstance(skill_str, str):
        skills = [skill.strip().lower() for skill in skill_str.split(',')]
        return ' '.join(skills)
    else:
        return "" # Handle non-string input gracefully


# --- Add the following code to create a sample new_jobs_df ---
# In a real application, you would load your new data here
data = {'skills': ['python, machine learning, data science', 'java, spring, hibernate', 'data analysis, sql, excel']}
new_jobs_df = pd.DataFrame(data)
# ------------------------------------------------------------


new_jobs_df['cleaned_skills'] = new_jobs_df['skills'].apply(clean_skills)

# Convert new job skills into TF-IDF vectors
X_new = vectorizer.transform(new_jobs_df['cleaned_skills'])

# Predict clusters for new jobs
new_jobs_df['cluster'] = kmeans.predict(X_new)

# Optional: Display the new DataFrame with cluster assignments
print(new_jobs_df)

                                   skills  \
0  python, machine learning, data science   
1                 java, spring, hibernate   
2               data analysis, sql, excel   

                         cleaned_skills  cluster  
0  python machine learning data science        8  
1                 java spring hibernate        1  
2               data analysis sql excel        4  


In [21]:
# Save for reference or further processing
new_jobs_df.to_csv("new_classified_jobs.csv", index=False)


In [23]:
!pip install schedule

Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2


In [26]:
user_preferences = {
    "alice@example.com": [0, 2],
    "bob@example.com": [1],
}


In [28]:
# Reuse the cleaning function from earlier
import pandas as pd # Import pandas if not already imported

def clean_skills(skill_str):
    if pd.isnull(skill_str):
        return ""
    # Ensure skill_str is a string before splitting
    if isinstance(skill_str, str):
        skills = [skill.strip().lower() for skill in skill_str.split(',')]
        return ' '.join(skills)
    else:
        return "" # Handle non-string input gracefully


# --- Add the following code to create a sample new_jobs_df ---
# In a real application, you would load your new data here from the scraper
# For demonstration, let's create a sample DataFrame that mimics the structure
# of the scraped data, including 'Title', 'Company', and 'Skills'.
data = {
    'Title': ['Data Scientist', 'Java Developer', 'Data Analyst'],
    'Company': ['Tech Solutions', 'Code Masters', 'Data Insights'],
    'Skills': ['python, machine learning, data science', 'java, spring, hibernate', 'data analysis, sql, excel']
}
new_jobs_df = pd.DataFrame(data)
# ------------------------------------------------------------


new_jobs_df['cleaned_skills'] = new_jobs_df['Skills'].apply(clean_skills)

# Convert new job skills into TF-IDF vectors
X_new = vectorizer.transform(new_jobs_df['cleaned_skills'])

# Predict clusters for new jobs
new_jobs_df['cluster'] = kmeans.predict(X_new)

# Optional: Display the new DataFrame with cluster assignments
print(new_jobs_df)

            Title         Company                                  Skills  \
0  Data Scientist  Tech Solutions  python, machine learning, data science   
1  Java Developer    Code Masters                 java, spring, hibernate   
2    Data Analyst   Data Insights               data analysis, sql, excel   

                         cleaned_skills  cluster  
0  python machine learning data science        8  
1                 java spring hibernate        1  
2               data analysis sql excel        4  


In [33]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.45.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m85.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [35]:
# 🛠 Create the app.py file inside Colab

%%writefile app.py
import streamlit as st
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Load models
vectorizer = joblib.load("skills_vectorizer.pkl")
kmeans = joblib.load("karkidi_cluster_model.pkl")

# Load clustered job data
job_data = pd.read_csv("karkidi_clustered_jobs.csv")

# Title
st.set_page_config(page_title="Karkidi Job Recommender", layout="wide")
st.title("🔍 Job Recommender Based on Your Skills")

# Login Simulation via GitHub (done automatically via Streamlit sharing)
st.markdown("**Logged in via GitHub (Streamlit Sharing handles this automatically).**")

# User skill input
user_input = st.text_input("Enter your skills (comma separated):", "python, machine learning, sql")

if st.button("Find Matching Jobs"):
    def clean_skills(skill_str):
        skills = [skill.strip().lower() for skill in skill_str.split(',')]
        return ' '.join(skills)

    cleaned = clean_skills(user_input)
    user_vector = vectorizer.transform([cleaned])
    user_cluster = kmeans.predict(user_vector)[0]

    st.success(f"📌 Based on your skills, you match **Cluster {user_cluster}**.")

    matching_jobs = job_data[job_data['cluster'] == user_cluster]
    st.subheader(f"📄 Found {len(matching_jobs)} matching jobs:")

    for _, row in matching_jobs.iterrows():
        st.markdown(f"""
        ---
        ### {row['Title']}
        - **Company**: {row['Company']}
        - **Location**: {row['Location']}
        - **Experience**: {row['Experience']}
        - **Skills**: {row['Skills']}
        - **Summary**: {row['Summary']}
        """)



Writing app.py
