## Setup - Install Required Packages

In [None]:
!pip install psycopg2-binary sqlalchemy matplotlib seaborn plotly -q

## Option 1: Use SQLite (No PostgreSQL needed)
Uncomment this section if you want to use SQLite instead of PostgreSQL

In [None]:
# Optional SQLite setup, intentionally left commented out. To use it, uncomment
# every line below (the inner "# #" lines then become ordinary comments).
# NOTE(review): as written this cell only opens ds_jobs.db via SQLAlchemy and
# sqlite3; the uploaded CSV is not written into any table here — confirm
# whether that load step is expected to happen elsewhere.
# import pandas as pd
# import sqlite3
# from sqlalchemy import create_engine

# # Upload your CSV files when prompted
# from google.colab import files
# print("Upload data_jobs_cleaned.csv:")
# uploaded = files.upload()

# # Create SQLite database
# engine = create_engine('sqlite:///ds_jobs.db')
# conn = sqlite3.connect('ds_jobs.db')

## Option 2: Connect to Cloud PostgreSQL
Use this if you have a cloud PostgreSQL database (e.g., ElephantSQL, Supabase, AWS RDS)

In [None]:
import getpass
import os
import warnings
from urllib.parse import quote_plus

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import psycopg2
from sqlalchemy import create_engine

warnings.filterwarnings('ignore')

# Set plot style. Try the 'seaborn-v0_8-darkgrid' name first and fall back to
# the older 'seaborn-darkgrid' name when matplotlib does not know it.
# plt.style.use reports an unknown style with OSError, so only that is caught.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    plt.style.use('seaborn-darkgrid')
sns.set_palette('husl')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Database connection settings.
# The password is never stored in the notebook: it is read from the PGPASSWORD
# environment variable, or prompted for interactively when that is not set.
DB_CONFIG = {
    'dbname': 'ds_jobs',
    'user': 'postgres',
    'password': os.environ.get('PGPASSWORD') or getpass.getpass('PostgreSQL password: '),
    'host': 'localhost',  # Change to your cloud DB host
    'port': 5432
}

# For Colab with ngrok tunnel to local DB:
# DB_CONFIG['host'] = 'your-ngrok-url.ngrok.io'
# DB_CONFIG['port'] = your_port

# User and password are URL-escaped so characters such as '@', ':' or '/'
# cannot corrupt the connection URL.
engine = create_engine(
    'postgresql://{user}:{password}@{host}:{port}/{dbname}'.format(
        user=quote_plus(DB_CONFIG['user']),
        password=quote_plus(DB_CONFIG['password']),
        host=DB_CONFIG['host'],
        port=DB_CONFIG['port'],
        dbname=DB_CONFIG['dbname'],
    )
)
# psycopg2 takes the raw (unescaped) values as keyword arguments.
conn = psycopg2.connect(**DB_CONFIG)
print("✓ Database connection successful!")

## Option 3: Load Data from CSV Files Directly
**RECOMMENDED FOR COLAB** - Upload your data files and analyze them without a database

In [None]:
# Upload files from your computer
from google.colab import files
import io

# Imported here as well so that this option (and the analysis cells that
# follow) work on a fresh kernel even when the PostgreSQL cell was skipped.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.ticker import FuncFormatter

print("Upload the data files from your local project:")
print("1. Navigate to: D:/yogpro/Improve-Your-Hiring-Chance-For-Data-Science-Related-Roles-main/data/")
print("2. Upload: data_jobs_cleaned.csv")
uploaded = files.upload()

# files.upload() returns a {filename: file bytes} mapping. Read from it rather
# than from a fixed path: a re-upload may be saved under a different name
# (e.g. "data_jobs_cleaned (1).csv"), and a cancelled upload returns nothing.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select data_jobs_cleaned.csv")

expected_name = 'data_jobs_cleaned.csv'
csv_name = expected_name if expected_name in uploaded else next(iter(uploaded))

# Load the data
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))
print(f"✓ Loaded {len(df)} job records")
df.head()

## Analyze Data Directly from DataFrame

In [None]:
# If using CSV (Option 3), process skills data
import ast
import json


def _parse_skill_map(raw):
    """Decode one ``job_type_skills`` cell into a ``{category: skills}`` dict.

    The text is first read as a Python literal, which copes with apostrophes
    inside values. If that fails, it falls back to swapping single quotes for
    double quotes and decoding as JSON.

    Returns:
        The decoded dict, or None when ``raw`` is not a string, cannot be
        decoded either way, or does not decode to a dict.
    """
    if not isinstance(raw, str):
        return None
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        try:
            parsed = json.loads(raw.replace("'", '"'))
        except json.JSONDecodeError:
            return None
    return parsed if isinstance(parsed, dict) else None


# Quote-swapped copy of the raw column, kept for any code that reads it.
df['skill_type_json'] = df['job_type_skills'].str.replace("'", '"')

# Extract all skills as one record per (skill, category) occurrence
all_skills = []
unparsed_rows = 0
for raw_skills in df['job_type_skills'].dropna():
    skills_dict = _parse_skill_map(raw_skills)
    if skills_dict is None:
        unparsed_rows += 1
        continue
    for category, skills in skills_dict.items():
        if isinstance(skills, str):
            # A bare string is one skill, not a sequence of characters.
            skills = [skills]
        elif not isinstance(skills, (list, tuple)):
            # None or any other non-sequence value carries no skills.
            continue
        all_skills.extend({'skill': skill, 'category': category} for skill in skills)

# Explicit columns keep the frame usable downstream even when nothing was extracted.
skills_df = pd.DataFrame(all_skills, columns=['skill', 'category'])
print(f"✓ Extracted {len(skills_df)} skill entries")
if unparsed_rows:
    print(f"⚠ Skipped {unparsed_rows} rows whose skills could not be parsed")

## Q1. Top 20 Most In-Demand Skills

In [None]:
# Tally how often each skill occurs and keep the 20 most frequent ones
skill_frequencies = skills_df['skill'].value_counts()
top20_skills = (
    skill_frequencies
    .head(20)
    .rename_axis('skill')
    .reset_index(name='count')
)

# Horizontal bar chart, most frequent skill at the top
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top20_skills['skill'], top20_skills['count'], color='firebrick')
ax.set_title('Top 20 Skills', fontsize=14, fontweight='bold')
ax.set_xlabel('Frequency', fontsize=12)
ax.set_ylabel('Skill', fontsize=12)
ax.invert_yaxis()

# Write each bar's value just past its right-hand end
for bar_position, frequency in enumerate(top20_skills['count']):
    ax.text(frequency + 0.5, bar_position, str(frequency), va='center', fontsize=9)

plt.tight_layout()
plt.show()

top20_skills

## Q2. Median Salary by Job Title

In [None]:
# Median yearly salary per job title, highest first.
# A title whose postings carry no salary at all gets a NaN median; such rows
# are dropped so the chart does not show an empty bar labelled "$nan".
median_salaries = (
    df.groupby('job_title_short')['salary_year_avg']
    .median()
    .rename('med_salary')
    .dropna()
    .sort_values(ascending=False)
    .reset_index()
)

# Visualize
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(median_salaries['job_title_short'], median_salaries['med_salary'], color='firebrick')
ax.set_xlabel('Median Salary (USD)', fontsize=12)
ax.set_ylabel('Job Title', fontsize=12)
ax.set_title('Median Salary for Each Role', fontsize=14, fontweight='bold')
ax.invert_yaxis()

# Show axis ticks in thousands of dollars, e.g. 120000 -> $120K
formatter = FuncFormatter(lambda x, pos: f'${x/1000:.0f}K')
ax.xaxis.set_major_formatter(formatter)

# Write the exact median just past the end of each bar
for i, salary in enumerate(median_salaries['med_salary']):
    ax.text(salary + 1000, i, f'${salary:,.0f}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

median_salaries

## Q3. Skills by Category

In [None]:
# Top skills by category: count each (category, skill) pair, then keep the
# 10 most frequent skills within every category.
top_by_cat = skills_df.groupby(['category', 'skill']).size().reset_index(name='count')
top_by_cat = top_by_cat.sort_values(['category', 'count'], ascending=[True, False])
top10_by_cat = top_by_cat.groupby('category').head(10)

# Visualize: one panel per category in a two-column grid
categories = top10_by_cat['category'].unique()
n_categories = len(categories)

if n_categories == 0:
    # plt.subplots cannot build a grid with zero rows.
    print("No skill categories to plot.")
else:
    n_rows = (n_categories + 1) // 2
    fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(14, 5 * n_rows))
    axes = axes.flatten()

    # tab10 only has cm.tab10.N distinct colours and clips larger indices to
    # the last one, so wrap the index to cycle through the palette instead.
    colors = cm.tab10([i % cm.tab10.N for i in range(n_categories)])

    for idx, category in enumerate(categories):
        # Ascending order puts the most frequent skill at the top of the panel
        cat_data = top10_by_cat[top10_by_cat['category'] == category].sort_values('count', ascending=True)

        axes[idx].barh(cat_data['skill'], cat_data['count'], color=colors[idx])
        axes[idx].set_xlabel('Count', fontsize=10)
        axes[idx].set_title(f'{category}', fontsize=11, fontweight='bold')

        for i, count in enumerate(cat_data['count']):
            axes[idx].text(count + 0.2, i, str(count), va='center', fontsize=8)

    # Hide the unused trailing panel when the number of categories is odd
    for idx in range(n_categories, len(axes)):
        axes[idx].axis('off')

    plt.suptitle('Top 10 Skills by Category', fontsize=16, fontweight='bold', y=1.0)
    plt.tight_layout()
    plt.show()

## Summary Statistics

In [None]:
# Plain-text recap of the dataset: size, skill coverage, salary spread and
# the five most frequent skills from the Q1 table.
divider = "=" * 50
salaries = df['salary_year_avg']

print(divider)
print("DATA SCIENCE JOBS ANALYSIS SUMMARY")
print(divider)

print(f"\nTotal Job Postings: {len(df)}")
print(f"Unique Skills: {skills_df['skill'].nunique()}")
print(f"Skill Categories: {skills_df['category'].nunique()}")

print("\nSalary Statistics:")
salary_stats = (
    ("Average", salaries.mean),
    ("Median", salaries.median),
    ("Min", salaries.min),
    ("Max", salaries.max),
)
for label, compute in salary_stats:
    print(f"  {label}: ${compute():,.0f}")

print("\nTop 5 Most Demanded Skills:")
top_five = top20_skills.head(5)
for rank, (skill, count) in enumerate(zip(top_five['skill'], top_five['count']), start=1):
    print(f"  {rank}. {skill}: {count} jobs")

print("\n" + divider)