<a href="https://colab.research.google.com/github/mariarodrius/AI-with-Copilot/blob/main/Data_Developer_Salary_in_2024%F0%9F%92%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries;
# each entry is downloaded and unpacked into KAGGLE_INPUT_PATH/<directory>.
# NOTE(review): the URL is a signed link carrying X-Goog-Date=20240618T081746Z
# and X-Goog-Expires=259200 — presumably it stops working after that window
# and has to be regenerated; confirm before relying on this cell.
DATA_SOURCE_MAPPING = 'data-eng-salary-2024:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5183463%2F8653190%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240618%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240618T081746Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Db8b269e77b5d2147733c890a4438939c692d51fe4d60bf4a730dc8cf798fa253033a4ebc5c0206811af208c5cb84bf21e6273c1f49c95314b37bd245137eb5f9a0408268c50e3dc75c3f8e04a8f7c419be121be3a2afbadcd48e38561f4929f5404b6283f6ddcee17219128355ebb62e0b74b12be211fa770f9ff3bed4f09a8925deb1b5e6c81568031f2bd30a4615781d4d7730e8736edeb96677d7515a1fa15202ae0cdf973698d62ae590176a1e6ec8ee49f91e47089d258301226215dd06854bcd99758c972b7d4b1e5fa8257307ed8b9c8198755d95ec4e912805139c21266dd21f849320d85bd9e76f2f018a29f75ed887af14482d31dcf9d62632bca9'

# Local directories that stand in for Kaggle's input and working folders.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced by any code visible in this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of
# DATA_SOURCE_MAPPING into a temporary file and unpack it (zip or tar) into
# KAGGLE_INPUT_PATH/<directory>. A failed entry is reported and skipped so the
# remaining entries are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray colon in the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be absent (value None) or
            # malformed; fall back to a plain byte counter instead of
            # crashing while drawing the progress bar.
            total_length = fileres.headers['content-length']
            try:
                total_bytes = int(total_length)
            except (TypeError, ValueError):
                total_bytes = 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    # Clamp so a body longer than advertised cannot overflow
                    # the 50-character bar.
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk and rewind: the tar branch reopens
            # the file by name, the zip branch reads from this handle.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name so the imported `tarfile` module is not
                # rebound to the opened archive object.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path} ({e})')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path} ({e})')
        continue

print('Data source import complete.')


بسم الله

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud
import plotly.graph_objects as go

import warnings
# Suppress every warning for the rest of the session; this also hides
# deprecation notices emitted by the plotting libraries.
warnings.filterwarnings('ignore')

In [None]:
# Load the salary CSV from the data-eng-salary-2024 folder under /kaggle/input.
df = pd.read_csv("/kaggle/input/data-eng-salary-2024/Dataset salary 2024.csv")
# Bare expression: shown as the cell output when run in a notebook.
df

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# First five rows (pandas' default for head()).
df.head()

In [None]:
# Column labels as a plain Python list.
df.columns.to_list()

In [None]:
# Column names, non-null counts, dtypes and memory usage of the frame.
print("DataFrame Info:")
df.info()

In [None]:
print("\nSummary Statistics:")
# Transposed so that each dataset column becomes one row of statistics.
df.describe().T

In [None]:
# Bar chart of the transposed summary: one group per dataset column,
# one bar per statistic (count, mean, std, ...).
df.describe().T.plot(kind='bar')

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Keep the first occurrence of each duplicated row; the original index
# labels are retained (no reset_index).
df = df.drop_duplicates()


In [None]:
import plotly.express as px

# Columns to profile, one interactive histogram each.
columns = [
    'work_year',
    'experience_level',
    'employment_type',
    'job_title',
    'salary',
    'salary_currency',
    'salary_in_usd',
    'employee_residence',
    'remote_ratio',
    'company_location',
    'company_size',
]

# Build and immediately display the histogram of every listed column.
for column in columns:
    fig = px.histogram(df, x=column)
    fig.show()

In [None]:
# Select only the numeric columns and compute their pairwise correlations.
numeric_cols = df.select_dtypes(include=np.number).columns
correlations = df[numeric_cols].corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(12, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:

# Exploratory plots for every dataset column.
#
# All salary comparisons use `salary_in_usd` rather than `salary`: the data
# has a `salary_currency` column, so raw `salary` values are expressed in
# different currencies and cannot be compared on one axis.
# NOTE(review): `salary_in_usd` is taken, from its name, to be the
# USD-converted value of `salary`; confirm against the dataset description.


def _finish_plot(title, xlabel, ylabel, rotate_xticks=False):
    """Title and label the current matplotlib figure, then display it.

    Args:
        title: Text placed above the axes.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        rotate_xticks: When true, rotate the x tick labels by 45 degrees.
    """
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if rotate_xticks:
        plt.xticks(rotation=45)
    plt.show()


# 1. Textual overview
print("Summary Statistics:")
print(df.describe())
print()

print("Unique Values in Categorical Columns:")
for column in df.select_dtypes(include='object').columns:
    print(column + ":", df[column].unique())
    print()

# 2. Work Experience Analysis
# NOTE(review): `work_year` is labelled "Work Years" here; confirm whether it
# holds years of experience or the calendar year of the record.
plt.figure(figsize=(10, 6))
sns.histplot(df['work_year'], bins=20, kde=True, color='skyblue')
_finish_plot('Distribution of Work Years', 'Work Years', 'Frequency')

plt.figure(figsize=(10, 6))
sns.scatterplot(x='work_year', y='salary_in_usd', data=df, alpha=0.5)
_finish_plot('Relationship between Work Years and Salary', 'Work Years', 'Salary (USD)')

# 3. Experience Level Analysis
plt.figure(figsize=(8, 5))
sns.countplot(x='experience_level', data=df, palette='muted')
_finish_plot('Distribution of Experience Levels', 'Experience Level', 'Count')

plt.figure(figsize=(10, 6))
sns.boxplot(x='experience_level', y='salary_in_usd', data=df, palette='muted')
_finish_plot('Salary Distribution by Experience Level', 'Experience Level', 'Salary (USD)')

# 4. Employment Type Analysis
plt.figure(figsize=(8, 5))
sns.countplot(x='employment_type', data=df, palette='pastel')
_finish_plot('Distribution of Employment Types', 'Employment Type', 'Count')

plt.figure(figsize=(10, 6))
sns.barplot(x='employment_type', y='salary_in_usd', data=df, palette='pastel')
_finish_plot('Average Salary by Employment Type', 'Employment Type', 'Average Salary (USD)')

# 5. Job Title Analysis
plt.figure(figsize=(10, 6))
top_job_titles = df['job_title'].value_counts().head(10)
sns.barplot(x=top_job_titles.values, y=top_job_titles.index, palette='Set2')
_finish_plot('Top 10 Job Titles', 'Count', 'Job Title')

plt.figure(figsize=(10, 6))
sns.boxplot(x='job_title', y='salary_in_usd', data=df, palette='Set2')
_finish_plot('Salary Distribution by Job Title', 'Job Title', 'Salary (USD)', rotate_xticks=True)

# 6. Salary Analysis
plt.figure(figsize=(10, 6))
sns.histplot(df['salary_in_usd'], bins=30, kde=True, color='skyblue')
_finish_plot('Salary Distribution among Data Developers', 'Salary (USD)', 'Frequency')

# 7. Salary Currency Analysis
plt.figure(figsize=(8, 5))
sns.countplot(x='salary_currency', data=df, palette='Set3')
_finish_plot('Distribution of Salary Currencies', 'Salary Currency', 'Count')

# 8. Employee Residence Analysis
plt.figure(figsize=(10, 6))
top_employee_residences = df['employee_residence'].value_counts().head(10)
sns.barplot(x=top_employee_residences.values, y=top_employee_residences.index, palette='Blues_r')
_finish_plot('Top 10 Employee Residences', 'Count', 'Residence')

plt.figure(figsize=(10, 6))
sns.boxplot(x='employee_residence', y='salary_in_usd', data=df, palette='Blues_r')
_finish_plot('Salary Distribution by Employee Residence', 'Employee Residence', 'Salary (USD)', rotate_xticks=True)

# 9. Remote Work Ratio Analysis
plt.figure(figsize=(10, 6))
sns.histplot(df['remote_ratio'], bins=20, kde=True, color='lightgreen')
_finish_plot('Distribution of Remote Work Ratio', 'Remote Work Ratio', 'Frequency')

plt.figure(figsize=(10, 6))
sns.scatterplot(x='remote_ratio', y='salary_in_usd', data=df, alpha=0.5, color='lightgreen')
_finish_plot('Relationship between Remote Work Ratio and Salary', 'Remote Work Ratio', 'Salary (USD)')

# 10. Company Location Analysis
plt.figure(figsize=(10, 6))
top_company_locations = df['company_location'].value_counts().head(10)
sns.barplot(x=top_company_locations.values, y=top_company_locations.index, palette='Oranges_r')
_finish_plot('Top 10 Company Locations', 'Count', 'Company Location')

plt.figure(figsize=(10, 6))
sns.boxplot(x='company_location', y='salary_in_usd', data=df, palette='Oranges_r')
_finish_plot('Salary Distribution by Company Location', 'Company Location', 'Salary (USD)', rotate_xticks=True)

# 11. Company Size Analysis
plt.figure(figsize=(8, 5))
sns.countplot(x='company_size', data=df, palette='muted')
_finish_plot('Distribution of Company Sizes', 'Company Size', 'Count')

plt.figure(figsize=(10, 6))
sns.boxplot(x='company_size', y='salary_in_usd', data=df, palette='muted')
_finish_plot('Salary Distribution by Company Size', 'Company Size', 'Salary (USD)')

In [None]:
# Grid of 20-bin histograms, one subplot per numeric column, drawn in green.
df.hist(bins = 20, figsize = (20,20), color = 'g')
plt.show()

In [None]:
# Pairwise scatter plots of the numeric columns, with each column's own
# distribution on the diagonal.
sns.pairplot(df)

In [None]:
from wordcloud import WordCloud

# Word cloud of the job titles: frequent words in `job_title` are drawn larger.
# `reviews_text` keeps its original name for any code that may still refer to
# it, but it holds the space-joined, non-missing job titles.
reviews_text = ' '.join(df['job_title'].dropna())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(reviews_text)
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
# The text comes from job titles, so the title names them (it said "Reviews").
plt.title('Word Cloud of Job Titles')
plt.axis('off')
# Render explicitly, as every other matplotlib cell in this notebook does.
plt.show()