<a href="https://colab.research.google.com/github/paranthaman76/ai-job-dataset/blob/main/Data_Analysis_using_Numpy_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Read the AI job dataset from the CSV located at the filesystem root.
df = pd.read_csv('/ai_job_dataset.csv')

# Collect the names of every column whose dtype is numeric.
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)
print("Numerical columns identified:", numerical_cols)

# Pull just those columns out of the DataFrame as one 2-D NumPy array
# (one row per record, one column per numeric field).
numeric_frame = df[numerical_cols]
numerical_data = numeric_frame.values
print("\nShape of the numerical NumPy array:", numerical_data.shape)

print("\n--- Advanced NumPy Numerical Calculations ---")

# Column-wise variance of the numeric array.
# NOTE: np.var defaults to ddof=0 (population variance), while np.cov
# normalises by N-1 by default, so these figures will not exactly equal the
# diagonal of a default np.cov matrix computed on the same data.
variance_values = np.var(numerical_data, axis=0)
print("\n1. Variance of each numerical column:")
for column_name, column_variance in zip(numerical_cols, variance_values):
    print(f"- {column_name}: {column_variance:.2f}")
print("Syntax: np.var(array, axis=0) - 'axis=0' for column-wise variance.")

# Percentiles of each column: the 20th, 40th and 55th.
# (None of these is the median; the median is the 50th percentile.)
percentiles_to_calculate = [20, 40, 55]
print(f"\n2. Percentiles ({percentiles_to_calculate}) of each numerical column:")
for col_idx, col_name in enumerate(numerical_cols):
    print(f"  - {col_name}:")
    # A single call per column evaluates every requested percentile at once;
    # the result has one entry per value in percentiles_to_calculate.
    column_percentiles = np.percentile(numerical_data[:, col_idx], percentiles_to_calculate)
    for q, percentile_val in zip(percentiles_to_calculate, column_percentiles):
        print(f"    - {q}th percentile: {percentile_val:.2f}")
print("Syntax: np.percentile(array, q, axis=None) - 'q' is the percentile(s) to compute.")
print("  Note: Using numerical_data[:, col_idx] to select a single column's data.")


# Covariance matrix across the numeric columns. rowvar=False tells NumPy to
# treat each column (rather than each row) as a variable.
covariance_matrix = np.cov(numerical_data, rowvar=False)
print("\n3. Covariance Matrix of numerical columns:")

# Wrap the raw matrix in a DataFrame labelled with the column names on both
# axes so the printed table is readable.
covariance_df = pd.DataFrame(
    data=covariance_matrix,
    index=numerical_cols,
    columns=numerical_cols,
)
print(covariance_df)

print("Syntax: np.cov(array, rowvar=False) - 'rowvar=False' implies columns are variables.")

# Array manipulation: transposing swaps the two axes, so each row of the
# transposed array holds one column of the original array.
transposed_data = numerical_data.T
print("\n4. Transposed numerical data (first 6 rows/cols of transposed):")
# Cap BOTH axes at 6 so the preview matches its label; slicing only the
# columns would print every row when there are more than 6 numeric columns.
# Slices past the end are clipped, so fewer than 6 rows is handled as well.
print(transposed_data[:6, :6])
print(f"Original shape: {numerical_data.shape}, Transposed shape: {transposed_data.shape}")
print("Syntax: array.T or np.transpose(array) - Swaps axes.")

# Broadcasting: adding a scalar to an array applies the addition to every
# element without an explicit loop.
scalar_to_add = 1000
broadcast_addition = numerical_data + scalar_to_add
print(
    f"\n5. Broadcasting example: Adding {scalar_to_add} to all elements (first 5 rows):"
)
# Show only the leading rows to keep the output short.
print(broadcast_addition[:5])
print("Syntax: array + scalar - Scalar is broadcast to all elements.")

# Universal function (ufunc) demo: np.log takes the natural logarithm of each
# element of the 'salary_usd' column.
salary_values = df['salary_usd'].values
log_salary = np.log(salary_values)
print("\n6. Universal Function (Ufunc) Example: Natural logarithm of 'salary_usd' (first 10 values):")
# Preview only the first ten results.
print(log_salary[:10])
print("Syntax: np.log(array) - Applies natural logarithm element-wise.")

# Frequency table for 'remote_ratio': np.unique with return_counts=True yields
# the sorted distinct values together with how often each one occurs.
remote_ratio_values = df['remote_ratio'].values
unique_remote_ratios, counts_remote_ratios = np.unique(
    remote_ratio_values, return_counts=True
)
print("\n7. Unique values and counts for 'remote_ratio' using np.unique:")
for ratio, occurrences in zip(unique_remote_ratios, counts_remote_ratios):
    print(f"- Remote Ratio {ratio}: {occurrences} occurrences")
print("Syntax: np.unique(array, return_counts=True) - Returns unique elements and their frequencies.")


Numerical columns identified: ['salary_usd', 'remote_ratio', 'years_experience', 'job_description_length', 'benefits_score']

Shape of the numerical NumPy array: (15000, 5)

--- Advanced NumPy Numerical Calculations ---

1. Variance of each numerical column:
- salary_usd: 3631138850.44
- remote_ratio: 1665.57
- years_experience: 30.75
- job_description_length: 331900.29
- benefits_score: 2.10
Syntax: np.var(array, axis=0) - 'axis=0' for column-wise variance.

2. Percentiles ([20, 40, 55]) of each numerical column:
  - salary_usd:
    - 20th percentile: 65025.60
    - 40th percentile: 87595.60
    - 55th percentile: 107011.70
  - remote_ratio:
    - 20th percentile: 0.00
    - 40th percentile: 50.00
    - 55th percentile: 50.00
  - years_experience:
    - 20th percentile: 1.00
    - 40th percentile: 3.00
    - 55th percentile: 5.00
  - job_description_length:
    - 20th percentile: 908.80
    - 40th percentile: 1303.00
    - 55th percentile: 1609.45
  - benefits_score:
    - 20th percen