<a href="https://colab.research.google.com/github/lucifer-1947/jio-data/blob/main/assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 1: Data Loading and Preprocessing




In [None]:
#1. Loading the dataset from the given CSV file into a Pandas Data Frame.

import pandas as pd  # pandas supplies the DataFrame type and the CSV reader

# The dataset is shared through Google Drive. The ".../file/d/<id>/view" sharing
# link serves an HTML preview page rather than the CSV bytes, so pd.read_csv
# cannot parse it. Build the direct-download URL from the file id instead.
DATASET_FILE_ID = '1MFj8jTbSKOLfmfkIFzKjvI0wz5cCHRd4'
DATASET_URL = f'https://drive.google.com/uc?export=download&id={DATASET_FILE_ID}'

dataFrame = pd.read_csv(DATASET_URL)


In [None]:
#2. Performing basic data preprocessing steps

#region handling missing values
# Report how many values are missing in each column before cleaning.
print(dataFrame.isnull().sum())

# If only a few rows have missing content we can drop them.
# dropna() returns a NEW frame, so the result must be assigned back;
# calling it without assignment leaves dataFrame unchanged.
dataFrame = dataFrame.dropna()

# If a large share of the content is missing, fill it instead, choosing a
# suitable strategy (column average, most frequent value, ...), e.g.:
#dataFrame = dataFrame.fillna(0)

#endregion

#region handling duplicates
print("\nduplicated :",dataFrame.duplicated().sum())

# Duplicated rows are not wanted, so drop them. Like dropna(),
# drop_duplicates() returns a new frame and has to be assigned back.
# NOTE(review): if the file has no unique id column, identical rows may be
# distinct individuals - confirm that de-duplication is really intended.
dataFrame = dataFrame.drop_duplicates()

#endregion


In [None]:
#3. Displaying the first few rows of the cleaned dataset.
# head() limits the output to the first five rows instead of dumping the whole frame.
print(dataFrame.head())

# Task 2: Exploratory Data Analysis


In [None]:
# Data visualization with matplotlib: one bar chart per categorical column.
import matplotlib.pyplot as plt

# Frequency of every distinct value in each column of interest.
jobCount = dataFrame['job'].value_counts()
educationCount = dataFrame['education'].value_counts()
genderCount = dataFrame['gender'].value_counts()
englishCount = dataFrame['English speaker'].value_counts()

# 2x2 grid of subplots - one panel per column, each showing value counts as bars.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# (target axes, counts to draw, panel title, x-axis label)
bar_panels = [
    (axes[0, 0], jobCount, 'Job Roles', 'Job Role'),
    (axes[0, 1], educationCount, 'Education Levels', 'Education Level'),
    (axes[1, 0], genderCount, 'Gender', 'Gender'),
    (axes[1, 1], englishCount, 'English Speaking Status', 'English Speaker'),
]

for panel, counts, panel_title, x_label in bar_panels:
    panel.bar(counts.index, counts.values)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Count')

# Keep titles and labels of neighbouring panels from overlapping.
plt.tight_layout()

# Render the figure.
plt.show()


In [None]:
# 2x2 grid of subplots - one pie chart per categorical column, each showing
# the percentage share of every value in that column.
fig, axs = plt.subplots(2, 2, figsize=(12, 8))

# Convert the value counts from the bar-chart cell into real percentages
# (share of the column total). Multiplying the raw counts by 100 does not
# produce percentages.
jobPercentages = jobCount / jobCount.sum() * 100
educationPercentages = educationCount / educationCount.sum() * 100
genderPercentages = genderCount / genderCount.sum() * 100
englishPercentages = englishCount / englishCount.sum() * 100

# (target axes, percentages to draw, panel title)
pie_panels = [
    (axs[0, 0], jobPercentages, 'Job Roles'),
    (axs[0, 1], educationPercentages, 'Education Levels'),
    (axs[1, 0], genderPercentages, 'Genders'),
    (axs[1, 1], englishPercentages, 'English-Speaking Groups'),
]

for panel, percentages, panel_title in pie_panels:
    # autopct prints each wedge's share of the whole on the chart.
    panel.pie(percentages, labels=percentages.index, autopct='%1.1f%%')
    panel.set_title(panel_title)

# Adjust layout
plt.tight_layout()

# Show the plots
plt.show()

# Task 3: Gender and English speaker Analysis

In [None]:
#1. Calculate the average education level for each gender group (Male, Female, Others).

# Grouping the data by 'gender'
genderGroups = dataFrame.groupby('gender')

# The actual average education level per gender. to_numeric guards against the
# column having been read as text; unparsable entries become NaN and are
# ignored by mean().
mean_education = (
    pd.to_numeric(dataFrame['education'], errors='coerce')
    .groupby(dataFrame['gender'])
    .mean()
)
print(mean_education)

# Share of each education level within every gender group. fill_value=0 keeps
# gender/level combinations that never occur at 0 instead of NaN.
education_counts = genderGroups['education'].value_counts().unstack(fill_value=0)
average_education = education_counts.div(education_counts.sum(axis=1), axis=0)

# Plot the results. The shares are normalised BEFORE plotting and scaled to
# 0-100 so the 'Percentage' axis label matches the plotted values.
(average_education * 100).plot(kind='bar', stacked=True, figsize=(10, 6))
plt.title('Education Level Distribution by Gender')
plt.xlabel('Gender')
plt.ylabel('Percentage')
plt.legend(title='Education Level')
plt.show()

#Results (fractions between 0 and 1; every row sums to 1)
print(average_education)


In [None]:
#2. Comparing the distribution of job roles among different gender groups using a stacked bar chart.

# Rows are genders, columns are job roles, cells hold the number of
# individuals; combinations that never occur are filled with 0.
gender_job_groups = (
    dataFrame
    .groupby(['gender', 'job'])
    .size()
    .unstack(fill_value=0)
)

# One bar per gender, segmented by job role.
job_axes = gender_job_groups.plot.bar(stacked=True, figsize=(10, 6))
job_axes.set_title('Distribution of Job Roles Among Different Genders')
job_axes.set_xlabel('Gender')
job_axes.set_ylabel('Count')
job_axes.legend(title='Job Role')

# Render the figure.
plt.show()

In [None]:
#3. Create a histogram to show the distribution of education levels among English speaking and non-English speaking individuals.


# Splitting the data into two groups: English-speaking and non-English speaking
english_speaking = dataFrame[dataFrame['English speaker'] == 'yes']
non_english_speaking = dataFrame[dataFrame['English speaker'] == 'no']

# Both histograms must use the SAME bin edges to be comparable. Left alone,
# each plt.hist call derives its edges from its own subset's min/max, so the
# overlaid bars would cover different intervals. Pin the range to the whole
# column instead.
education_range = (dataFrame['education'].min(), dataFrame['education'].max())

# Create histograms for education levels in each group
plt.figure(figsize=(10, 6))

plt.hist(english_speaking['education'], bins=3, range=education_range,
         alpha=0.5, label='English Speaking')
plt.hist(non_english_speaking['education'], bins=3, range=education_range,
         alpha=0.5, label='Non-English Speaking')

plt.title('Distribution of Education Levels Among English Speaking and Non-English Speaking Individuals')
plt.xlabel('Education Level')
plt.ylabel('Count')
plt.legend()

# Show the plot
plt.show()

# Task 4: Predictive Modeling

In [None]:
#1. Encode categorical variables (job, education, gender, English speaker) using appropriate techniques (e.g., one-hot encoding).
from sklearn.preprocessing import LabelEncoder

# Predictor columns to expand into one indicator column per distinct value.
# 'gender' is not listed, so it passes through get_dummies unchanged.
categorical_features = ['job', 'education', 'English speaker']

# One-hot encode the listed columns; every other column is kept as-is.
df_encoded = pd.get_dummies(data=dataFrame, columns=categorical_features)

print(df_encoded)

In [None]:
# 2. Split the dataset into training and testing sets (80% training, 20% testing)

from sklearn.model_selection import train_test_split

# Features: every one-hot column generated from job, education and
# English-speaking status. Selecting by prefix instead of a hard-coded list of
# column names keeps this cell working when the set of category levels in the
# data changes (a missing level would otherwise raise a KeyError).
feature_prefixes = ('job_', 'education_', 'English speaker_')
feature_columns = [
    column for column in df_encoded.columns
    if str(column).startswith(feature_prefixes)
]
X = df_encoded[feature_columns]

# Target: the gender column.
y = dataFrame['gender']

# Splitting the data into training (80%) and test (20%) sets; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
#3. Build a classification model to predict the gender of individuals based on job role,education level, and English-speaking status.

from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed, trained on the training split.
# fit() returns the fitted estimator itself, so it can be chained.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted gender labels for the held-out test split.
y_pred = clf.predict(X_test)



In [None]:
#4. Evaluate the model's performance using accuracy, precision, recall, and F1-score metrics.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

# Precision, recall and F1 are computed with average='weighted'.
weighted = {'average': 'weighted'}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

# Report every metric rounded to two decimals.
metric_report = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
)
for metric_name, metric_value in metric_report:
    print(f"{metric_name}: {metric_value:.2f}")

In [None]:
#5. Use feature importance techniques (e.g., feature importance scores, permutation feature importance) to identify the most influential features for gender prediction.

from sklearn.inspection import permutation_importance

# Permutation importance of the fitted model on the test split,
# with 30 repeats and a fixed seed.
perm_importance = permutation_importance(clf, X_test, y_test, n_repeats=30, random_state=42)

# Pair each feature name with its mean importance across the repeats.
feature_names = X_test.columns
importances = perm_importance.importances_mean

# Table of features ranked from most to least influential.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranking.
print("Top Influential Features:")
print(importance_df)

In [None]:
#6. Visualizing the ROC curve and AUC score for the gender prediction model.

# predict_proba returns one probability column per class, ordered like
# clf.classes_. The positive label handed to roc_curve must therefore be the
# actual class label of the chosen column; a hard-coded pos_label=1 never
# matches string gender labels.
class_labels = clf.classes_
class_probs = clf.predict_proba(X_test)

# With two classes a single curve (positive class = clf.classes_[1]) is enough.
# With more classes draw a one-vs-rest curve for every class.
if len(class_labels) == 2:
    curve_columns = [1]
else:
    curve_columns = range(len(class_labels))

# Visualize ROC curve(s)
plt.figure(figsize=(8, 6))
for column in curve_columns:
    positive_class = class_labels[column]

    # Calculate the ROC curve, treating `positive_class` as the positive label.
    fpr, tpr, thresholds = roc_curve(y_test, class_probs[:, column], pos_label=positive_class)

    # Calculate AUC (Area Under the ROC Curve)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, lw=2, label=f'ROC curve for {positive_class} (AUC = {roc_auc:.2f})')

# Diagonal reference line: the performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
