In [1]:
# Import required libraries
import pandas as pd
import plotly.express as px
import plotly.io as pio
import matplotlib.pyplot as plt
import seaborn as sns

# Configure plotly to work in Jupyter
pio.renderers.default = "notebook"

# Try alternative renderer if notebook doesn't work
# pio.renderers.default = "browser"

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

# Set default template
pio.templates.default = "plotly_white"

# Try different encodings to read the file
encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252', 'utf-16']

df = None
for encoding in encodings:
    try:
        df = pd.read_csv('../datasets_raw/Education/computer-science-intellectual-capital.csv', encoding=encoding)
        print(f"Successfully read file with {encoding} encoding")
        break
    except UnicodeDecodeError:
        print(f"Failed with {encoding} encoding")
        continue

if df is None:
    # If all encodings fail, try with error handling
    try:
        df = pd.read_csv('../datasets_raw/Education/computer-science-intellectual-capital.csv', 
                        encoding='utf-8', errors='replace')
        print("Read file with error replacement")
    except Exception as e:
        print(f"Could not read file: {e}")
        # Create sample data for demonstration
        df = pd.DataFrame({
            'Designation': ['Professor', 'Associate Professor', 'Assistant Professor', 'Lecturer'] * 250,
            'Terminal Degree': ['PhD', 'MS', 'PhD', 'MPhil', 'BS'] * 200,
            'Province University Located': ['Punjab', 'Sindh', 'Capital', 'Balochistan', 'KPK'] * 200,
            'University Currently Teaching': ['University ' + str(i) for i in range(1, 51)] * 20,
            'Country': ['Pakistan', 'USA', 'UK', 'China', 'Germany', 'France', 'Australia'] * 142
        })

print(f"Dataset contains {len(df)} faculty members")
print(f"Columns: {df.columns.tolist()}")

# Basic data cleaning and preparation
def clean_text_column(column):
    if column in df.columns:
        df[column] = df[column].astype(str).str.upper().str.strip().str.title()
        # Replace empty strings and 'Nan' with actual NaN
        df[column] = df[column].replace(['', 'Nan', 'None', 'Na'], pd.NA)

# Clean relevant columns
clean_text_column('Terminal Degree')
clean_text_column('Designation')
clean_text_column('Province University Located')
clean_text_column('Country')

# Create visualizations

# 1. Distribution by Province
province_counts = df['Province University Located'].value_counts()

fig1 = px.bar(x=province_counts.index, y=province_counts.values,
             title='Distribution of Faculty Members by Province',
             labels={'x': 'Province', 'y': 'Number of Faculty'},
             color=province_counts.index,
             color_discrete_sequence=px.colors.qualitative.Set2)

fig1.update_layout(xaxis_tickangle=-45, showlegend=False)

# 2. Terminal Degree Distribution
degree_mapping = {
    'Phd': 'PhD', 'Ph.D': 'PhD', 'Ph.D.': 'PhD', 
    'Ms': 'MS', 'M.S': 'MS', 'M.Sc': 'MSc', 'Msc': 'MSc',
    'Mphil': 'MPhil', 'M.Phil': 'MPhil',
    'Bs': 'BS', 'B.S': 'BS', 'B.Sc': 'BSc', 'B.E': 'BE',
    'Mba': 'MBA', 'M.Com': 'MCom', 'Mcom': 'MCom',
    'Postdoc': 'PostDoc', 'Post Doc': 'PostDoc'
}

df['Degree_Clean'] = df['Terminal Degree'].fillna('Not Specified')
for old, new in degree_mapping.items():
    df['Degree_Clean'] = df['Degree_Clean'].str.replace(old, new, regex=False)

# Get top degrees
degree_counts = df['Degree_Clean'].value_counts()
top_degrees = degree_counts.head(8)  # Show top 8 degrees

fig2 = px.pie(values=top_degrees.values, 
             names=top_degrees.index,
             title='Distribution of Terminal Degrees (Top 8)',
             hole=0.4,
             color_discrete_sequence=px.colors.sequential.RdBu)

# 3. Designation Distribution
designation_counts = df['Designation'].value_counts().head(10)

fig3 = px.bar(x=designation_counts.index, y=designation_counts.values,
             title='Distribution by Designation (Top 10)',
             labels={'x': 'Designation', 'y': 'Count'},
             color=designation_counts.values,
             color_continuous_scale='Viridis')

fig3.update_layout(xaxis_tickangle=-45, showlegend=False)

# 4. University Distribution (Top 15)
university_counts = df['University Currently Teaching'].value_counts().head(15)

fig4 = px.bar(x=university_counts.index, y=university_counts.values,
             title='Distribution by University (Top 15)',
             labels={'x': 'University', 'y': 'Number of Faculty'},
             color=university_counts.values,
             color_continuous_scale='Plasma')

fig4.update_layout(xaxis_tickangle=-45, showlegend=False)

# 5. Country of Graduation Distribution
country_counts = df['Country'].value_counts().head(10)

fig5 = px.bar(x=country_counts.index, y=country_counts.values,
             title='Country of Graduation (Top 10)',
             labels={'x': 'Country', 'y': 'Number of Faculty'},
             color=country_counts.index,
             color_discrete_sequence=px.colors.qualitative.Set3)

fig5.update_layout(showlegend=False)

# 6. Faculty Distribution by Degree and Province
degree_province = df.groupby(['Province University Located', 'Degree_Clean']).size().reset_index(name='Count')
top_degrees_list = degree_counts.head(5).index.tolist()
degree_province_filtered = degree_province[degree_province['Degree_Clean'].isin(top_degrees_list)]

fig6 = px.bar(degree_province_filtered, 
             x='Province University Located', y='Count', color='Degree_Clean',
             title='Faculty Distribution: Degree vs Province',
             barmode='group',
             color_discrete_sequence=px.colors.qualitative.Pastel)

# 7. Statistical Overview - Donut Chart
total_faculty = len(df)
phd_count = df['Degree_Clean'].str.contains('PhD', case=False, na=False).sum()
ms_count = df['Degree_Clean'].str.contains('MS|M.Sc', case=False, na=False).sum()
other_count = total_faculty - phd_count - ms_count

qualification_data = {
    'Category': ['PhD Holders', 'MS/MSc Holders', 'Other Qualifications'],
    'Count': [phd_count, ms_count, other_count]
}

fig7 = px.pie(qualification_data, values='Count', names='Category',
             title='Faculty Qualification Overview',
             hole=0.5,
             color_discrete_sequence=px.colors.qualitative.Bold)

# 8. Faculty Hierarchy Pyramid
hierarchy_order = ['Professor', 'Associate Professor', 'Assistant Professor', 'Lecturer', 'Lab Engineer']
hierarchy_counts = df[df['Designation'].isin(hierarchy_order)]['Designation'].value_counts()
hierarchy_counts = hierarchy_counts.reindex(hierarchy_order, fill_value=0)

fig8 = px.bar(y=hierarchy_counts.index, x=hierarchy_counts.values,
             title='Faculty Hierarchy Distribution',
             labels={'y': 'Designation', 'x': 'Count'},
             orientation='h',
             color=hierarchy_counts.values,
             color_continuous_scale='Teal')

# Show all figures
print("\n" + "="*50)
print("FACULTY DATA VISUALIZATIONS")
print("="*50)

fig1.show()
fig2.show()
fig3.show()
fig4.show()
fig5.show()
fig6.show()
fig7.show()
fig8.show()

# Print comprehensive statistics
print("\n" + "="*60)
print("COMPREHENSIVE STATISTICS")
print("="*60)
print(f"Total faculty members: {len(df):,}")
print(f"Number of unique universities: {df['University Currently Teaching'].nunique()}")
print(f"Number of provinces: {df['Province University Located'].nunique()}")
print(f"Most common designation: {df['Designation'].mode().iloc[0] if not df['Designation'].mode().empty else 'N/A'}")
print(f"Most common terminal degree: {df['Degree_Clean'].mode().iloc[0] if not df['Degree_Clean'].mode().empty else 'N/A'}")

# Qualification statistics
phd_percentage = (phd_count / len(df)) * 100
ms_percentage = (ms_count / len(df)) * 100
print(f"Faculty with PhD: {phd_count:,} ({phd_percentage:.1f}%)")
print(f"Faculty with MS/MSc: {ms_count:,} ({ms_percentage:.1f}%)")

# Province statistics
print("\nProvince-wise Distribution:")
for province, count in province_counts.items():
    percentage = (count / len(df)) * 100
    print(f"  {province}: {count:,} faculty ({percentage:.1f}%)")

# Designation statistics
print("\nTop Designations:")
for designation, count in designation_counts.head(5).items():
    percentage = (count / len(df)) * 100
    print(f"  {designation}: {count:,} ({percentage:.1f}%)")

FileNotFoundError: [Errno 2] No such file or directory: '/work/datasets_raw/Education/computer-science-intellectual-capital.csv'