## Section 1: Import Libraries and Configure Plotting

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import chi2_contingency
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session so the printed report
# stays uncluttered.
# NOTE(review): this also hides deprecation warnings raised by the plotting
# and data libraries; consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed the seaborn-derived style sheets with a
# 'seaborn-v0_8-' prefix; older releases only know the un-prefixed name.
# Use whichever one this installation provides (and keep the default style if
# neither exists) instead of failing on an unknown style name.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("Libraries imported successfully!")

## Section 2: Create Dataset

In [None]:
# Build the seven-employee sample used throughout the notebook.
# Rows are written record-by-record and then pivoted into the column-oriented
# `data` dict, which is what the DataFrame is constructed from.
column_names = ['S.No', 'Gender', 'Education', 'Experience', 'Salary']
employee_rows = [
    (1, 'M', 'UG', 10, 20),
    (2, 'F', 'PG', 5, 40),
    (3, 'M', 'PhD', 12.5, 50),
    (4, 'M', 'PhD', 15, 30),
    (5, 'F', 'UG', 20, 10),
    (6, 'F', 'PG', 30, 40),
    (7, 'F', 'PG', 12, 20),
]

# zip(*rows) transposes the records into one tuple per column.
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*employee_rows))
}

df = pd.DataFrame(data)

# Echo the full table (it is only seven rows) framed by horizontal rules.
rule = "=" * 80
n_observations, n_variables = df.shape

print("Dataset:")
print(rule)
print(df.to_string(index=False))
print(rule)
print(f"\nShape: {n_observations} observations, {n_variables} variables")

## Section 3: Contingency Table Analysis

In [None]:
# Cross-tabulate Gender against Education and report the counts together with
# row-, column- and grand-total percentages.


def _percentage_crosstab(normalize):
    """Return the Gender x Education crosstab as percentages.

    `normalize` is forwarded to `pd.crosstab` ('index', 'columns' or 'all')
    and decides which total each cell is divided by.
    """
    return pd.crosstab(df['Gender'], df['Education'], normalize=normalize) * 100


heavy_rule = "=" * 80
light_rule = "-" * 80

print("\n" + heavy_rule)
print("CONTINGENCY TABLE: Gender vs Education")
print(heavy_rule)

# Counts with the "All" margins, for display only.
contingency_table = pd.crosstab(df['Gender'], df['Education'], margins=True)
print("\nCounts:")
print(contingency_table.to_string())

# Margin-free counts: this is the table later cells run the test on.
cont_table = pd.crosstab(df['Gender'], df['Education'])

row_prop = _percentage_crosstab('index')
col_prop = _percentage_crosstab('columns')
total_prop = _percentage_crosstab('all')

for heading, proportions in (
    ("Row Percentages", row_prop),
    ("Column Percentages", col_prop),
    ("Total Percentages", total_prop),
):
    print("\n" + light_rule)
    print(f"\nProportion Table ({heading}):")
    print(proportions.round(2).to_string())

print("\n" + heavy_rule)

## Section 4: Descriptive Analysis

In [None]:
# Descriptive breakdown of the sample: marginal gender and education counts,
# followed by the education mix inside each gender.

# Map the one-letter codes stored in df['Gender'] to display names. Building
# the label as f"{code}ale" only works for 'M' and yields "Fale" for 'F'.
gender_names = {'M': 'Male', 'F': 'Female'}
education_levels = ['UG', 'PG', 'PhD']
total_employees = len(df)

print("\n" + "="*80)
print("DESCRIPTIVE ANALYSIS: Gender-Education Distribution")
print("="*80)

print("\nGender Distribution:")
print(df['Gender'].value_counts().to_string())
print()
for code, name in gender_names.items():
    n_in_gender = int((df['Gender'] == code).sum())
    print(f"{name}: {n_in_gender} employees ({n_in_gender/total_employees*100:.1f}%)")

print("\n" + "-"*80)
print("\nEducation Distribution:")
print(df['Education'].value_counts().sort_index().to_string())
print()
for edu in education_levels:
    n_with_edu = int((df['Education'] == edu).sum())
    print(f"{edu}: {n_with_edu} employees ({n_with_edu/total_employees*100:.1f}%)")

print("\n" + "-"*80)
print("\nEducation Distribution by Gender:")
print()
for code, name in gender_names.items():
    print(f"{name} Employees:")
    subset = df[df['Gender'] == code]
    for edu in education_levels:
        count = len(subset[subset['Education'] == edu])
        # Guard against a gender with no rows so the percentage is defined.
        pct = count/len(subset)*100 if len(subset) > 0 else 0
        print(f"  {edu}: {count} ({pct:.1f}%)")
    print()

print("="*80)

## Section 5: Chi-Square Test of Independence

In [None]:
# Chi-square test of independence on the Gender x Education counts.
# Produces chi2, p_value, dof, expected_df and `conclusion`, all of which are
# read again by later cells.
print("\n" + "="*80)
print("CHI-SQUARE TEST OF INDEPENDENCE")
print("="*80)

print("\nNull Hypothesis (H₀):")
print("  Gender and Education are INDEPENDENT")
print("  (No association between gender and education level)")
print()
print("Alternative Hypothesis (H₁):")
print("  Gender and Education are NOT independent")
print("  (There IS an association between gender and education level)")
print()
print("Significance Level: α = 0.05")

# Perform chi-square test
chi2, p_value, dof, expected_freq = chi2_contingency(cont_table)

print("\n" + "-"*80)
print("\nTest Results:")
print(f"  Chi-Square Statistic (χ²): {chi2:.4f}")
print(f"  P-value: {p_value:.4f}")
print(f"  Degrees of Freedom: {dof}")
print(f"  Significance Level (α): 0.05")
print()
print("-"*80)
print("\nExpected Frequencies (under independence):")
# Re-attach the row/column labels; chi2_contingency returns a bare ndarray.
expected_df = pd.DataFrame(expected_freq,
                           index=cont_table.index,
                           columns=cont_table.columns)
print(expected_df.round(2).to_string())
print()
print("-"*80)
print("\nComparison: Observed vs Expected")
print()
print("Observed Frequencies:")
print(cont_table.to_string())
print()
print("Difference (Observed - Expected):")
diff_df = cont_table.astype(float) - expected_df
print(diff_df.round(2).to_string())

print("\n" + "-"*80)
print("\nDecision:")
# Every statement of each branch must share the branch's indentation; a
# dedented print in the middle ends the `if` block and is a syntax error.
if p_value < 0.05:
    print(f"  ✓ P-value ({p_value:.4f}) < α (0.05)")
    print("  REJECT the null hypothesis")
    print()
    print("  Conclusion: There IS a statistically significant association")
    print("  between gender and education level at the 5% significance level.")
    conclusion = "SIGNIFICANT ASSOCIATION"
else:
    print(f"  ✗ P-value ({p_value:.4f}) ≥ α (0.05)")
    print("  FAIL TO REJECT the null hypothesis")
    print()
    print("  Conclusion: There is NO statistically significant association")
    print("  between gender and education level at the 5% significance level.")
    conclusion = "NO SIGNIFICANT ASSOCIATION"

print("\n" + "="*80)

## Section 6: Association Measures

In [None]:
# Effect-size measures for the Gender x Education table.
# Produces `cramers_v` and `association_strength`, which later cells read.
print("\n" + "="*80)
print("ASSOCIATION MEASURES")
print("="*80)

n = len(df)
n_rows, n_cols = cont_table.shape
min_dim = min(n_rows - 1, n_cols - 1)
table_shape = f"{n_rows}×{n_cols}"

# Cramér's V
cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else 0

# Cohen's effect-size benchmarks (w = 0.10 / 0.30 / 0.50) translate to
# Cramér's V as w / sqrt(min(r-1, k-1)). The cut-offs 0.07 / 0.21 / 0.35 are
# the values for a minimum dimension of 2; a 2×3 table has a minimum dimension
# of 1, so the cut-offs are derived from the actual table instead.
threshold_scale = np.sqrt(min_dim) if min_dim > 0 else 1.0
small_v, medium_v, large_v = (w / threshold_scale for w in (0.10, 0.30, 0.50))

print(f"\n1. CRAMÉR'S V (Effect Size):")
print(f"   Formula: V = √[χ² / (n × min(k-1, r-1))]")
print(f"   Where: n = sample size, k = number of columns, r = number of rows")
print(f"   ")
print(f"   Calculation: V = √[{chi2:.4f} / ({n} × {min_dim})]")
print(f"   Cramér's V = {cramers_v:.4f}")
print()
print(f"   Interpretation of Cramér's V (for {table_shape} table, min(k-1, r-1) = {min_dim}):")
print(f"   0.00 - {small_v:.2f}: Negligible association")
print(f"   {small_v:.2f} - {medium_v:.2f}: Weak association")
print(f"   {medium_v:.2f} - {large_v:.2f}: Moderate association")
print(f"   > {large_v:.2f}: Strong association")
print()
# The label strings are compared verbatim by later cells; keep them stable.
if cramers_v < small_v:
    association_strength = "NEGLIGIBLE"
elif cramers_v < medium_v:
    association_strength = "WEAK"
elif cramers_v < large_v:
    association_strength = "MODERATE"
else:
    association_strength = "STRONG"

print(f"   ➜ Your data: {association_strength} association (V = {cramers_v:.4f})")

print("\n" + "-"*80)
print(f"\n2. PHI COEFFICIENT (for 2×2 tables):")
if cont_table.shape == (2, 2):
    phi = np.sqrt(chi2 / n)
    print(f"   Phi = {phi:.4f}")
else:
    # Only explain why Phi is skipped when the table really is not 2×2.
    print(f"   Note: Your table is {table_shape}, so Phi is less appropriate")
    print(f"   but provided for reference if considering binary grouping")
    print(f"   (Not directly applicable to {table_shape} table)")

print("\n" + "-"*80)
print(f"\n3. RELATIVE RISK / ODDS RATIOS:")
print(f"   Comparing specific education levels between genders")
print()

# PhD comparison
male_phd = len(df[(df['Gender']=='M') & (df['Education']=='PhD')])
female_phd = len(df[(df['Gender']=='F') & (df['Education']=='PhD')])
male_total = len(df[df['Gender']=='M'])
female_total = len(df[df['Gender']=='F'])

# The rates only need non-empty groups. Requiring a non-zero PhD count in both
# genders would hide the comparison exactly when the gap is largest (one
# gender has no PhD holders at all).
if male_total > 0 and female_total > 0:
    male_phd_rate = male_phd / male_total
    female_phd_rate = female_phd / female_total
    print(f"   PhD Degree:")
    print(f"   - Males with PhD: {male_phd}/{male_total} = {male_phd_rate:.2%}")
    print(f"   - Females with PhD: {female_phd}/{female_total} = {female_phd_rate:.2%}")
    if female_phd_rate > 0:
        relative_risk = male_phd_rate / female_phd_rate
        print(f"   - Relative risk (Male vs Female): {relative_risk:.2f}")
    else:
        print("   - Relative risk (Male vs Female): undefined (no females with a PhD in this sample)")
    print()
else:
    print("   PhD comparison skipped: at least one gender has no observations")
    print()

print("\n" + "="*80)

## Section 7: Visualizations

In [None]:
# Create comprehensive visualizations
fig = plt.figure(figsize=(16, 12))
gs = fig.add_gridspec(3, 3, hspace=0.35, wspace=0.35)

# Plot 1: Stacked Bar Chart (Counts)
ax1 = fig.add_subplot(gs[0, 0])
cont_table.T.plot(kind='bar', ax=ax1, color=['lightblue', 'lightpink'], width=0.7)
ax1.set_title('Education by Gender (Counts)', fontweight='bold', fontsize=11)
ax1.set_xlabel('Education Level', fontweight='bold')
ax1.set_ylabel('Number of Employees', fontweight='bold')
ax1.legend(title='Gender', labels=['Male', 'Female'])
ax1.grid(True, alpha=0.3, axis='y')
plt.setp(ax1.xaxis.get_majorticklabels(), rotation=0)

# Plot 2: Grouped Bar Chart
ax2 = fig.add_subplot(gs[0, 1])
x = np.arange(len(cont_table.columns))
width = 0.35
ax2.bar(x - width/2, cont_table.loc['M'], width, label='Male', color='lightblue', edgecolor='black')
ax2.bar(x + width/2, cont_table.loc['F'], width, label='Female', color='lightpink', edgecolor='black')
ax2.set_xlabel('Education Level', fontweight='bold')
ax2.set_ylabel('Count', fontweight='bold')
ax2.set_title('Gender Distribution by Education Level', fontweight='bold', fontsize=11)
ax2.set_xticks(x)
ax2.set_xticklabels(cont_table.columns)
ax2.legend()
ax2.grid(True, alpha=0.3, axis='y')

# Plot 3: Percentage Stacked Bar Chart
ax3 = fig.add_subplot(gs[0, 2])
row_percentages = cont_table.T.div(cont_table.T.sum(axis=1), axis=0) * 100
row_percentages.plot(kind='bar', stacked=True, ax=ax3, color=['lightblue', 'lightpink'], width=0.7)
ax3.set_title('Education Distribution (% within each level)', fontweight='bold', fontsize=11)
ax3.set_xlabel('Education Level', fontweight='bold')
ax3.set_ylabel('Percentage (%)', fontweight='bold')
ax3.legend(title='Gender', labels=['Male', 'Female'])
ax3.set_ylim(0, 100)
plt.setp(ax3.xaxis.get_majorticklabels(), rotation=0)

# Plot 4: Heatmap of Observed Frequencies
ax4 = fig.add_subplot(gs[1, 0])
sns.heatmap(cont_table, annot=True, fmt='d', cmap='YlOrRd', ax=ax4, cbar_kws={'label': 'Count'})
ax4.set_title('Observed Frequencies Heatmap', fontweight='bold')
ax4.set_ylabel('Gender')

# Plot 5: Heatmap of Expected Frequencies
ax5 = fig.add_subplot(gs[1, 1])
expected_rounded = expected_df.round(1)
sns.heatmap(expected_rounded, annot=True, fmt='.1f', cmap='Blues', ax=ax5, cbar_kws={'label': 'Expected Count'})
ax5.set_title('Expected Frequencies Heatmap', fontweight='bold')
ax5.set_ylabel('Gender')

# Plot 6: Heatmap of Standardized Residuals
ax6 = fig.add_subplot(gs[1, 2])
std_residuals = (cont_table.astype(float) - expected_df) / np.sqrt(expected_df)
sns.heatmap(std_residuals, annot=True, fmt='.2f', cmap='RdBu_r', center=0, ax=ax6, cbar_kws={'label': 'Std. Residual'})
ax6.set_title('Standardized Residuals Heatmap', fontweight='bold')
ax6.set_ylabel('Gender')

# Plot 7: Pie Chart - Male Education
ax7 = fig.add_subplot(gs[2, 0])
male_edu = df[df['Gender']=='M']['Education'].value_counts()
colors_pie = plt.cm.Blues(np.linspace(0.4, 0.8, len(male_edu)))
ax7.pie(male_edu.values, labels=male_edu.index, autopct='%1.1f%%', colors=colors_pie, startangle=90)
ax7.set_title(f'Male Education Distribution (n={len(df[df["Gender"]== "M"])})', fontweight='bold')

# Plot 8: Pie Chart - Female Education
ax8 = fig.add_subplot(gs[2, 1])
female_edu = df[df['Gender']=='F']['Education'].value_counts()
colors_pie = plt.cm.Reds(np.linspace(0.4, 0.8, len(female_edu)))
ax8.pie(female_edu.values, labels=female_edu.index, autopct='%1.1f%%', colors=colors_pie, startangle=90)
ax8.set_title(f'Female Education Distribution (n={len(df[df["Gender"]== "F"])})', fontweight='bold')

# Plot 9: Summary Text
ax9 = fig.add_subplot(gs[2, 2])
ax9.axis('off')
summary_text = f"""CHI-SQUARE TEST RESULTS
{'-'*30}
χ² = {chi2:.4f}
p-value = {p_value:.4f}
df = {dof}
α = 0.05

Cramér's V = {cramers_v:.4f}
({association_strength} Association)

Conclusion:
{conclusion}"""
ax9.text(0.1, 0.9, summary_text, transform=ax9.transAxes, fontsize=10,
        verticalalignment='top', fontfamily='monospace',
        bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.8))

plt.suptitle('Gender-Education Association Analysis', fontsize=16, fontweight='bold', y=0.995)
plt.savefig('gender_education_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print("✓ Visualization saved as 'gender_education_analysis.png'")

## Section 8: Detailed Interpretation of Chi-Square Results

In [None]:
# Narrative read-out of the test result, the effect size, the per-gender
# education mix and the standardized residuals computed by earlier cells.
print("\n" + "="*80)
print("DETAILED INTERPRETATION")
print("="*80)

print("\n1. CHI-SQUARE TEST INTERPRETATION:")
print(f"   Chi-Square Statistic: {chi2:.4f}")
print(f"   P-value: {p_value:.4f}")
print(f"   Significance Level: 0.05")
print()
if p_value < 0.05:
    print(f"   Since p-value ({p_value:.4f}) < 0.05:")
    print("   We REJECT the null hypothesis of independence.")
    print("   There IS a statistically significant association between gender and education.")
else:
    print(f"   Since p-value ({p_value:.4f}) ≥ 0.05:")
    print("   We FAIL TO REJECT the null hypothesis of independence.")
    print("   There is NO statistically significant association between gender and education.")

print("\n" + "-"*80)
print("\n2. EFFECT SIZE (CRAMÉR'S V):")
print(f"   Cramér's V = {cramers_v:.4f} ({association_strength})")
print()
if association_strength == "NEGLIGIBLE":
    print("   Even if significant, the practical association is negligible.")
elif association_strength == "WEAK":
    print("   There is a weak association in the data.")
elif association_strength == "MODERATE":
    print("   There is a moderate association between the variables.")
else:
    print("   There is a strong association between the variables.")

print("\n" + "-"*80)
print("\n3. PATTERN ANALYSIS:")
print()
print("   Distribution by Gender:")
# Look the display name up explicitly: appending "ale" to the code only works
# for 'M' and turns 'F' into "Fale".
gender_names = {'M': 'Male', 'F': 'Female'}
for gender, gender_name in gender_names.items():
    subset = df[df['Gender']==gender]
    print(f"\n   {gender_name} (n={len(subset)}):")
    for edu in ['UG', 'PG', 'PhD']:
        count = len(subset[subset['Education']==edu])
        # Guard against a gender with no rows so the percentage is defined.
        pct = count/len(subset)*100 if len(subset) > 0 else 0
        print(f"     {edu}: {count} ({pct:.0f}%)")

print("\n" + "-"*80)
print("\n4. STANDARDIZED RESIDUALS:")
print("   (Show which cells contribute most to chi-square)")
print()
print(std_residuals.round(2).to_string())
print()
print("   Interpretation:")
print("   • Residuals > 2 or < -2 indicate larger deviations from expectation")
print("   • These cells contribute more to the chi-square statistic")
print()
print("="*80)

## Section 9: Assumptions and Limitations

In [None]:
# Check the expected-frequency assumption of the chi-square test against the
# table produced earlier, then print the caveats that apply to this sample.
print("\n" + "="*80)
print("ASSUMPTIONS AND LIMITATIONS OF CHI-SQUARE TEST")
print("="*80)

print("""
1. ASSUMPTIONS:
   ✓ Independence: Each observation falls into only one cell
   ✓ Random Sampling: Data should be from a random sample
   ✓ Expected Frequency: Generally, expected frequency ≥ 5 in at least 80% of cells

2. EXPECTED FREQUENCY CHECK:""")

min_expected = expected_df.min().min()
cells_below_5 = (expected_df < 5).sum().sum()
total_cells = expected_df.size
pct_below_5 = cells_below_5/total_cells*100

print(f"   Minimum Expected Frequency: {min_expected:.2f}")
print(f"   Cells with Expected Frequency < 5: {cells_below_5}/{total_cells}")
print(f"   Percentage: {pct_below_5:.1f}%")
print()

# Apply the criterion stated above: up to 20% of cells may fall below 5.
# The usual companion condition (no expected count below 1) is enforced too,
# so a table with a near-empty cell is still flagged.
if min_expected >= 5:
    print("   ✓ Assumption met: All expected frequencies ≥ 5")
elif pct_below_5 <= 20 and min_expected >= 1:
    print("   ✓ Assumption met: at least 80% of expected frequencies ≥ 5 (none below 1)")
else:
    print(f"   ⚠ Warning: Some expected frequencies < 5")
    print("   Consider using Fisher's exact test or combining categories")

# Report the actual sample size rather than a hard-coded number so the text
# stays correct if the dataset cell is edited.
print(f"""
3. LIMITATIONS OF THIS ANALYSIS:
   • Small sample size (n={len(df)}) limits statistical power
   • Results may not generalize to larger population
   • With small sample, p-values and effect sizes may be unstable
   • Cannot infer causation (even if association exists)
   • Sampling variability is high with small n

4. RECOMMENDATIONS:
   • Collect larger sample for more robust conclusions
   • Consider stratified sampling by organization/department
   • Track temporal patterns in education-gender relationship
   • Investigate underlying reasons for patterns found
   • Compare with industry benchmarks

5. IMPORTANT NOTES:
   • Statistical significance ≠ Practical significance
   • Effect size should be considered alongside p-value
   • With small samples, lack of significance may reflect low power, not no effect
   • Multiple factors may influence education (recruitment, retention, etc.)
""")

print("="*80)

## Section 10: Final Conclusions

# Render the closing summary from the values computed by the earlier cells
# (chi2, p_value, dof, cramers_v, association_strength, conclusion).
# The template must be an actual string passed to print(); as bare text
# followed by `""".format(...)` the placeholders are never filled in.
male_count = len(df[df['Gender']=='M'])
female_count = len(df[df['Gender']=='F'])


def _education_counts(gender_code):
    """Return 'UG: n; PG: n; PhD: n' for the rows with the given gender code."""
    return '; '.join(
        f"{edu}: {len(df[(df['Gender']==gender_code) & (df['Education']==edu)])}"
        for edu in ['UG', 'PG', 'PhD']
    )


# Failing to reject H0 is not evidence that the variables are independent, so
# the non-significant wording only states that no association was detected.
if p_value < 0.05:
    implication = ("This suggests that gender distribution across education levels "
                   "differs more than would be expected by chance alone.")
else:
    implication = ("The data do not provide sufficient evidence of an association "
                   "between gender and education level in this sample.")

answer = (
    f"Based on statistical analysis: {conclusion.upper()}. The p-value is {p_value:.4f}, "
    f"and the effect size (Cramér's V) is {cramers_v:.4f} ({association_strength.lower()}). "
    + implication
)

final_summary_template = """
## Final Summary

### Key Findings:

1. **Statistical Test Result**: {conclusion}
   - Chi-Square Statistic: {chi2:.4f}
   - P-value: {p_value:.4f}
   - Degrees of Freedom: {dof}

2. **Effect Size**: {association_strength} (Cramér's V = {cramers_v:.4f})

3. **Data Characteristics**:
   - Total Sample: {total} employees
   - Males: {male_count} ({male_pct:.1f}%)
   - Females: {female_count} ({female_pct:.1f}%)
   - Education Levels: UG, PG, PhD

4. **Distribution Pattern**:
   - Males: {male_pattern}
   - Females: {female_pattern}

### Conclusion:

**Is gender influencing education?**

{answer}

### Recommendations for Decision-Making:

1. **If no significant association**:
   - Education levels are similarly distributed across genders
   - No evidence of gender-based education disparities in current data
   - Continue monitoring with larger sample

2. **If significant association**:
   - Investigate underlying causes
   - Review recruitment practices
   - Examine career progression patterns
   - Consider educational advancement programs

3. **Data Quality Considerations**:
   - Verify data accuracy and completeness
   - Ensure consistent education classification
   - Document any missing or unclear cases
   - Plan for data collection with adequate sample size
"""

print(final_summary_template.format(
    conclusion=conclusion,
    chi2=chi2,
    p_value=p_value,
    dof=dof,
    association_strength=association_strength,
    cramers_v=cramers_v,
    total=len(df),
    male_count=male_count,
    # Percentages are rounded to one decimal instead of printing raw floats.
    male_pct=male_count/len(df)*100,
    female_count=female_count,
    female_pct=female_count/len(df)*100,
    male_pattern=_education_counts('M'),
    female_pattern=_education_counts('F'),
    answer=answer,
))