In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

def load_and_clean_data(file_path):
    """
    Load the sociolinguistic CSV file and perform initial data cleaning

    Cleaning steps:
    - column names are stripped of surrounding whitespace and lower-cased
    - string values in the categorical columns are stripped of surrounding
      whitespace, so that e.g. "Math" and " Math" end up as one category
    - the categorical columns are converted to pandas Categorical dtype

    NOTE: values that differ only by letter case (e.g. "Math" vs "math") are
    NOT merged here; they remain distinct categories.

    Parameters:
    file_path (str): Path to the CSV file

    Returns:
    pandas.DataFrame: The cleaned dataframe

    Raises:
    KeyError: If any of the expected categorical columns is missing
    """
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Clean column names
    df.columns = df.columns.str.strip().str.lower()

    categorical_cols = ['register type', 'variant type', 'academic field', 'gender']

    # Report every missing column at once instead of failing on the first one
    missing = [col for col in categorical_cols if col not in df.columns]
    if missing:
        raise KeyError(
            f"Missing required column(s) after header normalisation: {missing}"
        )

    # Convert categorical columns to categorical type
    for col in categorical_cols:
        # Strip only real strings; NaN and non-string values pass through as-is
        cleaned = df[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )
        df[col] = pd.Categorical(cleaned)

    return df

def analyze_register_distribution(df):
    """
    Summarise how register types are distributed overall, per academic
    field and per gender.

    Parameters:
    df (pandas.DataFrame): Dataframe with 'register type', 'academic field'
        and 'gender' columns

    Returns:
    dict: 'overall_distribution' (value counts of register type),
        'by_field' (academic field x register type counts) and
        'by_gender' (gender x register type counts)
    """
    registers = df['register type']

    results = {}
    results['overall_distribution'] = registers.value_counts()
    # Rows are the social factor, columns are the register types
    results['by_field'] = pd.crosstab(df['academic field'], registers)
    results['by_gender'] = pd.crosstab(df['gender'], registers)
    return results

def analyze_variant_patterns(df):
    """
    Summarise how variant types are distributed overall, per register
    and per academic field.

    Parameters:
    df (pandas.DataFrame): Dataframe with 'variant type', 'register type'
        and 'academic field' columns

    Returns:
    dict: 'overall_distribution' (value counts of variant type),
        'by_register' (register type x variant type counts) and
        'by_field' (academic field x variant type counts)
    """
    variants = df['variant type']

    overall = variants.value_counts()
    # Rows are the grouping factor, columns are the variant types
    per_register = pd.crosstab(df['register type'], variants)
    per_field = pd.crosstab(df['academic field'], variants)

    return dict(
        overall_distribution=overall,
        by_register=per_register,
        by_field=per_field,
    )

def chi_square_test(df, var1, var2):
    """
    Run a chi-square test of independence between two columns.

    Parameters:
    df (pandas.DataFrame): The input dataframe
    var1 (str): Column used for the rows of the contingency table
    var2 (str): Column used for the columns of the contingency table

    Returns:
    dict: 'chi2' (test statistic), 'p_value', 'dof' (degrees of freedom)
        and 'contingency_table' (the observed counts)
    """
    observed = pd.crosstab(df[var1], df[var2])

    # chi2_contingency yields (statistic, p-value, dof, expected frequencies);
    # the expected frequencies are not part of the returned summary.
    outcome = stats.chi2_contingency(observed)
    statistic, p_val, degrees = outcome[0], outcome[1], outcome[2]

    return {
        'chi2': statistic,
        'p_value': p_val,
        'dof': degrees,
        'contingency_table': observed
    }

def analyze_usage_patterns(df):
    """
    Build frequency distributions of actual usage and a type-token ratio
    per register.

    Parameters:
    df (pandas.DataFrame): Dataframe with 'register type', 'variant type'
        and 'actual usage' columns

    Returns:
    dict: 'usage_patterns' (counts of each actual-usage value within every
        register type / variant type group) and 'type_token_ratio'
        (mapping of register -> distinct usages / total usages, 0 when the
        register has no rows)
    """
    # Frequency of each usage string within every (register, variant) group
    grouped_usage = df.groupby(['register type', 'variant type'])['actual usage']
    pattern_counts = grouped_usage.value_counts()

    def type_token_ratio(register):
        # Tokens are all usages recorded for the register; types are the distinct ones
        register_usages = df.loc[df['register type'] == register, 'actual usage']
        token_total = len(register_usages)
        if token_total > 0:
            return len(set(register_usages)) / token_total
        return 0

    ratios = {
        register: type_token_ratio(register)
        for register in df['register type'].unique()
    }

    return {
        'usage_patterns': pattern_counts,
        'type_token_ratio': ratios
    }

def plot_distributions(df):
    """
    Create visualizations of the data distributions

    Draws a 2x2 grid: register counts, variant counts stacked per register,
    academic field counts, and register counts per gender.

    Parameters:
    df (pandas.DataFrame): Dataframe with 'register type', 'variant type',
        'academic field' and 'gender' columns

    Returns:
    matplotlib.figure.Figure: The figure containing all four plots
    """

    # Create multiple plots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Plot 1: Register distribution
    sns.countplot(data=df, x='register type', ax=axes[0,0])
    axes[0,0].set_title('Distribution of Registers')
    axes[0,0].set_xlabel('Register Type')
    axes[0,0].set_ylabel('Count')

    # Plot 2: Variant distribution by register
    register_variant = pd.crosstab(df['register type'], df['variant type'])
    register_variant.plot(kind='bar', stacked=True, ax=axes[0,1])
    axes[0,1].set_title('Variant Distribution by Register')
    axes[0,1].set_xlabel('Register Type')
    axes[0,1].set_ylabel('Count')

    # Plot 3: Academic field distribution
    sns.countplot(data=df, x='academic field', ax=axes[1,0])
    axes[1,0].set_title('Distribution by Academic Field')
    # Rotate via tick_params rather than re-assigning the tick label texts:
    # set_xticklabels() depends on the current tick positions and is
    # discouraged by matplotlib for this purpose.
    axes[1,0].tick_params(axis='x', labelrotation=45)

    # Plot 4: Gender distribution by register
    gender_register = pd.crosstab(df['gender'], df['register type'])
    gender_register.plot(kind='bar', ax=axes[1,1])
    axes[1,1].set_title('Register Distribution by Gender')

    plt.tight_layout()
    return fig

def generate_summary_statistics(df):
    """
    Generate comprehensive summary statistics for the dataset
    
    Parameters:
    df (pandas.DataFrame): The input dataframe containing sociolinguistic data.
        Must provide the columns 'register type', 'variant type',
        'academic field', 'gender' and 'actual usage' (string values).
    
    Returns:
    dict: A dictionary containing various summary statistics organized by category:
        'dataset_overview', 'register_statistics', 'category_proportions',
        'cross_categorical', 'usage_statistics' and, only when the dataframe
        has numeric columns, 'numerical_statistics'
    """
    def unique_usage_ratio(values):
        # A group can be empty (e.g. a category level with no rows); fall back
        # to 0 exactly as analyze_usage_patterns does instead of dividing by zero.
        token_count = len(values)
        return len(set(values)) / token_count if token_count > 0 else 0

    summary_stats = {}
    
    # Overall dataset statistics
    summary_stats['dataset_overview'] = {
        'total_observations': len(df),
        'unique_registers': df['register type'].nunique(),
        'unique_variants': df['variant type'].nunique(),
        'unique_fields': df['academic field'].nunique(),
        'missing_values': df.isnull().sum().to_dict()
    }
    
    # Register type statistics
    # observed=False is passed explicitly: it keeps the historical behaviour
    # for categorical keys (all levels reported) and avoids relying on the
    # deprecated implicit default.
    register_stats = df.groupby('register type', observed=False).agg({
        'variant type': ['count', 'nunique'],
        'actual usage': ['count', 'nunique']
    })
    register_stats.columns = ['total_occurrences', 'unique_variants', 
                            'usage_tokens', 'unique_usage_types']
    summary_stats['register_statistics'] = register_stats.to_dict()
    
    # Calculate proportions (in percent) for each categorical variable
    categorical_cols = ['register type', 'variant type', 'academic field', 'gender']
    proportions = {}
    for col in categorical_cols:
        prop = (df[col].value_counts() / len(df) * 100).round(2)
        proportions[col] = prop.to_dict()
    summary_stats['category_proportions'] = proportions
    
    # Cross-categorical analysis (row-normalised shares)
    summary_stats['cross_categorical'] = {
        'register_by_field': pd.crosstab(
            df['register type'], 
            df['academic field'], 
            normalize='index'
        ).round(3).to_dict(),
        'variant_by_register': pd.crosstab(
            df['variant type'], 
            df['register type'], 
            normalize='index'
        ).round(3).to_dict()
    }
    
    # Usage pattern statistics (string length of each usage, per register)
    usage_by_register = df.groupby('register type', observed=False)['actual usage']
    usage_stats = usage_by_register.agg([
        ('mean_usage_length', lambda x: x.str.len().mean()),
        ('median_usage_length', lambda x: x.str.len().median()),
        ('std_usage_length', lambda x: x.str.len().std()),
        ('unique_usage_ratio', unique_usage_ratio)
    ]).round(3)
    summary_stats['usage_statistics'] = usage_stats.to_dict()
    
    # Descriptive statistics for numerical features (if any)
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    if len(numerical_cols) > 0:
        summary_stats['numerical_statistics'] = df[numerical_cols].describe().to_dict()
    
    return summary_stats

def print_summary_report(summary_stats):
    """
    Write a human-readable report of the summary statistics to stdout.
    
    Parameters:
    summary_stats (dict): Summary dictionary providing the keys
        'dataset_overview', 'category_proportions' and 'usage_statistics'
    """
    print("\n=== SOCIOLINGUISTIC DATA ANALYSIS SUMMARY ===\n")
    
    # Section 1: headline counts
    print("DATASET OVERVIEW:")
    overview = summary_stats['dataset_overview']
    overview_rows = (
        ("Total Observations", 'total_observations'),
        ("Unique Registers", 'unique_registers'),
        ("Unique Variants", 'unique_variants'),
        ("Unique Academic Fields", 'unique_fields'),
    )
    for label, key in overview_rows:
        print(f"{label}: {overview[key]}")
    
    # Section 2: percentage share of every level of each categorical column
    print("\nCATEGORY DISTRIBUTIONS:")
    for column_name, shares in summary_stats['category_proportions'].items():
        print(f"\n{column_name.upper()}:")
        for level, share in shares.items():
            print(f"  {level}: {share:.1f}%")
    
    # Section 3: one sub-block per usage metric, one line per register
    print("\nUSAGE STATISTICS BY REGISTER:")
    for metric_name, per_register in summary_stats['usage_statistics'].items():
        print(f"\n{metric_name}:")
        for register_name, metric_value in per_register.items():
            print(f"  {register_name}: {metric_value:.2f}")
    
    print("\n=== END OF SUMMARY ===")

def main():
    """
    Run the full analysis pipeline on 'sociolinguistic_data.csv'.

    Loads the data, runs the distribution / usage analyses and two
    chi-square tests, draws the distribution plots, prints the summary
    report and returns every result in one dictionary.

    Returns:
    dict: 'register_analysis', 'variant_analysis', 'usage_analysis',
        'statistical_tests' (with 'register_variant' and 'register_gender')
        and 'summary_statistics'
    """
    df = load_and_clean_data('sociolinguistic_data.csv')

    # Descriptive analyses
    results = {
        'register_analysis': analyze_register_distribution(df),
        'variant_analysis': analyze_variant_patterns(df),
        'usage_analysis': analyze_usage_patterns(df),
    }

    # Independence tests of register against variant type and gender
    results['statistical_tests'] = {
        'register_variant': chi_square_test(df, 'register type', 'variant type'),
        'register_gender': chi_square_test(df, 'register type', 'gender'),
    }

    # Visualizations (the figure object itself is not kept)
    plot_distributions(df)

    # Summary statistics are printed and also included in the results
    summary = generate_summary_statistics(df)
    print_summary_report(summary)
    results['summary_statistics'] = summary

    return results

# Entry point: run the analysis pipeline only when this module is executed
# directly (not when imported) and keep the returned dictionary in `results`.
if __name__ == "__main__":
    results = main()

In [8]:
def create_cross_tabulation_tables(df):
    """
    Create detailed cross-tabulation tables for variant analysis across different social and linguistic factors.
    
    Parameters:
    df (pandas.DataFrame): The input dataframe containing sociolinguistic data.
        Must provide the columns 'variant type', 'register type',
        'academic field', 'gender' and 'actual usage'.
    
    Returns:
    dict: A dictionary containing various cross-tabulation tables and statistical measures:
        'variant_frequencies' (count tables keyed by 'register',
        'academic_field', 'gender'), 'three_way' (variant x register by
        gender, with margins), 'proportional' (row percentages, same keys as
        'variant_frequencies'), 'variant_summary' (per-variant counts) and
        'chi_square_tests' (chi2 / p_value / dof per factor)
    """
    tables = {}

    # Result key -> dataframe column for every factor crossed with variant type
    factor_columns = {
        'register': 'register type',
        'academic_field': 'academic field',
        'gender': 'gender'
    }
    variants = df['variant type']
    
    # 1. Basic variant frequency table across all factors
    tables['variant_frequencies'] = {
        key: pd.crosstab(variants, df[column])
        for key, column in factor_columns.items()
    }
    
    # 2. Three-way cross-tabulation
    # Variant type × Register type × Gender
    tables['three_way'] = pd.crosstab(
        [df['variant type'], df['register type']],
        df['gender'],
        margins=True
    )
    
    # 3. Proportional tables (normalized by row, expressed in percent)
    tables['proportional'] = {
        key: pd.crosstab(variants, df[column], normalize='index').round(3) * 100
        for key, column in factor_columns.items()
    }
    
    # 4. Summary statistics for each variant
    # observed=False is passed explicitly: it keeps the historical behaviour
    # for a categorical 'variant type' (all levels reported) and avoids the
    # FutureWarning about the changing default.
    variant_summary = df.groupby('variant type', observed=False).agg({
        'actual usage': ['count', 'nunique'],
        'register type': 'nunique',
        'academic field': 'nunique',
        'gender': 'nunique'
    })
    variant_summary.columns = [
        'total_tokens',
        'unique_usage_types',
        'registers_used',
        'fields_used',
        'genders_represented'
    ]
    tables['variant_summary'] = variant_summary
    
    # 5. Calculate chi-square statistics for independence tests
    # The contingency tables are the frequency tables built in step 1, so
    # they are reused instead of being recomputed.
    tables['chi_square_tests'] = {}
    for key, column in factor_columns.items():
        contingency = tables['variant_frequencies'][key]
        chi2, p_value, dof, _ = stats.chi2_contingency(contingency)
        tables['chi_square_tests'][f'variant type_vs_{column}'] = {
            'chi2': chi2,
            'p_value': p_value,
            'dof': dof
        }
    
    return tables

def print_cross_tabulation_results(tables):
    """
    Print the cross-tabulation analysis as one plain-text, copy-paste friendly report.
    
    Parameters:
    tables (dict): Dictionary providing 'variant_frequencies', 'three_way',
        'proportional', 'variant_summary' and 'chi_square_tests'
    """
    def render(frame, heading):
        """Return the heading, an '=' underline of equal length and the table text."""
        def fmt_number(value):
            # Floats are shown with two decimals; anything else is stringified
            if isinstance(value, float):
                return '{:.2f}'.format(value)
            return str(value)

        rendered = frame.to_string(float_format=fmt_number)
        underline = '=' * len(heading)
        return f"{heading}\n{underline}\n{rendered}\n\n"
    
    # Header and section 1: raw frequency tables, one per factor
    report = [
        "VARIANT DISTRIBUTION ANALYSIS",
        "===========================\n",
        "1. VARIANT FREQUENCIES BY FACTOR",
        "-------------------------------",
    ]
    for factor_name, freq_table in tables['variant_frequencies'].items():
        report.append(render(freq_table, f"{factor_name.upper()} DISTRIBUTION"))
    
    # Section 2: three-way table
    report.append("2. THREE-WAY CROSS-TABULATION (Variant × Register × Gender)")
    report.append("-------------------------------------------------------")
    report.append(render(tables['three_way'], "Variant × Register × Gender Distribution"))
    
    # Section 3: row-percentage tables, one per factor
    report.append("3. PROPORTIONAL DISTRIBUTIONS (%)")
    report.append("--------------------------------")
    for factor_name, share_table in tables['proportional'].items():
        report.append(render(share_table, f"{factor_name.upper()} PROPORTIONS"))
    
    # Section 4: per-variant summary
    report.append("4. VARIANT SUMMARY STATISTICS")
    report.append("--------------------------")
    report.append(render(tables['variant_summary'], "Summary Statistics by Variant"))
    
    # Section 5: chi-square results, flagged significant at p < 0.05
    report.append("5. CHI-SQUARE TESTS OF INDEPENDENCE")
    report.append("----------------------------------")
    test_blocks = [
        f"Test: {label}\n"
        f"  Chi-square statistic: {outcome['chi2']:.2f}\n"
        f"  p-value: {outcome['p_value']:.4f}\n"
        f"  Degrees of freedom: {outcome['dof']}\n"
        f"  Significant: {'Yes' if outcome['p_value'] < 0.05 else 'No'}\n"
        for label, outcome in tables['chi_square_tests'].items()
    ]
    report.append("\n".join(test_blocks))
    
    # Emit the whole report in a single write
    print("\n".join(report))

def visualize_cross_tabulations(df, tables):
    """
    Draw a 2x2 figure summarising the cross-tabulation results.
    
    Parameters:
    df (pandas.DataFrame): The input dataframe (not read by this function)
    tables (dict): Dictionary providing 'variant_frequencies' (with
        'register' and 'academic_field'), 'variant_summary' (with a
        'total_tokens' column) and 'proportional' (with 'gender')
    
    Returns:
    matplotlib.figure.Figure: The figure containing all plots
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Variant Distribution Across Social and Linguistic Factors', fontsize=16)
    
    # Top row: annotated count heatmaps (register on the left, academic field on the right)
    heatmap_panels = (
        ('register', axes[0, 0], 'Variant Frequencies by Register'),
        ('academic_field', axes[0, 1], 'Variant Frequencies by Academic Field'),
    )
    for table_key, panel, panel_title in heatmap_panels:
        sns.heatmap(
            tables['variant_frequencies'][table_key],
            annot=True,
            fmt='d',
            cmap='YlGnBu',
            ax=panel
        )
        panel.set_title(panel_title)
    
    # Bottom left: total token count per variant
    tokens_panel = axes[1, 0]
    token_totals = tables['variant_summary']['total_tokens']
    token_totals.plot(kind='bar', ax=tokens_panel)
    tokens_panel.set_title('Total Tokens by Variant')
    tokens_panel.set_xlabel('Variant Type')
    tokens_panel.set_ylabel('Number of Tokens')
    
    # Bottom right: stacked gender percentages per variant
    gender_panel = axes[1, 1]
    gender_shares = tables['proportional']['gender']
    gender_shares.plot(kind='bar', stacked=True, ax=gender_panel)
    gender_panel.set_title('Variant Distribution by Gender (%)')
    gender_panel.set_xlabel('Variant Type')
    gender_panel.set_ylabel('Percentage')
    
    plt.tight_layout()
    return fig

In [9]:
# Notebook driver: load the cleaned dataset, build the cross-tabulation
# tables from it, and print the formatted report.
df  = load_and_clean_data('sociolinguistic_data.csv')
cross_tab_tables = create_cross_tabulation_tables(df)
print_cross_tabulation_results(cross_tab_tables)

VARIANT DISTRIBUTION ANALYSIS

1. VARIANT FREQUENCIES BY FACTOR
-------------------------------
REGISTER DISTRIBUTION
register type  Academic  Casual
variant type                   
Discourse            17      17
Lexical              17      17
Syntactic            16      16


ACADEMIC_FIELD DISTRIBUTION
academic field  Anthropology  Bio  CS  Chemistry  Economics  English  Math  Philosophy  Physics  Psychology  Sociology  math  social studies
variant type                                                                                                                                
Discourse                  0    6   2          2          4        4     1           0        4           0          2     5               4
Lexical                    2    2   4          0          2        4     0           8        2           2          4     2               2
Syntactic                  6    2   2          8          2        0     0           0        2           6          2     0    

  variant_summary = df.groupby('variant type').agg({
