# Exploratory Analysis: Skill Metrics in Digital Labor Markets
    
    
 This notebook explores the skill metrics data and examines relationships between skill characteristics and market outcomes.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import sys

# Add the src directory to path so we can import our modules
sys.path.append('../src/python/')

# Import our custom modules
from skill_metrics_processor import SkillDepthCalculator, calculate_skill_diversity
from visualization_generator import plot_time_series, plot_relationship_skill_metric_outcome

# Set plot style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and newer releases no longer ship the old names, so use whichever whitegrid
# variant this installation provides (falling back to the default style).
WHITEGRID_STYLES = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plt.style.use(next((name for name in WHITEGRID_STYLES if name in plt.style.available), 'default'))
sns.set_palette('Set2')
plt.rcParams['figure.figsize'] = (12, 8)
%matplotlib inline

## 1. Load and Examine the Data

First, we'll load the processed skill metrics data and examine its structure.

In [None]:
# Load data
# TODO: point DATA_PATH at the processed skill metrics file before running.
# The frame must be loaded here: every later cell reads `df`.
DATA_PATH = '../data/input/your_data_file.csv'
df = pd.read_csv(DATA_PATH)

# Convert date columns to datetime (any column whose name contains "date").
# str() keeps the name check working if some column labels are not strings.
date_columns = [col for col in df.columns if 'date' in str(col).lower()]
for col in date_columns:
    df[col] = pd.to_datetime(df[col])

# Display info about the dataset
print(f"Dataset shape: {df.shape}")
print("\nData types:")
print(df.dtypes)

# Display first few rows
df.head()

## 2. Data Preparation

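Let's prepare the data for analysis by:

1. Checking for missing values
2. Flagging each observation as before or after the reference date
3. Restricting the sample to the 90 days on either side of the reference date
4. Removing extreme outliers in the skill metrics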

In [None]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values by column:")
print(missing_values[missing_values > 0])

# Define reference date (e.g., policy change, technology adoption date)
reference_date = pd.to_datetime('2023-01-01')  # Replace with actual reference date

# Half-width of the analysis window around the reference date
ANALYSIS_WINDOW = pd.Timedelta(days=90)
# Observations whose |z-score| reaches this value are treated as extreme outliers
Z_SCORE_THRESHOLD = 3

# Create period indicator (0 = before, 1 = on/after reference date)
df['period'] = (df['date'] >= reference_date).astype(int)

# Filter to relevant time period for analysis (both window edges inclusive)
analysis_df = df[
    df['date'].between(reference_date - ANALYSIS_WINDOW, reference_date + ANALYSIS_WINDOW)
].copy()

# Handle outliers in skill metrics
for metric in ['individual_skill_depth', 'skill_diversity']:
    if metric not in analysis_df.columns:
        continue

    values = analysis_df[metric]
    # Population standard deviation (ddof=0), computed on observed values only.
    # Missing values are NOT imputed with 0: that would shift the mean and
    # inflate the spread, distorting which rows count as outliers.
    spread = values.std(ddof=0)
    if not spread > 0:
        # Constant or all-missing column: z-scores are undefined, so there is
        # nothing to drop (comparing NaN z-scores would discard every row).
        continue

    z_scores = (values - values.mean()) / spread
    # Keep non-extreme rows; rows with a missing metric are not outliers and
    # are left for downstream steps to handle. Copy so that later column
    # assignments operate on an independent frame.
    keep_rows = z_scores.abs().lt(Z_SCORE_THRESHOLD) | values.isna()
    analysis_df = analysis_df[keep_rows].copy()

print(f"Analysis dataset shape after filtering: {analysis_df.shape}")

## 3. Descriptive Statistics

Let's examine the distributions and summary statistics of our key metrics.

In [None]:
# Summary statistics for key metrics (only those present in the data)
metrics = ['individual_skill_depth', 'accumulated_skill_count', 'skill_diversity']
metrics = [m for m in metrics if m in analysis_df.columns]

if not metrics:
    # describe() and plt.subplots() both fail on zero columns, so say so instead.
    print("No skill metric columns found in the analysis dataset.")
else:
    print("Overall Summary Statistics:")
    print(analysis_df[metrics].describe())

    # Compare before and after reference date
    print("\nBefore Reference Date:")
    print(analysis_df[analysis_df['period'] == 0][metrics].describe())

    print("\nAfter Reference Date:")
    print(analysis_df[analysis_df['period'] == 1][metrics].describe())

    # Visualize distributions.
    # Label the periods in the data itself so seaborn builds the legend from
    # the actual hue levels; overriding legend text afterwards ties labels to
    # artist draw order, which need not match the 0/1 ordering.
    plot_df = analysis_df.assign(
        Period=analysis_df['period'].map({0: 'Before', 1: 'After'})
    )

    # squeeze=False always returns a 2-D array of axes, so a single available
    # metric is handled the same way as several.
    fig, axes = plt.subplots(1, len(metrics), figsize=(15, 5), squeeze=False)

    for ax, metric in zip(axes[0], metrics):
        sns.histplot(
            data=plot_df,
            x=metric,
            hue='Period',
            hue_order=['Before', 'After'],
            kde=True,
            ax=ax
        )
        ax.set_title(f'Distribution of {metric}')

    plt.tight_layout()
    plt.show()

## 4. Time Series Analysis

Let's examine how skill metrics change over time, particularly around the reference date.

In [None]:
# Arguments shared by both time-series plots
time_series_kwargs = {'df': analysis_df, 'reference_date': reference_date}
present_columns = set(analysis_df.columns)

# Skill depth trend around the reference date
if 'individual_skill_depth' in present_columns:
    depth_plot = plot_time_series(
        metric_column='individual_skill_depth',
        title='Skill Depth Over Time',
        **time_series_kwargs
    )
    plt.show()

# Skill diversity trend around the reference date
if 'skill_diversity' in present_columns:
    diversity_plot = plot_time_series(
        metric_column='skill_diversity',
        title='Skill Diversity Over Time',
        **time_series_kwargs
    )
    plt.show()

## 5. Relationship Between Skill Metrics and Outcomes

Now let's examine how skill metrics relate to important market outcomes.

In [None]:
present_columns = set(analysis_df.columns)

# Skill depth vs. success rate (drawn only when both columns exist)
depth_success_spec = dict(
    x_metric='individual_skill_depth',
    y_outcome='success_rate',
    x_label='Skill Depth',
    y_label='Success Rate',
    title='Relationship Between Skill Depth and Success Rate',
)
if {depth_success_spec['x_metric'], depth_success_spec['y_outcome']} <= present_columns:
    depth_success_plot = plot_relationship_skill_metric_outcome(df=analysis_df, **depth_success_spec)
    plt.show()

# Skill diversity vs. earnings (drawn only when both columns exist)
diversity_earnings_spec = dict(
    x_metric='skill_diversity',
    y_outcome='earnings',
    x_label='Skill Diversity',
    y_label='Earnings',
    title='Relationship Between Skill Diversity and Earnings',
)
if {diversity_earnings_spec['x_metric'], diversity_earnings_spec['y_outcome']} <= present_columns:
    diversity_earnings_plot = plot_relationship_skill_metric_outcome(df=analysis_df, **diversity_earnings_spec)
    plt.show()

## 6. User Segmentation

Let's segment users based on their skill characteristics to understand different profiles.

In [None]:
# Create segments based on skill depth and diversity
if 'individual_skill_depth' in analysis_df.columns and 'skill_diversity' in analysis_df.columns:
    # Calculate medians for segmentation (pandas skips missing values here)
    depth_median = analysis_df['individual_skill_depth'].median()
    diversity_median = analysis_df['skill_diversity'].median()

    # Create segment labels via a median split: 'High' at or above the median,
    # 'Low' below it. Rows with a missing metric get a missing label rather
    # than 'Low' (a NaN >= median comparison is False, which would otherwise
    # silently file them under 'Low').
    segment_labels = {}
    for source_col, segment_col, cutoff in [
        ('individual_skill_depth', 'depth_segment', depth_median),
        ('skill_diversity', 'diversity_segment', diversity_median),
    ]:
        values = analysis_df[source_col]
        labels = pd.Series(np.where(values >= cutoff, 'High', 'Low'), index=values.index)
        segment_labels[segment_col] = labels.where(values.notna())

    # Combine segments, e.g. 'High-Low'; missing if either component is missing
    segment_labels['user_segment'] = (
        segment_labels['depth_segment'] + '-' + segment_labels['diversity_segment']
    )

    # assign() returns a new frame, so re-running this cell is idempotent and
    # does not write into a slice of an earlier frame.
    analysis_df = analysis_df.assign(**segment_labels)

    # Count users in each segment; empty period/segment combinations show 0
    segment_counts = (
        analysis_df.groupby(['period', 'user_segment']).size().unstack(fill_value=0)
    )
    print("User counts by segment and period:")
    print(segment_counts)

    # Plot segments
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(
        data=analysis_df,
        x='individual_skill_depth',
        y='skill_diversity',
        hue='user_segment',
        style='period',
        alpha=0.7,
        ax=ax
    )
    # Dashed lines mark the median cut-offs that define the four quadrants
    ax.axvline(x=depth_median, color='gray', linestyle='--')
    ax.axhline(y=diversity_median, color='gray', linestyle='--')
    ax.set_title('User Segmentation by Skill Characteristics')
    ax.set_xlabel('Skill Depth')
    ax.set_ylabel('Skill Diversity')
    # Retitle the legend seaborn already built instead of regenerating it, so
    # its hue and style entries are kept as drawn.
    segment_legend = ax.get_legend()
    if segment_legend is not None:
        segment_legend.set_title('Segment')
    plt.show()

## 7. Statistical Tests

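For each skill metric, we run a two-sample t-test that does not assume equal variances (Welch's t-test) to check whether its mean differs between the periods before and after the reference date.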

In [None]:
# T-test comparing skill metrics before and after reference date
MIN_GROUP_SIZE = 30        # each period needs more than this many observations
SIGNIFICANCE_LEVEL = 0.05  # p-value cut-off used for the Yes/No flag

is_after = analysis_df['period'] == 1
is_before = analysis_df['period'] == 0

for metric in metrics:
    before = analysis_df.loc[is_before, metric].dropna()
    after = analysis_df.loc[is_after, metric].dropna()

    # Only perform test if we have sufficient data; report the skip so that
    # "not tested" is never mistaken for "no significant difference".
    if len(before) <= MIN_GROUP_SIZE or len(after) <= MIN_GROUP_SIZE:
        print(f"T-test for {metric}: skipped, need more than {MIN_GROUP_SIZE} observations "
              f"per period (before: {len(before)}, after: {len(after)})\n")
        continue

    # equal_var=False selects Welch's t-test (no equal-variance assumption)
    t_stat, p_value = stats.ttest_ind(before, after, equal_var=False)
    print(f"T-test for {metric}:")
    print(f"  t-statistic: {t_stat:.4f}")
    print(f"  p-value: {p_value:.4f}")
    print(f"  Mean before: {before.mean():.4f}")
    print(f"  Mean after: {after.mean():.4f}")
    print(f"  Mean difference: {after.mean() - before.mean():.4f}")
    print(f"  Significant difference: {'Yes' if p_value < SIGNIFICANCE_LEVEL else 'No'}\n")

## 8. Conclusions

Based on the exploratory analysis, we can draw the following conclusions:

1. **Changes in Skill Metrics**: [Your findings about how skill depth and diversity changed over time]

2. **Relationship with Outcomes**: [Your findings about relationships between skill metrics and outcomes]

3. **User Segments**: [Your findings about different user segments and their characteristics]

4. **Statistical Significance**: [Summary of statistical test results]

These findings suggest [broader implications for the research question].

### Next Steps

1. Conduct formal regression analysis using the Stata scripts
2. Explore additional metrics and relationships
3. Investigate heterogeneous effects across different user segments
4. Create visualizations for presentation and publication