In [12]:
"""
This notebook analyzes the resulting data
"""

import pandas as pd

In [13]:
def basic_info(label_name):
    print("-- Preprocessed dataset --")
    
    # Load in the previous dataset
    dataset = pd.read_parquet("../data/df_preprocessed.parquet")
    
    # Get the number of missing values
    print(f"# of Missing/NaN values: {len(dataset.loc[dataset[label_name].isna()])}")

    # Get the value count for the label in percentages
    print("\nValue counts (%): ")
    print(dataset["satjob"].value_counts(normalize=True) * 100)
    

    print("\n\n\n-- Final dataset --")
    
    # Load in the final dataset
    dataset = pd.read_parquet("../data/df_preprocessed_with_predictions.parquet")
    
    # Get the number of missing values
    print(f"# of Missing/NaN values: {len(dataset.loc[dataset[label_name].isna()])}")

    # Get the value count for the label in percentages
    print("\nValue counts (%): ")
    print(dataset["satjob"].value_counts(normalize=True) * 100)

    return dataset

In [14]:
def tsne(cleaned, label_name):
    # T-SNE
    from sklearn.manifold import TSNE
    
    # Drop rows with any remaining NaNs (or use .fillna() if you prefer imputation)
    print(f"NaN remaining: {len(cleaned.dropna())}")
    cleaned = cleaned.dropna()
    
    # Reduce size for performance â€“ t-SNE is O(n^2)
    MAX_ROWS = 50000
    if len(cleaned) > MAX_ROWS:
        cleaned = cleaned.sample(MAX_ROWS, random_state=123)
        print(f"Sampled {MAX_ROWS} rows for t-SNE (original: {len(dataset)})")
    
    # Run t-SNE
    X_tsne = TSNE(n_components=2, random_state=123, perplexity=30).fit_transform(cleaned)
    
    # Plot
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], s=3, alpha=0.7)
    plt.title("t-SNE visualization (excluding satjob & satfin)")
    plt.xlabel("t-SNE 1")
    plt.ylabel("t-SNE 2")
    plt.show()


In [15]:
def histograms(cleaned):
    # Visualize distributions of values
    for col in cleaned.select_dtypes(include="number"):
        sns.histplot(cleaned[col].dropna(), kde=True)
        plt.title(col)
        plt.show()

In [17]:
def scatter_plot(cleaned):
    for i in cleaned.columns:
        for j in cleaned.columns:
            sns.scatterplot(
                data=cleaned,
                x=i,   # choose your x-axis column
                y=j,    # choose your y-axis column
                alpha=0.6
            )
            plt.title(f"Scatter plot: {i} vs {j}")
            plt.show()

In [18]:
def analyze( label_name ):

    # Compare the preprocessed and the final datasets
    dataset = basic_info(label_name)

    # Prepare the dataset given that satjob or stafin might not have been generated yet
    if label_name == "satjob":
        cleaned = dataset.dropna(axis=1, how='all').drop(columns=["satfin"], errors="ignore")
    elif label_name == "satfin":
        cleaned = dataset.dropna(axis=1, how='all').drop(columns=["satjob"], errors="ignore")
    else:
        raise RuntimeError("Wrong label(target) name")

    import sys
    sys.exit()

    #T-sne
    print("T-SNE")
    tsne(cleaned)

    # Histograms
    print("Histograms")

    # Scatter plots
    print("Scatter plots")
    scatter_plot(cleaned)

    # Time correlations
    print("Time correlations")

    # Finding common patterns with FP-tree (e.g., what kind of degree brings satisfaction)

    # Evaluating the correlations



In [19]:
# Satjob
analyze("satjob")

-- Preprocessed dataset --
# of Missing/NaN values: 15319701

Value counts (raw): 
satjob
1.0    2681
2.0    2124
3.0     491
4.0     227
Name: count, dtype: int64

Value counts (%): 
satjob
1.0    48.542459
2.0    38.457360
3.0     8.890096
4.0     4.110085
Name: proportion, dtype: float64



-- Final dataset --
# of Missing/NaN values: 0

Value counts (raw): 
satjob
1.0    13907166
2.0     1417340
3.0         491
4.0         227
Name: count, dtype: int64

Value counts (%): 
satjob
1.0    90.746902
2.0     9.248413
3.0     0.003204
4.0     0.001481
Name: proportion, dtype: float64


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Satfin
analyze("satfin")