In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from data_utils import get_employment
all_data, features, target, group = get_employment()

# Name of the label column; every other retained column is used as a feature.
target_col = 'ESR'

# Build the modelling frame from `all_data` without touching it: discard the
# columns excluded from this analysis, then cast every remaining column
# (features and target alike) to pandas' categorical dtype.
df = (
    all_data
    .drop(columns=['CIT', 'MIG', 'DEAR', 'DEYE', 'NATIVITY', 'ANC'])
    .astype('category')
)

# Feature columns, in the frame's column order.
feature_cols = [name for name in df.columns if name != target_col]

# Design matrix and label vector.
X, y = df[feature_cols], df[target_col]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest with a fixed seed (`fit` returns the estimator).
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Pair each feature with its importance and order the pairs from most to
# least important.
importances = clf.feature_importances_
feature_importances = sorted(
    zip(feature_cols, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Feature importances:")
for feature, importance in feature_importances:
    print(f"{feature}: {importance}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Function to train and evaluate a model, returning the accuracy
def train_and_evaluate(X_train, X_test, y_train, y_test, n_estimators=100, random_state=42):
    """Fit a random forest on the training split and return its test accuracy.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_test, y_test
        Features the fitted classifier predicts on, and the labels those
        predictions are compared against.
    n_estimators : int, default 100
        Forwarded to ``RandomForestClassifier(n_estimators=...)``.
    random_state : default 42
        Forwarded to ``RandomForestClassifier(random_state=...)``. A fixed
        default keeps repeated calls on the same data comparable.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``.
    """
    # A fresh estimator per call, so no fitted state is shared between calls.
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(X_train, y_train)

    # Score the predictions for the test split against the true labels.
    return accuracy_score(y_test, clf.predict(X_test))

# Train and evaluate the initial model on the full feature set (baseline).
initial_accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)
print("Initial Model Accuracy:", initial_accuracy)

# Rank the feature names once, least important first. The ranking does not
# depend on k, so it is computed here rather than re-sorted on every iteration.
features_by_importance = [
    feature for feature, _ in sorted(feature_importances, key=lambda x: x[1])
]

# Largest number of features the sweep tries to remove.
MAX_FEATURES_REMOVED = 6

# Values of k to evaluate. k is capped below the total number of features so
# that at least one column is always left to train on.
k_values = list(range(min(MAX_FEATURES_REMOVED + 1, len(features_by_importance))))

# Store performance for each value of k
performance = []

for k in k_values:
    # The k least important features (empty for k = 0).
    least_important_features = features_by_importance[:k]

    # Prepare the dataset without the k least important features
    X_train_new = X_train.drop(columns=least_important_features)
    X_test_new = X_test.drop(columns=least_important_features)

    # Train and evaluate the new model
    new_accuracy = train_and_evaluate(X_train_new, X_test_new, y_train, y_test)
    performance.append(new_accuracy)

# Create a line plot with k on the x-axis and performance on the y-axis
performance_data = pd.DataFrame({"k": k_values, "accuracy": performance})
fig, ax = plt.subplots()
sns.lineplot(x="k", y="accuracy", data=performance_data, ax=ax)
ax.set_xlabel("Number of least important features removed")
ax.set_ylabel("Model accuracy")
ax.set_title("Model performance as we remove more unimportant features")
plt.show()