In [1]:
# this file will do random forests separately by party to see what happens

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
# importing and separating data
NASCOTUS_fixed = pd.read_csv('NASCOTUS_fixed.csv')

In [3]:
# conservative modeling
conservative_rf = NASCOTUS_fixed[NASCOTUS_fixed['presAffiliation'] == 1].copy()
conservative_rf.drop('presAffiliation', axis=1, inplace=True)
# one hot
X_con = NASCOTUS_fixed.drop(['justicesDecision', 'certReason'], axis=1)
y_con = NASCOTUS_fixed['justicesDecision']

encoder_con = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_encoded_con = encoder_con.fit_transform(X_con)
#split train test
# then I split data 
X_train_con, X_test_con, y_train_con, y_test_con = train_test_split(X_encoded_con, y_con, test_size=0.2, random_state=42)

# now I train random forest classifier
randfor_con = RandomForestClassifier(n_estimators=11, class_weight = 'balanced', random_state=42) # n_estimators means the amount of trees 

randfor_con.fit(X_train_con, y_train_con) #training the model 


# Making predictions for both train and test data
train_predictions_con = randfor_con.predict(X_train_con)
test_predictions_con = randfor_con.predict(X_test_con)

# Calculating accuracy for both train and test data
train_accuracy_con = accuracy_score(y_train_con, train_predictions_con)
test_accuracy_con = accuracy_score(y_test_con, test_predictions_con)

# Printing accuracies
print(f'Training Accuracy: {train_accuracy_con}')
print(f'Test Accuracy: {test_accuracy_con}')




importances_con = randfor_con.feature_importances_ #feature importances 


# now relating the encoded features back to the acutal ones
feature_names_con = encoder_con.get_feature_names_out(X_con.columns)
original_feature_importance_con = dict.fromkeys(X_con.columns, 0)

for encoded_feature_con, importance_con in zip(feature_names_con, importances_con):
    original_feature_con= encoded_feature_con.split('_')[0]  # using default one hot encoding feature naming
    original_feature_importance_con[original_feature_con] += importance_con 

sorted_by_importance_con = sorted(original_feature_importance_con.items(), key=lambda x: x[1], reverse=True)

# printing the sorted features now and their importance
for feature_con, importance_con in sorted_by_importance_con:
    print(f"{feature_con}: {importance_con}")


Training Accuracy: 0.5418102525164029
Test Accuracy: 0.5438049996106222
lawType: 0.5903521074183662
presAffiliation: 0.21815936270060834
lcDispositionDirection: 0.19148852988102524


In [4]:
# liberal modeling
liberal_rf = NASCOTUS_fixed[NASCOTUS_fixed['presAffiliation'] == 0].copy()
liberal_rf.drop('presAffiliation', axis=1, inplace=True)
# one hot
X_lib = liberal_rf.drop('justicesDecision', axis=1)
y_lib = liberal_rf['justicesDecision']

encoder_lib = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_encoded_lib = encoder_lib.fit_transform(X_lib)
X_train_lib, X_test_lib, y_train_lib, y_test_lib = train_test_split(X_encoded_lib, y_lib, test_size=0.1, random_state=42)


# now I train random forest classifier
randfor_lib = RandomForestClassifier(n_estimators=110, class_weight = 'balanced', random_state=42) # n_estimators means the amount of trees 

randfor_lib.fit(X_train_lib, y_train_lib) #training the model 



# Making predictions for both train and test data
train_predictions_lib = randfor_lib.predict(X_train_lib)
test_predictions_lib = randfor_lib.predict(X_test_lib)

# Calculating accuracy for both train and test data
train_accuracy_lib = accuracy_score(y_train_lib, train_predictions_lib)
test_accuracy_lib = accuracy_score(y_test_lib, test_predictions_lib)

# Printing accuracies
print(f'Training Accuracy: {train_accuracy_lib}')
print(f'Test Accuracy: {test_accuracy_lib}')



importances_lib = randfor_lib.feature_importances_ #feature importances 


# now relating the encoded features back to the acutal ones
feature_names_lib = encoder_lib.get_feature_names_out(X_lib.columns)
original_feature_importance_lib = dict.fromkeys(X_lib.columns, 0)

for encoded_feature_lib, importance_lib in zip(feature_names_lib, importances_lib):
    original_feature_lib = encoded_feature_lib.split('_')[0]  # using default one hot encoding feature naming
    original_feature_importance_lib[original_feature_lib] += importance_lib

sorted_by_importance_lib = sorted(original_feature_importance_lib.items(), key=lambda x: x[1], reverse=True)

# printing the sorted features now and their importance
for feature_lib, importance_lib in sorted_by_importance_lib:
    print(f"{feature_lib}: {importance_lib}")


Training Accuracy: 0.6382192428344291
Test Accuracy: 0.6117556071152359
lawType: 0.41569001006796413
certReason: 0.4050576761949491
lcDispositionDirection: 0.1792523137370868


In [None]:
# 7 percent between these two which is ... interesting?