In [1]:
#importing necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [2]:
# importing and separating data
NASCOTUS_fixed = pd.read_csv('NASCOTUS_fixed.csv')

# making X data and Y data
X = NASCOTUS_fixed.drop('justicesDecision', axis=1)
y = NASCOTUS_fixed['justicesDecision']

In [3]:
# have to do preprocessing and one hot encode since the variables are categorical
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X)

In [4]:
# then I split data 
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.1, random_state=42)

In [5]:
# now I train random forest classifier
randfor = RandomForestClassifier(n_estimators=1, class_weight = 'balanced', random_state=42) # n_estimators means the amount of trees 

randfor.fit(X_train, y_train) #training the model 


RandomForestClassifier(class_weight='balanced', n_estimators=1, random_state=42)

In [6]:
# now I a making predictions and classifying the model 

predictions = randfor.predict(X_test)

# accuracy calculation
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.4989876966204641


In [7]:
importances = randfor.feature_importances_ #feature importances 


# now relating the encoded features back to the acutal ones
feature_names = encoder.get_feature_names_out(X.columns)
original_feature_importance = dict.fromkeys(X.columns, 0)

for encoded_feature, importance in zip(feature_names, importances):
    original_feature = encoded_feature.split('_')[0]  # using default one hot encoding feature naming
    original_feature_importance[original_feature] += importance

sorted_by_importance = sorted(original_feature_importance.items(), key=lambda x: x[1], reverse=True)

# printing the sorted features now and their importance
for feature, importance in sorted_by_importance:
    print(f"{feature}: {importance}")


certReason: 0.3996515190966594
lawType: 0.37829801360417986
lcDispositionDirection: 0.15270551360899132
presAffiliation: 0.06934495369016945


In [8]:
# Predict on the training data
train_predictions = randfor.predict(X_train)

# Calculate accuracy on the training data
train_accuracy = accuracy_score(y_train, train_predictions)
print(f'Training Accuracy: {train_accuracy}')

Training Accuracy: 0.5072080023536334


In [9]:
# notes
    This model is saying that certReason is the most important feature, then lawType, then lcDispositionDirection,
    then presAffiliation. The numbers are percentages. 

IndentationError: unexpected indent (4292286074.py, line 2)

In [11]:
# dividing so there are no instances of caseIds informing the model.

data_uncleaned = pd.read_csv('all_data.csv')
data_uncleaned = data_uncleaned.drop(['justiceName', 'fullJusticeName', 'dateDecision'], axis=1)
data_uncleaned.dropna(axis=0, inplace=True)

# one hot encoding for every column except caseId
categorical_columns = [col for col in data_uncleaned.columns if col not in ['caseId', 'justicesDecision']]


# categorical columns
data = pd.get_dummies(data_uncleaned, columns=categorical_columns)


unique_caseIds = data['caseId'].unique().tolist()

# randomizing this list
import random

random.shuffle(unique_caseIds)

eighty_percent_length = int(len(unique_caseIds) * 0.6)


eighty_subset = unique_caseIds[:eighty_percent_length]
twenty_subset = unique_caseIds[eighty_percent_length:]

# making eighty/20 which will have that first 80% of caseIds, last 20%. you get it.
eighty = data[data['caseId'].isin(eighty_subset)].copy()
twenty = data[data['caseId'].isin(twenty_subset)].copy()
# have to reset index
eighty.reset_index(drop=True, inplace=True)
twenty.reset_index(drop=True, inplace=True)

# NOW U CAN proceed.


In [12]:
eighty = eighty.dropna()
twenty = twenty.dropna()

In [13]:
# training the model part two


training_set_x = eighty.drop(['justicesDecision', 'caseId'], axis=1)
training_set_y = eighty['justicesDecision']

testing_set_x =  twenty.drop(['justicesDecision', 'caseId'], axis=1)
testing_set_y = twenty['justicesDecision']

# make model
randfor2 = RandomForestClassifier(n_estimators=3, random_state=42)
randfor2.fit(training_set_x, training_set_y) #training the model 


predictions = randfor2.predict(testing_set_x)


accuracy = accuracy_score(testing_set_y, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 1.0


In [14]:
# doing feature importances
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd


feature_importances = randfor2.feature_importances_


original_columns = []
for col in training_set_x.columns:
    if '_' in col:
        original_col = col.split('_')[0]
        if original_col not in original_columns:
            original_columns.append(original_col)
    else:
        original_columns.append(col)

# sorting by importance
importance_dict = {feature_name: importance for importance, feature_name in zip(feature_importances, original_columns)}
sorted_importance_dict = dict(sorted(importance_dict.items(), key=lambda item: item[1], reverse=True))

# Print feature importances in order of importance
print("Feature Importances:")
for feature_name, importance in sorted_importance_dict.items():
    print(f"{feature_name}: {importance}")

Feature Importances:
decisionType: 0.0
term: 0.0
naturalCourt: 0.0
chief: 0.0
petitioner: 0.0
respondent: 0.0
caseSource: 0.0
certReason: 0.0
lcDispositionDirection: 0.0
declarationUncon: 0.0
caseDisposition: 0.0
partyWinning: 0.0
precedentAlteration: 0.0
issue: 0.0
issueArea: 0.0
decisionDirection: 0.0
decisionDirectionDissent: 0.0
authorityDecision1: 0.0
authorityDecision2: 0.0
lawType: 0.0
lawSupp: 0.0
justice: 0.0
vote: 0.0
direction: 0.0
majority: 0.0
presAppointed: 0.0
presAffiliation: 0.0
