In [2]:
#importing necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [3]:
# importing and separating data
NASCOTUS_fixed = pd.read_csv('NASCOTUS_fixed.csv')

# making X data and Y data
X = NASCOTUS_fixed.drop('justicesDecision', axis=1)
y = NASCOTUS_fixed['justicesDecision']

In [4]:
# have to do preprocessing and one hot encode since the variables are categorical
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X)

In [5]:
# then I split data 
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.1, random_state=42)

In [6]:
# now I train random forest classifier
randfor = RandomForestClassifier(n_estimators=1, class_weight = 'balanced', random_state=42) # n_estimators means the amount of trees 

randfor.fit(X_train, y_train) #training the model 


RandomForestClassifier(class_weight='balanced', n_estimators=1, random_state=42)

In [7]:
# now I a making predictions and classifying the model 

predictions = randfor.predict(X_test)

# accuracy calculation
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.4989876966204641


In [8]:
importances = randfor.feature_importances_ #feature importances 


# now relating the encoded features back to the acutal ones
feature_names = encoder.get_feature_names_out(X.columns)
original_feature_importance = dict.fromkeys(X.columns, 0)

for encoded_feature, importance in zip(feature_names, importances):
    original_feature = encoded_feature.split('_')[0]  # using default one hot encoding feature naming
    original_feature_importance[original_feature] += importance

sorted_by_importance = sorted(original_feature_importance.items(), key=lambda x: x[1], reverse=True)

# printing the sorted features now and their importance
for feature, importance in sorted_by_importance:
    print(f"{feature}: {importance}")


certReason: 0.3996515190966594
lawType: 0.37829801360417986
lcDispositionDirection: 0.15270551360899132
presAffiliation: 0.06934495369016945


In [9]:
# Predict on the training data
train_predictions = randfor.predict(X_train)

# Calculate accuracy on the training data
train_accuracy = accuracy_score(y_train, train_predictions)
print(f'Training Accuracy: {train_accuracy}')

Training Accuracy: 0.5072080023536334


In [55]:
# notes
    This model is saying that certReason is the most important feature, then lawType, then lcDispositionDirection,
    then presAffiliation. The numbers are percentages. 

IndentationError: unexpected indent (4292286074.py, line 2)

In [11]:
# now we need to try this with witholding data right. So basically your train needs to just be a certain subset of cases (partition by caseId, then your test needs to be the rest)

data = pd.read_csv('all_data.csv')


# gettign a list of unique_caseIds
unique_caseIds = data['caseId'].unique().tolist()

# randomizing this list
import random

random.shuffle(unique_caseIds)



['2021-051',
 '2021-050',
 '2021-010',
 '2021-049',
 '2021-009',
 '2021-053',
 '2021-048',
 '2021-008',
 '2021-011',
 '2021-047',
 '2021-007',
 '2020-039',
 '2021-055',
 '2021-046',
 '2021-043',
 '2021-006',
 '2021-012',
 '2021-044',
 '2021-045',
 '2021-005',
 '2020-040',
 '2021-056',
 '2021-041',
 '2021-004',
 '2020-068',
 '2021-013',
 '2021-040',
 '2021-003',
 '2020-067',
 '2020-042',
 '2021-057',
 '2021-042',
 '2021-039',
 '2020-070',
 '2021-014',
 '2021-035',
 '2021-038',
 '2021-001',
 '2020-043',
 '2021-058',
 '2021-037',
 '2020-074',
 '2020-071',
 '2021-015',
 '2021-036',
 '2020-073',
 '2021-002',
 '2020-044',
 '2021-059',
 '2020-072',
 '2020-061',
 '2020-029',
 '2021-016',
 '2021-031',
 '2021-034',
 '2020-025',
 '2020-046',
 '2021-060',
 '2021-033',
 '2020-062',
 '2020-030',
 '2020-001',
 '2021-032',
 '2021-022',
 '2020-026',
 '2020-047',
 '2021-061',
 '2021-021',
 '2020-064',
 '2020-031',
 '2020-002',
 '2021-026',
 '2021-023',
 '2020-027',
 '2020-049',
 '2021-062',
 '2021-029',

In [9]:
duplicates = eighty.duplicated(subset=['caseId'])

# Checking if there are any duplicates
if duplicates.any():
    print("There are duplicates in the 'caseId' column.")
else:
    print("There are no duplicates in the 'caseId' column.")

There are duplicates in the 'caseId' column.
