In [None]:
# library importer 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [7]:
# Read the dataset from disk and echo it as a quick sanity check.
NASCOTUS_fixed = pd.read_csv(filepath_or_buffer='NASCOTUS_fixed.csv')
print(NASCOTUS_fixed)

       lawType  lcDispositionDirection  certReason  presAffiliation  \
0            5                       1          11                1   
1            3                       2          12                1   
2            1                       1          11                1   
3            3                       1           2                1   
4            3                       2          12                1   
...        ...                     ...         ...              ...   
64199        6                       2          10                0   
64200        3                       1          10                0   
64201        1                       2           1                0   
64202        3                       1           1                0   
64203        3                       1          12                0   

       justicesDecision  
0                     0  
1                     0  
2                     1  
3                     0  
4                

In [8]:
# Split the frame into the outcome to predict (y) and the predictors (X).
# Only presAffiliation is used as a predictor here; the other columns are
# dropped to see how the accuracy scores change.
y = NASCOTUS_fixed['justicesDecision']   # target variable
X = NASCOTUS_fixed[['presAffiliation']]  # double brackets keep X a DataFrame

# All columns of X are treated as categorical and one-hot encoded.
categorical_features = list(X.columns)
# drop='first' removes one dummy level per feature (avoids the dummy-variable trap).
categorical_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)]
)

# TODO: ask about this
# probability=True turns on probability estimates, which certain metrics need.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', SVC(probability=True)),
    ]
)

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Train the pipeline with class weighting switched on for the SVC step.
# 'balanced' weights each class inversely to its frequency in the training labels.
# set_params returns the pipeline itself, so the fitted pipeline is still the
# value displayed by this cell.
clf.set_params(**{'classifier__class_weight': 'balanced'}).fit(X_train, y_train)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(drop='first'),
                                                  ['presAffiliation'])])),
                ('classifier', SVC(class_weight='balanced', probability=True))])

In [11]:
# Score the fitted pipeline on both splits up front; reporting follows below.
y_pred = clf.predict(X_test)
y_train_pred = clf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Held-out performance: headline accuracy, then the per-class breakdown.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


# Training accuracy, for comparison with the held-out accuracy printed above.
print("Training Accuracy:", train_accuracy)


Accuracy: 0.5755782259948602
Confusion Matrix:
[[6357 4071]
 [1379 1034]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.61      0.70     10428
           1       0.20      0.43      0.28      2413

    accuracy                           0.58     12841
   macro avg       0.51      0.52      0.49     12841
weighted avg       0.71      0.58      0.62     12841

Training Accuracy: 0.5740902984638748


In [None]:
# Notes:
#     - In the model-training cell I set classifier__class_weight to 'balanced' so that the SVC weights each class inversely to its frequency, compensating for the imbalance between the two outcome classes.