In [1]:
import pandas as pd



In [15]:
# Path to the raw input CSV, resolved relative to the current working
# directory. Point this at wherever your copy of the file lives.
file_path = '../Data/Raw/cox-violent-parsed_filt.csv'

# Parse the CSV into a DataFrame using pandas' default settings.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as a quick sanity check on the load.
df.head(n=5)

Unnamed: 0,id,name,first,last,sex,dob,age,age_cat,race,juv_fel_count,...,vr_offense_date,vr_charge_desc,type_of_assessment,decile_score,score_text,screening_date,v_type_of_assessment,v_decile_score,v_score_text,event
0,1,miguel hernandez,miguel,hernandez,Male,18/04/1947,69,Greater than 45,Other,0,...,,,Risk of Recidivism,1,Low,14/08/2013,Risk of Violence,1,Low,0
1,2,michael ryan,michael,ryan,Male,6/2/1985,31,25 - 45,Caucasian,0,...,,,Risk of Recidivism,5,Medium,31/12/2014,Risk of Violence,2,Low,0
2,3,kevon dixon,kevon,dixon,Male,22/01/1982,34,25 - 45,African-American,0,...,5/7/2013,Felony Battery (Dom Strang),Risk of Recidivism,3,Low,27/01/2013,Risk of Violence,1,Low,1
3,4,ed philo,ed,philo,Male,14/05/1991,24,Less than 25,African-American,0,...,,,Risk of Recidivism,4,Low,14/04/2013,Risk of Violence,3,Low,0
4,5,marcu brown,marcu,brown,Male,21/01/1993,23,Less than 25,African-American,0,...,,,Risk of Recidivism,8,High,13/01/2013,Risk of Violence,6,Medium,0


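**Note:** the feature names in this table (`Case_ID`, `Agency_Text`, `RawScore`, …) do not match the column names of the DataFrame loaded above (`id`, `name`, `decile_score`, …); the table appears to describe a different export of the assessment data — TODO confirm which file it documents.

| **Feature**                | **Explanation**                                                                 | **Possible Values (Range)**                                          | **Example**                       |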
|----------------------------|---------------------------------------------------------------------------------|----------------------------------------------------------------------|-----------------------------------|
| **Case_ID**                | Unique identifier for each case                                                 | Numeric or alphanumeric codes                                        | 51950                             |
| **Agency_Text**            | The agency handling the case                                                    | Text labels (e.g., “PRETRIAL”)                                         | PRETRIAL                          |
| **LastName**               | Defendant’s last name                                                           | Text                                                                 | Fisher                            |
| **FirstName**              | Defendant’s first name                                                          | Text                                                                 | Kevin                             |
| **MiddleName**             | Defendant’s middle name (if available)                                          | Text or NULL                                                         | NULL                              |
| **Sex_Code_Text**          | Defendant’s gender                                                              | “Male” or “Female”                                                   | Male                              |
| **Ethnic_Code_Text**       | Defendant’s race/ethnicity                                                      | Categories such as “Caucasian”, “African-American”, etc.             | Caucasian                         |
| **DateOfBirth**            | Defendant’s date of birth                                                       | Date format (month/day/year)                                           | 12/5/1992                         |
| **ScaleSet_ID**            | Identifier for the assessment scale set used                                    | Numeric (e.g., an integer code)                                      | 22                                |
| **ScaleSet**               | Name/description of the assessment scale                                        | Text                                                                 | Risk and Prescreen                |
| **AssessmentReason**       | Reason for conducting the COMPAS assessment                                     | Text (e.g., “Intake”)                                                  | Intake                            |
| **Language**               | Defendant’s primary language                                                    | Text (e.g., “English”)                                                 | English                           |
| **LegalStatus**            | Defendant’s legal status at the time of assessment                              | Text (e.g., “Pretrial”)                                                | Pretrial                          |
| **CustodyStatus**          | Custody status of the defendant                                                 | Text (e.g., “Jail Inmate”)                                             | Jail Inmate                       |
| **MaritalStatus**          | Defendant’s marital status                                                      | Categories like “Single”, “Married”, “Significant Other”               | Single                            |
| **Screening_Date**         | Date when the screening/assessment was conducted                                | Date and time format                                                 | 1/1/2013 0:00                     |
| **RecSupervisionLevel**    | Numerical indicator of recommended supervision level                            | Numeric values (e.g., 1, 2, 4)                                         | 1                                 |
| **RecSupervisionLevelText**| Textual description of the recommended supervision level                        | Text categories such as “Low”, “Medium”, “High”                        | Low                               |
| **Scale_ID**               | Identifier for the specific assessment scale used                               | Numeric (varies per row; e.g., 7, 8, 18)                               | 7                                 |
| **DisplayText**            | Description of the risk category assessed (e.g., type of risk)                    | Text (e.g., “Risk of Violence”, “Risk of Recidivism”, “Risk of Failure to Appear”) | Risk of Violence                  |
| **RawScore**               | Raw numerical score produced by the COMPAS assessment                           | Numeric (can be negative or positive, e.g., -2.08, -1.06, 15)          | -2.08                           |
| **DecileScore**            | Risk score expressed as a decile (ranking from 1 to 10)                          | Integer from 1 to 10                                                   | 4                                 |
| **ScoreText**              | Textual interpretation of the risk score                                        | Text (e.g., “Low”, “High”)                                             | Low                               |
| **AssessmentType**         | Type of assessment conducted                                                    | Text (e.g., “New”)                                                     | New                               |
| **IsCompleted**            | Indicator of whether the assessment was completed                              | Binary (1 for yes, 0 for no)                                           | 1                                 |
| **IsDeleted**              | Indicator of whether the record has been deleted                               | Binary (0 for active, 1 for deleted)                                   | 0                                 |


In [24]:
# Columns treated as irrelevant for the rest of the notebook. Every name
# listed here must exist in `df`: DataFrame.drop raises KeyError on an
# unknown label by default.
columns_to_drop = [
    'id',
    'name',
    'first',
    'last',
    'dob',
    'violent_recid',
    'c_jail_in',
    'c_jail_out',
    'c_days_from_compas',
    'c_charge_desc',
    'r_offense_date',
    'r_jail_in',
    'vr_offense_date',
    'vr_charge_desc',
    'type_of_assessment',
    'screening_date',
    'v_type_of_assessment',
    'v_decile_score',
    'v_score_text',
    'age_cat',
    'is_violent_recid',
    'vr_charge_degree',
]

# drop() returns a new DataFrame, so the originally loaded `df` is left
# untouched and this cell can be re-run safely.
df2 = df.drop(labels=columns_to_drop, axis='columns')

# Preview the first five rows of the reduced frame (optional).
df2.head(n=5)

Unnamed: 0,sex,age,race,juv_fel_count,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_charge_degree,is_recid,r_charge_degree,r_days_from_arrest,r_charge_desc,decile_score,score_text,event
0,Male,69,Other,0,0,0,0,-1.0,(F3),0,,,,1,Low,0
1,Male,31,Caucasian,0,0,0,0,,,-1,,,,5,Medium,0
2,Male,34,African-American,0,0,0,0,-1.0,(F3),1,(F3),,Felony Battery (Dom Strang),3,Low,1
3,Male,24,African-American,0,0,1,4,-1.0,(F3),1,(M1),0.0,Driving Under The Influence,4,Low,0
4,Male,23,African-American,0,1,0,1,,(F3),0,,,,8,High,0


In [20]:
# Feature matrix: every column of df2 except `decile_score`.
# NOTE(review): `score_text` is still among the features. In the preview
# rows it tracks `decile_score` (1/3/4 -> Low, 5 -> Medium, 8 -> High); if
# it is derived from the score, keeping it here leaks the target — confirm
# whether it should be dropped as well.
features = df2.drop(columns=['decile_score'])

# Target series: the `decile_score` column on its own.
target = df2['decile_score']
