In [1]:
# Standard library
import re
import warnings

# Third-party
import numpy as np
import pandas as pd

# Show every row and column when a DataFrame is displayed (no truncation).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

In [4]:
# Directory prefix (with trailing slash) prepended to every input file name below.
file_path = "./data/"

In [None]:
# Load the family income workbook from the data directory and preview it.
family_income_path = file_path + "Family_Income.xlsx"
family_income = pd.read_excel(family_income_path)
family_income.head()

Unnamed: 0,STUDENT_ID,Family_Income
0,1,109770.95
1,2,58442.99
2,3,97125.87
3,4,55256.4
4,5,58187.16


In [None]:
# Load the 2012 credit rating workbook from the data directory and preview it.
credit_rating_path = file_path + "CreditRating Data 2012.xlsx"
credit_rating = pd.read_excel(credit_rating_path)
credit_rating.head()

Unnamed: 0,STUDENT_ID,Credit_Score_Raw,Credit_Score_AgeAdj
0,1,0.495227,0.767967
1,2,0.987649,0.529815
2,3,0.249315,0.589539
3,4,0.653183,0.805702
4,5,0.273099,0.415433


In [None]:
# Read the .pl file into memory, one list entry per line.
# Each entry in `lines` keeps its trailing newline.
with open(file_path + "enlist.pl", 'r') as file:
    lines = file.readlines()

# Preview the first 5 lines. Every entry already ends with "\n", so strip it for
# display; a bare print(line) would add a second newline and double-space the output.
for line in lines[:5]:
    print(line.rstrip("\n"))

% enlist(student,Service).

% student is enlisted in Service.



enlist(student40,fire_department).

enlist(student51,fire_department).



In [8]:
# Pull (student id, service) pairs out of the raw lines.
# Only lines shaped like `enlist(student<N>,<service>).` match; everything else is skipped.
enlist_pattern = re.compile(r'enlist\(student(\d+),(\w+)\)\.')
enlist_matches = (enlist_pattern.match(raw_line.strip()) for raw_line in lines)

# group(1) is the numeric student id (cast to int), group(2) is the text after the comma.
data = [(int(m.group(1)), m.group(2)) for m in enlist_matches if m]

# Convert to DataFrame
enlist = pd.DataFrame(data, columns=['STUDENT_ID', 'Service'])
enlist.head()

Unnamed: 0,STUDENT_ID,Service
0,40,fire_department
1,51,fire_department
2,109,fire_department
3,139,fire_department
4,148,fire_department


In [9]:
def parse_prolog_file(file_name, base_dir=None):
    """
    Parse a Prolog-style fact file into a two-column DataFrame.

    Lines shaped like ``predicate(student<N>,<label>).`` are kept; every other
    line (comments, blank lines) is skipped.

    Parameters
    ----------
    file_name : str
        Name of the file, resolved relative to ``base_dir``.
    base_dir : str, optional
        Directory that contains the file. When omitted, the notebook-level
        ``file_path`` is used.

    Returns
    -------
    pandas.DataFrame
        - ``STUDENT_ID``: the integer ``N`` from ``student<N>`` (e.g., 1000)
        - one column named after the predicate (e.g., 'enlist',
          'no_payment_due') holding the label (e.g., 'fire_department',
          'pos', 'neg')

        If the file mixes several predicates, all rows are kept and the column
        takes the name of the last predicate seen. If no line matches, the
        frame is empty and the column is named after the file's base name
        (e.g., 'enlist' for 'enlist.pl').
    """
    import os  # local import so the function does not rely on another cell's imports

    if base_dir is None:
        base_dir = file_path

    pattern = re.compile(r'(\w+)\(student(\d+),(\w+)\)\.')

    # Default column name: without it, a file with no matching line would hit an
    # unbound `function_name` at the return statement.
    function_name = os.path.splitext(os.path.basename(file_name))[0]

    parsed_data = []
    with open(os.path.join(base_dir, file_name), 'r') as f:
        for line in f:
            match = pattern.match(line.strip())
            if match:
                function_name = match.group(1)
                student_number = int(match.group(2))
                label = match.group(3)
                parsed_data.append((student_number, label))

    return pd.DataFrame(parsed_data, columns=['STUDENT_ID', function_name])

In [10]:
# Parse the enlistment facts with the shared parser and preview the result.
enlist_df = parse_prolog_file(file_name="enlist.pl")
enlist_df.head()

Unnamed: 0,STUDENT_ID,enlist
0,40,fire_department
1,51,fire_department
2,109,fire_department
3,139,fire_department
4,148,fire_department


In [11]:
# Parse the no_payment_due facts with the shared parser and preview the result.
nopaymentdue = parse_prolog_file(file_name="no_payment_due.pl")
nopaymentdue.head()

Unnamed: 0,STUDENT_ID,no_payment_due
0,1000,pos
1,999,pos
2,998,pos
3,996,pos
4,994,pos


In [12]:
# Row count of each loaded table, one per line, in load order.
for table in (family_income, credit_rating, enlist_df, nopaymentdue):
    print(len(table))

1000
1000
306
1000


In [13]:
# Left-join credit_rating onto family_income by STUDENT_ID,
# keeping every row of family_income.
merged_df = family_income.merge(credit_rating, how='left', on='STUDENT_ID')
merged_df.head()

Unnamed: 0,STUDENT_ID,Family_Income,Credit_Score_Raw,Credit_Score_AgeAdj
0,1,109770.95,0.495227,0.767967
1,2,58442.99,0.987649,0.529815
2,3,97125.87,0.249315,0.589539
3,4,55256.4,0.653183,0.805702
4,5,58187.16,0.273099,0.415433


In [14]:
# Left-join the enlistment labels by STUDENT_ID; students with no row in
# enlist_df get a missing value in the new column.
merged_df = merged_df.merge(enlist_df, how='left', on='STUDENT_ID')
merged_df.head()

Unnamed: 0,STUDENT_ID,Family_Income,Credit_Score_Raw,Credit_Score_AgeAdj,enlist
0,1,109770.95,0.495227,0.767967,
1,2,58442.99,0.987649,0.529815,
2,3,97125.87,0.249315,0.589539,
3,4,55256.4,0.653183,0.805702,marines
4,5,58187.16,0.273099,0.415433,


In [15]:
# Left-join the no_payment_due labels by STUDENT_ID.
merged_df = merged_df.merge(nopaymentdue, how='left', on='STUDENT_ID')
merged_df.head()

Unnamed: 0,STUDENT_ID,Family_Income,Credit_Score_Raw,Credit_Score_AgeAdj,enlist,no_payment_due
0,1,109770.95,0.495227,0.767967,,pos
1,2,58442.99,0.987649,0.529815,,pos
2,3,97125.87,0.249315,0.589539,,neg
3,4,55256.4,0.653183,0.805702,marines,pos
4,5,58187.16,0.273099,0.415433,,neg
