## Reading in files

In [None]:
# Google Drive file IDs of the four Excel workbooks used by this notebook.
file_idn = '1DsX8MXyVJg5OWJn3cWkA-Onx-IaBmzNT'
file_id_train = '1Fycjk_nlRbnlYGvDd2NfNSqnVk0daHie'
file_id_test = '11l896qUmNpJ4zR2GzJ9y-w1ydSHIB3OB'
file_id_dict = '1XyhjzSuZu3-Rego09BGdwznJcuo-4sBk'

# Fetch each file into the working directory with gdown.
# NOTE(review): the output names end in ".xslx" (not ".xlsx"). The loading cell
# reads these exact names, so rename both places together if this is corrected.
!gdown {file_idn} -O novice.xslx
!gdown {file_id_train} -O train.xslx
!gdown {file_id_test} -O test.xslx
!gdown {file_id_dict} -O dict.xslx     # this block of code should download the relevant files.


Downloading...
From: https://drive.google.com/uc?id=1DsX8MXyVJg5OWJn3cWkA-Onx-IaBmzNT
To: /content/novice.xslx
100% 1.80M/1.80M [00:00<00:00, 17.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Fycjk_nlRbnlYGvDd2NfNSqnVk0daHie
To: /content/train.xslx
100% 25.3M/25.3M [00:00<00:00, 69.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=11l896qUmNpJ4zR2GzJ9y-w1ydSHIB3OB
To: /content/test.xslx
100% 2.12M/2.12M [00:00<00:00, 30.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1XyhjzSuZu3-Rego09BGdwznJcuo-4sBk
To: /content/dict.xslx
100% 40.9k/40.9k [00:00<00:00, 50.3MB/s]


In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the four downloaded workbooks into DataFrames. The ".xslx" file names
# must match the -O targets used in the gdown download cell.
# The dictionary workbook uses its first column as the index (index_col=0).
data_dct = pd.read_excel('dict.xslx', index_col=0)
df_train = pd.read_excel('train.xslx')
df_test = pd.read_excel('test.xslx')
df_novice = pd.read_excel('novice.xslx')  # this reads the files into DFs properly.

## Replacing all acronyms

In [None]:
import pandas as pd

# Assuming df_train is already defined and contains the 'Match Support Contact Notes' column

# Case-sensitive replacements (role words and anonymised-name placeholders).
# Plural keys are listed BEFORE their singular prefixes so that an
# order-dependent, sequential regex replacement does not rewrite "Littles"
# to "childs" before the 'Littles' entry gets a chance to match.
case_sensitive_mappings = {
    'Bigs': 'mentors',
    'Littles': 'children',
    'Big': 'mentor',
    'Little': 'child',
    'B_first_name': 'mentor',
    'B_last_name': 'mentor',
    'L_first_name': 'child',
    'L_last_name': 'child',
}

# Case-insensitive replacements (matched as whole words by the replacement cell).
# Every key appears exactly once: a repeated key in a dict literal is silently
# overridden by its last occurrence, which hides what is really applied.
word_mappings = {
    # --- agency / staff roles -> 'team' ---
    'msp': 'match support place',
    'msw': 'team',
    'msc': 'team',
    'mst': 'team',
    'ms': 'team',
    'mss': 'team',
    'emc': 'team',
    'mec': 'team',
    'bbbs': 'team',
    'bbb': 'team',
    'bbs': 'team',
    'bsw': 'team',
    'mc': 'team',
    # NOTE(review): 'pc' used to be listed twice ('parent', then 'team'); the
    # later entry won, so 'team' is the value that has actually been applied.
    # Confirm which expansion is intended.
    'pc': 'team',
    'match engagement coordinator': 'team',
    # The longer organisation name must stay ahead of its shorter prefix.
    'big brothers big sisters of the greater twin cities': 'team',
    'big brothers big sisters': 'team',

    # --- parent / guardian ---
    'pg': 'parent',
    'guardian': 'parent',

    # --- child ---
    'son': 'child',
    'Ls': 'child',
    'Lb': 'child',

    # --- mentor ---
    'bs': 'mentor',
    'bb': 'mentor',
    'bcs': 'mentor',
    'bss': 'mentor',
    'bd': 'mentor',
    'bl': 'mentor',

    # --- general abbreviations ---
    'msg': 'message',
    'lol': 'laugh',
    'mia': 'missing in action',
    'btw': 'by the way',
    'bf': 'boyfriend',
    'bff': 'best friend',
    'lib': 'library',
    'lmk': 'let me know',
    'appt': 'appointment',
    'apt': 'apartment',
    'asap': 'as soon as possible',
    'convo': 'conversation',
    'fyi': 'for your information',
    'mins': 'minutes',
    'mtg': 'meeting',
    'pls': 'please',
    'thx': 'thanks',
    'ty': 'thank you',
    # 'w/o' must precede 'w/': otherwise the shorter key consumes the "w/" of
    # "w/o" first and leaves "witho".
    'w/o': 'without',
    'w/': 'with',
    'yr': 'year',
    'yrs': 'years',
    'b/c': 'because',
    'b/w': 'between',
    'info': 'information',
    'intro': 'introduction',
    'comm': 'communication',
    'ofc': 'of course',
    'idk': "I don't know",
    'imo': 'in my opinion',
    'rn': 'right now',
    # NOTE(review): 'bc' used to be listed twice ('mentor', then 'because'); the
    # later entry won, so 'because' is the value that has actually been applied.
    # Confirm which expansion is intended.
    'bc': 'because',
    'tmrw': 'tomorrow',
    'def': 'definitely',
}





In [None]:
# Column that holds the free-text support notes.
NOTES_COLUMN = 'Match Support Contact Notes'


def replace_terms(notes, mapping, ignore_case=False, whole_word=False):
    """Replace every key of `mapping` found in `notes` by its value, in one pass.

    Parameters
    ----------
    notes : pandas.Series
        Values to rewrite. Non-string entries (e.g. NaN) are passed through.
    mapping : dict[str, str]
        Literal term -> replacement. Keys are escaped, so characters such as
        '/' or '.' are never interpreted as regex syntax.
    ignore_case : bool
        Match keys regardless of letter case.
    whole_word : bool
        Only match a key that is not glued to a neighbouring word character.

    Returns
    -------
    pandas.Series
        A new Series; `notes` itself is not modified.
    """
    terms = sorted((t for t in mapping if t), key=len, reverse=True)
    if not terms:
        return notes.copy()

    def as_pattern(term):
        escaped = re.escape(term)
        if not whole_word:
            return escaped
        # Lookarounds instead of \b: \b needs a word character on one side, so
        # r'\bw/\b' can never match "w/ " (slash followed by a space). The
        # boundary is only required on a side where the key itself starts/ends
        # with a word character.
        lead = r'(?<!\w)' if re.match(r'\w', term[0]) else ''
        trail = r'(?!\w)' if re.match(r'\w', term[-1]) else ''
        return lead + escaped + trail

    # Longest keys first so 'w/o' wins over its prefix 'w/' and 'Littles' over
    # 'Little' at the same position.
    pattern = re.compile(
        '|'.join(as_pattern(t) for t in terms),
        re.IGNORECASE if ignore_case else 0,
    )
    if ignore_case:
        lookup = {key.casefold(): value for key, value in mapping.items()}
    else:
        lookup = dict(mapping)

    def substitute(match):
        found = match.group(0)
        key = found.casefold() if ignore_case else found
        # Fall back to the matched text rather than raising if an unusual
        # Unicode case pairing matched but does not fold onto a key.
        return lookup.get(key, found)

    return notes.map(lambda value: pattern.sub(substitute, value) if isinstance(value, str) else value)


notes = df_train[NOTES_COLUMN]

# Apply case-insensitive replacements first: they include phrases such as
# "big brothers big sisters", which would no longer be recognisable once the
# case-sensitive pass has rewritten every "Big" to "mentor".
notes = replace_terms(notes, word_mappings, ignore_case=True, whole_word=True)

# Apply case-sensitive replacements (plain substring matches).
# NOTE(review): without word boundaries "Bigger" still becomes "mentorger";
# confirm whether whole-word matching is wanted here as well.
notes = replace_terms(notes, case_sensitive_mappings)

df_train[NOTES_COLUMN] = notes


In [None]:
# Display updated DataFrame
# The pd.set_option calls change pandas' display settings for the whole
# session (they remain in effect for every later cell): the row and column
# display limits are lifted by setting them to None.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 0)
# Preview the first 10 notes after the replacements.
df_train[['Match Support Contact Notes']].head(10)

Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


Unnamed: 0,Match Support Contact Notes
0,Question: Activities: Answer: See te...
1,Question: Activities: Answer: See te...
2,Question: Activities: Answer: Match ...
3,Question: Activities: Answer: team a...
4,Question: Activities: Answer: team a...
5,Question: Activities: Answer: team a...
6,Question: Activities: Answer: team S...
7,Question: Activities: Answer: team a...
8,Question: Activities: Answer: team a...
9,


# Clean

In [None]:
def reformat_questions(text):
    """Collapse the "Question: ... Answer: ..." segments of a note.

    The note is cut in front of every "Question:" (case-insensitive). Each
    question segment is rewritten as ``<question><answer>`` (no separator is
    inserted) and is dropped entirely when its answer is missing, is just "."
    or "-", or contains a "see ... notes" reference. Text that is not part of
    a question segment is kept as is. The kept pieces are joined with single
    spaces.

    Parameters
    ----------
    text : object
        A note. Non-string values (NaN, None, numbers) and blank strings are
        returned unchanged.

    Returns
    -------
    object
        The reformatted note, or `text` itself when it is not a non-blank string.
    """
    # isinstance already covers NaN/None, and unlike pd.isna it cannot produce
    # an ambiguous truth value for array-like input.
    if not isinstance(text, str) or not text.strip():
        return text

    kept = []

    # Zero-width split: every piece after the first starts with "Question:".
    for segment in re.split(r'(?i)(?=Question:)', text):
        segment = segment.strip()

        if not segment:
            # A note that starts with "Question:" yields an empty first piece;
            # keeping it would put a stray leading space into the result.
            continue

        if not segment.lower().startswith("question:"):
            kept.append(segment)  # Keep non-question text unchanged
            continue

        # group(1): question text up to the first "Answer:";
        # group(2): everything after it (None when there is no answer).
        qa_match = re.search(r'(?i)Question:\s*(.*?)(?:\s*Answer:\s*(.*))?$', segment, re.DOTALL)
        if not qa_match:
            continue

        question = qa_match.group(1).strip()
        answer = qa_match.group(2).strip() if qa_match.group(2) else ""

        # Skip empty or placeholder answers and "see (team) notes" references.
        if answer and answer not in {".", "-", ''} and not re.search(r'see\s*\.{0,3}\s*(?:team)?\s*notes', answer, re.IGNORECASE):
            kept.append(f"{question}{answer}")

    return ' '.join(kept)  # Reconstruct the text


In [None]:
# Rewrite every note with reformat_questions (element-wise); the column is
# overwritten in place.
df_train['Match Support Contact Notes'] = df_train['Match Support Contact Notes'].apply(reformat_questions)

In [None]:
# Preview the first 20 notes after reformatting.
df_train[['Match Support Contact Notes']].head(20)

Unnamed: 0,Match Support Contact Notes
0,team Notes:Match Closing with mentor via emai...
1,team Notes:See communication log for details ...
2,Activities:Match Closed. team Notes:Match Clo...
3,Activities:team asked about activities. mento...
4,Activities:team asked about activities. mento...
5,Activities:team asked about activities. mento...
6,Activities:team Summer Picnic Support Form ch...
7,Activities:team asked about activities. mento...
8,Activities:team asked about activities. mento...
9,


In [None]:
# Boilerplate fragments removed from every note. They are exact,
# case-sensitive substrings and are removed in the order listed here.
unwanted_phrases = [
    'team asked about activities.',
    'activities.',
    "Question:",
    "Answer:",
    "see team notes",
    "Activities",
    "Child Safety",
    "Child Development",
    "Child/Volunteer Relationship development",
    "Relationship with team",
    "Parent/Volunteer Concerns",
    "Other Comments-List progress/activities in school and in JJ System",
    "team Notes",
    'Match Closed.',
    'N/A',
    ':',
]


def clean_text(text):
    """Strip boilerplate from a note and keep 'X'-marked answers when present.

    Every entry of `unwanted_phrases` is deleted from the text. If the result
    contains a capital 'X' directly followed by letters/whitespace, only those
    runs (without the 'X') are returned, joined by ", "; otherwise the stripped
    text is returned. Non-string input is returned unchanged.

    NOTE(review): the 'X' pattern matches ANY capital X followed by letters,
    not only checkbox markers, so a note mentioning e.g. "Xbox" is reduced to
    the text after that X. Confirm the form format before relying on this.
    """
    if not isinstance(text, str):
        return text

    # Remove unwanted phrases
    for boilerplate in unwanted_phrases:
        text = text.replace(boilerplate, "")

    # Extract only the answers marked with 'X'
    marked_answers = re.findall(r'X([A-Za-z\s]+)', text)
    if marked_answers:
        return ', '.join(marked_answers)
    return text

# Apply clean_text to every note (element-wise); the column is overwritten in place.
df_train['Match Support Contact Notes'] = df_train['Match Support Contact Notes'].apply(clean_text)

In [None]:
import numpy as np

# Trim surrounding whitespace and turn empty notes into NaN.
# astype(str) on its own would also convert missing values into the literal
# text 'nan', which the "" -> NaN replacement cannot catch; entries that were
# missing before the conversion are therefore masked back to NaN.
notes_before_strip = df_train['Match Support Contact Notes']
notes_stripped = notes_before_strip.astype(str).str.strip().where(notes_before_strip.notna())
df_train['Match Support Contact Notes'] = notes_stripped.replace("", np.nan)

In [None]:
# Preview the first 20 notes after cleaning.
df_train[['Match Support Contact Notes']].head(20)

Unnamed: 0,Match Support Contact Notes
0,"Match Closing with mentor via email Hi Mai, Th..."
1,See communication log for details on Email com...
2,
3,mentor said this month has been busy for her; ...
4,mentor said they went to the movies and to the...
5,mentor said they went to missing in action. Th...
6,"Four, No b, Try something new that you have ne..."
7,mentor said they met twice where they went bow...
8,mentor said they recently met to go to the mov...
9,


In [None]:
# Distinct values of the 'Match ID 18Char' column in the training data.
df_train['Match ID 18Char'].unique()

array(['a1v2J0000028pRvQAI', 'a1v2J000002uR0JQAU', 'a1v2J0000027NsOQAU',
       ..., 'a1vUX000001SmPJYA0', 'a1vUX000001PyJlYAK',
       'a1vUX000001UAe5YAG'], dtype=object)

In [None]:
# Notes of a single match ID, ordered by 'Completion Date'.
df_train[df_train['Match ID 18Char']=='a1v2J0000028pRvQAI'].sort_values(by='Completion Date')[['Match Support Contact Notes']]

Unnamed: 0,Match Support Contact Notes
7,team asked about activities. mentor said they ...
4,team asked about activities. mentor said they ...
3,team asked about activities. mentor said this ...
5,team asked about activities. mentor said they ...
6,"Four, No b, Try something new that you have ne..."
8,team asked about activities. mentor said they ...
1,See communication log for details on Email com...
0,"Match Closing with mentor via email Hi Mai, Th..."
2,
9,


In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Zero-shot classifier built on the facebook/bart-large-mnli checkpoint; the
# model and tokenizer files are downloaded when this cell first runs.
pipe = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Full text of the note in the row labelled 3.
df_train['Match Support Contact Notes'][3]

'mentor said this month has been busy for her; therefore, they have not gotten together. They plan to go to the Nickeleon Universe this weekend. No child safety concerns reported.   match relationship. mentor said it is good. team checked on match plan goals find opportunities to be successful- bowling activity where they both started out "rocky" but later learned how to bowl; keep trying when challenged- they didn\'t give up on the bowling experience; learn about college & careers- child has shared a couple of things she might like to do in the future when she is older, but nothing solid yet; speak up for what?s important- match talks about what they like to do and what they value as important to them; volunteer/help others- in progress, team suggested some volunteer ideas; care for neighborhood and school- mentor reported that child cares about her family, likes her friends, and enjoys school; learn a new artistic skill- in progress; explore a new form of art- in progress, team sugge

In [None]:
# Score the note in the row labelled 3 against the single candidate label 'Activity'.
candidate_labels = ['Activity']
pipe(df_train['Match Support Contact Notes'][3], candidate_labels)

{'sequence': 'mentor said this month has been busy for her; therefore, they have not gotten together. They plan to go to the Nickeleon Universe this weekend. No child safety concerns reported.   match relationship. mentor said it is good. team checked on match plan goals find opportunities to be successful- bowling activity where they both started out "rocky" but later learned how to bowl; keep trying when challenged- they didn\'t give up on the bowling experience; learn about college & careers- child has shared a couple of things she might like to do in the future when she is older, but nothing solid yet; speak up for what?s important- match talks about what they like to do and what they value as important to them; volunteer/help others- in progress, team suggested some volunteer ideas; care for neighborhood and school- mentor reported that child cares about her family, likes her friends, and enjoys school; learn a new artistic skill- in progress; explore a new form of art- in progres

In [None]:
# Full text of the note in the row labelled 10.
df_train['Match Support Contact Notes'][10]

'I called mentor after receiving negative and concerning feedback from parent and child. I shared a summary of the very long conversation I had with parent and child during which they repeatedly affirmed their desire to end the match as soon as possible. Their justification was complex but centered around a few things lack of a strong connection -- parent and child both expressed that the bond between child and mentor isn\'t very strong. They don\'t have a deep bond erosion of trust -- parent and child feel like they cannot count on mentor to accommodate their needs and adhere to team guidelines. They accuse her of things like inviting other people to attend meetups without informing them, not driving safely, and using guilt or shame to make child feel bad about missing meetups or not wanting to do certain activities clash of personalities -- parent and child both expressed that mentor can be controlling and judgmental. They said mentor would use guilt to manipulate child into doing ac

In [None]:
# Score the note in the row labelled 10 against the single candidate label
# 'mental health concern' (this rebinds candidate_labels from the earlier cell).
candidate_labels = ['mental health concern']
pipe(df_train['Match Support Contact Notes'][10], candidate_labels)

{'sequence': 'I called mentor after receiving negative and concerning feedback from parent and child. I shared a summary of the very long conversation I had with parent and child during which they repeatedly affirmed their desire to end the match as soon as possible. Their justification was complex but centered around a few things lack of a strong connection -- parent and child both expressed that the bond between child and mentor isn\'t very strong. They don\'t have a deep bond erosion of trust -- parent and child feel like they cannot count on mentor to accommodate their needs and adhere to team guidelines. They accuse her of things like inviting other people to attend meetups without informing them, not driving safely, and using guilt or shame to make child feel bad about missing meetups or not wanting to do certain activities clash of personalities -- parent and child both expressed that mentor can be controlling and judgmental. They said mentor would use guilt to manipulate child 

In [None]:
# Sanity check: score a short phrase unrelated to the label against the same
# candidate label.
candidate_labels = ['mental health concern']
pipe('good driving', candidate_labels)

{'sequence': 'good driving',
 'labels': ['mental health concern'],
 'scores': [6.764986028429121e-05]}

In [None]:
# Preview the first 20 values of the 'Closure Details' column.
df_train['Closure Details'].head(20)

Unnamed: 0,Closure Details
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
