In [47]:
import pandas as pd
import regex as re
import string
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline, RobertaTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [21]:
# Read the semicolon-separated catalogue; the first CSV column becomes the index,
# whose name is cleared so it does not show up as a header.
df = pd.read_csv('data/all_data.csv', sep=';', index_col=0).rename_axis(None)

In [22]:
# Preview the first rows to sanity-check the loaded columns
df.head()

Unnamed: 0,category,title,tags,age_rating,rating_description,description,image,duration_txt,duration_sec,first_broadcast,synopsis_small,synopsis_medium,synopsis_large
0,CBBC,Operation Ouch! - Series 9: 11. Blink and Youâ...,"BBC, iPlayer, TV, Operation Ouch!, Series 9: 1...",PG,"injury detail, rude humour, threat",Why is Dr Xand dressed as a bird in the lab? F...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,28 mins,1692,5pm 17 Mar 2021,Why is Dr Xand dressed as a bird in the lab? F...,Why is Dr Xand dressed as a bird in the lab? F...,Why is Dr Xand dressed as a bird in the lab? F...
1,CBBC,The Worst Witch - Series 4: 1. The Three Impos...,"BBC, iPlayer, TV, The Worst Witch, Series 4: 1...",PG,threat,A botched potions experiment leaves Mildred wi...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,28 mins,1693,27 Jan 2020,A botched potions experiment leaves Mildred wi...,When a botched potions experiment leaves Mildr...,No data found
2,CBBC,Shaun the Sheep - Series 4: 6. The Smelly Farmer,"BBC, iPlayer, TV, Shaun the Sheep, Series 4: 6...",U,no material likely to offend or harm,"Shaun, Bitzer and the flock decide to give the...",https://ichef.bbci.co.uk/images/ic/1200x675/p0...,7 mins,425,3:50pm 10 Feb 2014,"Shaun, Bitzer and the flock decide to give the...",Animated series. The farmer's personal hygiene...,The farmer's personal hygiene leaves much to b...
3,CBBC,PokÃ©mon: Black and White - Movies: Black - Vi...,"BBC, iPlayer, TV, PokÃ©mon: Black and White, M...",PG,contains mild violence and threat,Can Ash awaken Reshiram to help him rescue Vic...,https://ichef.bbci.co.uk/images/ic/1200x675/p0...,92 mins,5497,3 Dec 2021,Can Ash awaken Reshiram to help him rescue Vic...,Can Ash awaken legendary PokÃ©mon Reshiram to ...,"When Ash, Iris and Cilan enter a battle compet..."
4,CBBC,"Deadly 60 - Series 2: 1. Baja California, Mexico","BBC, iPlayer, TV, Deadly 60, Series 2: 1. Baja...",PG,upsetting scenes,"In Mexico, Steve Backshall takes to the water ...",https://ichef.bbci.co.uk/images/ic/1200x675/p0...,28 mins,1680,5:15pm 8 Jul 2010,"In Mexico, Steve Backshall takes to the water ...","Wildlife series. On a trip to Mexico, Steve Ba...",Steve travels to the deadliest desert on the p...


# Clean Text

In [24]:
# Helper that normalises a free-text field
def clean_text(x):
    """Return ``x`` without ASCII punctuation, lower-cased and trimmed.

    Typographic dashes, curly quotes and the ellipsis character are not part
    of ``string.punctuation``; each occurrence is replaced by a single space
    rather than deleted. Interior whitespace is left untouched; only leading
    and trailing whitespace is stripped.
    """
    punctuation_eraser = str.maketrans(' ', ' ', string.punctuation)
    cleaned = x.translate(punctuation_eraser).lower()

    # Same substitutions, applied one pattern at a time in the same order
    for mark in ("—", "’", "–", "…", "“", "”"):
        cleaned = re.sub(mark, " ", cleaned)

    return cleaned.strip()

In [25]:
# Clean both columns the same way and report whether they end up identical
description_cleaned, synopsis_small_cleaned = (
    df[column].apply(clean_text) for column in ('description', 'synopsis_small')
)
print(description_cleaned.equals(synopsis_small_cleaned))

True


# Removing columns

In [26]:
# Pick the longest synopsis that is actually present for each row:
# large first, then medium, and finally the small synopsis as a fallback.
has_large = df['synopsis_large'] != 'No data found'
has_medium = df['synopsis_medium'] != 'No data found'

medium_or_small = np.where(has_medium, df['synopsis_medium'], df['synopsis_small'])
df['synopsis'] = np.where(has_large, df['synopsis_large'], medium_or_small)

In [27]:
# Drop the longer synopsis variants now that 'synopsis' holds the chosen text.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to run more than once (a second inplace drop would raise KeyError).
df = df.drop(columns=['synopsis_large', 'synopsis_medium'], errors='ignore')

# Child Appropriateness Score

This section prepares the dataset for the metrics that are necessary for the Child Appropriateness Score

## Sentiment Analysis

The `sentiment_analysis` pipeline returns the predicted label and the model's confidence score for that classification.
The `analyse_sentiment` helper turns this into a signed score: if the classification is positive, it returns the confidence score; if it is negative, it returns the negated score.

In [28]:
# Hugging Face sentiment-analysis pipeline used by analyse_sentiment
sentiment_analysis = pipeline(
    task="sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
)

In [44]:
def analyse_sentiment(text):
    """Return the signed sentiment score of ``text`` as a one-element list.

    Runs the module-level ``sentiment_analysis`` pipeline on ``text`` and reads
    the ``label`` and ``score`` of the first prediction.

    Args:
        text: The text passed to the pipeline.

    Returns:
        ``[score]`` with the score rounded to 4 decimal places and negated when
        the label is ``'NEGATIVE'``; ``[None]`` when the pipeline returns no
        prediction.
    """
    output = sentiment_analysis(text)
    if not output:
        # No prediction to index into or negate: return the None sentinel
        # rather than failing on output[0].
        return [None]

    prediction = output[0]
    score = round(prediction['score'], 4)  # 4 decimal places
    return [-score] if prediction['label'] == 'NEGATIVE' else [score]

## Term Frequency

Specific content within the synopsis that could be potentially inappropriate for children, such as violence, sexual content, drug references, profanity, etc. is searched for and assessed through a term frequency matrix, indicating the portion of the unsuitable terms present within the description. The same terms will be used for all age groups but the final weighting will be different.

In [None]:
# Shared NLTK resources for remove_stopwords_and_lemmatize
lemmatizer = WordNetLemmatizer()
stop_words = {word for word in stopwords.words('english')}

def remove_stopwords_and_lemmatize(x):
    """Tokenise ``x``, drop stop words, lemmatise, and re-join with spaces.

    Every token is lower-cased before the stop-word check and before
    lemmatisation. A kept token made up solely of punctuation characters is
    glued onto the preceding kept token instead of being emitted on its own,
    unless it is the very first kept token.
    """
    lowered = (token.lower() for token in word_tokenize(x))
    kept = [lemmatizer.lemmatize(token) for token in lowered if token not in stop_words]

    pieces = []
    for position, token in enumerate(kept):
        is_punctuation = all(char in string.punctuation for char in token)
        if position == 0 or not is_punctuation:
            pieces.append(token)
        else:
            pieces[-1] += token

    return ' '.join(pieces)

# Lemmatised, stop-word-free version of each synopsis
df['synopsis_nostopwords'] = df['synopsis'].map(remove_stopwords_and_lemmatize)

In [None]:
# Term lists per content category; the same combined list is used for every age group.
# 'assault' and 'weapon' are separate entries (they were fused into one misspelt string).
violence_terms = ['blood', 'gore', 'kill', 'murder', 'torture', 'assault', 'weapon', 'war']
sexual_terms = ['sex', 'pornography', 'nude', 'erotic']
drug_terms = ['cocaine', 'heroin', 'weed', 'alcohol', 'drug']
scary_terms = ['horror', 'terror', 'panic', 'frighten', 'violence']
# NOTE(review): vectorize_row compares individual whitespace-separated words, so the
# two-word entry 'mental illness' cannot match as a phrase — confirm how it should be handled.
mature_terms = ['suicide', 'depression', 'mental illness', 'abuse', 'death']
language_terms = ['language', 'swear', 'rude']
unsuitable_terms = violence_terms + sexual_terms + drug_terms + scary_terms + mature_terms + language_terms

In [None]:
"""
    This method counts each unsuitable word found in a sentence and returns the negated fraction of the unsuitable words over
    the total number of words in the sentence.
"""
def vectorize_row(text, unsuitable_terms):
    """Return the negated share of words in ``text`` that are unsuitable terms.

    ``text`` is split on whitespace. Each word is compared case-insensitively
    against ``unsuitable_terms`` after stripping leading and trailing
    punctuation, so that words such as ``"violence,"`` or ``"war."`` still
    match. The denominator is the number of whitespace-separated words.

    Args:
        text: The text to score.
        unsuitable_terms: Iterable of lower-case terms to look for.

    Returns:
        A float in ``[-1, 0]``: minus (matching words / total words). ``0.0``
        is returned when ``text`` is not a string (e.g. a missing value) or
        contains no words, instead of raising.
    """
    if not isinstance(text, str):
        return 0.0

    words = text.split()
    if not words:
        # Avoid dividing by zero for empty or whitespace-only text
        return 0.0

    flagged = set(unsuitable_terms)
    unsuitable_terms_count = sum(
        1 for word in words if word.strip(string.punctuation).lower() in flagged
    )

    return -(unsuitable_terms_count / len(words))

In [None]:
# Negated unsuitable-term ratio for the lemmatised synopsis and for the rating description
df['unsuitable_ratio'] = df['synopsis_nostopwords'].apply(
    lambda text: vectorize_row(text, unsuitable_terms)
)
df['unsuitable_ratio2'] = df['rating_description'].apply(
    lambda text: vectorize_row(text, unsuitable_terms)
)

## Age Rating per Movie

Currently, the age rating has no numeric value. Now, a score will be assigned based on the rating. The numerical value represents the proportion of children aged between 4 and 17 that can safely view the content. 

In [None]:
# Numeric score assigned to each age rating
rating_mapping = {'U': 1, 'PG': 0.71, '12': 0.57, '12A': 0.57, '15': 0.21, '18': 0.0}


def age_rating_weighting(rating):
    """Return the numeric score for ``rating``, or 0.0 when the rating is not in ``rating_mapping``.

    The lookup uses ``dict.get``; calling the dict itself raises TypeError.
    """
    return rating_mapping.get(rating, 0.0)

# Translate each age rating into its numeric score via rating_mapping.
# NOTE(review): Series.map with a dict presumably leaves ratings that are absent from
# rating_mapping as NaN here, rather than the 0.0 passed in age_rating_weighting — confirm intended.
df['age_rating_numeric'] = df['age_rating'].map(rating_mapping)

## Child Appropriateness Score

$sentiment$ is the Sentiment Analysis score for the synopsis <br>
$freq1$ is the negated proportion of words in the synopsis that are unsuitable terms <br>
$freq2$ is the negated proportion of words in the age rating description that are unsuitable terms <br>
$rating$ is the score for the Age Rating <br>
$rating\_desc\_score$ is the Sentiment Analysis score for the rating description <br>

$s$: is the weight for the Sentiment Analysis score <br>
$t1$: is the weight for the Term Frequency score for the synopses <br>
$t2$: is the weight for the Term Frequency score for the age description<br>
$r1$: is the weight for the Age Rating score <br>
$r2$: is the weight for the Age Rating Description score <br>

In [None]:
def child_appropriateness_score(sentiment, freq1, freq2, rating, rating_desc_score, s, t1, t2, r1, r2):
    """Combine the five metrics into one score as a weighted sum.

    Each metric is multiplied by its weight (``s`` for ``sentiment``, ``t1``
    for ``freq1``, ``t2`` for ``freq2``, ``r1`` for ``rating`` and ``r2`` for
    ``rating_desc_score``) and the products are added in that order.
    """
    sentiment_part = s * sentiment
    synopsis_terms_part = t1 * freq1
    description_terms_part = t2 * freq2
    rating_part = r1 * rating
    rating_description_part = r2 * rating_desc_score
    return (sentiment_part + synopsis_terms_part + description_terms_part
            + rating_part + rating_description_part)

In [None]:
# Weights per age group, in the order (s, t1, t2, r1, r2) expected by
# child_appropriateness_score.
w48 = [0.20, 0.150, 0.150, 0.30, 0.20]      # ages 4-8
w911 = [0.20, 0.125, 0.125, 0.30, 0.25]     # ages 9-11
w1214 = [0.20, 0.100, 0.100, 0.25, 0.35]    # ages 12-14
w1517 = [0.15, 0.075, 0.075, 0.20, 0.50]    # ages 15-17

In [None]:
# The scores below read the 'sentiment' and 'rating_description_score' columns.
# They are built here when absent so the notebook runs on a fresh kernel:
# the sentiment score of the synopsis and of the rating description respectively.
# analyse_sentiment returns a one-element list, hence the [0].
if 'sentiment' not in df.columns:
    df['sentiment'] = df['synopsis'].apply(lambda text: analyse_sentiment(text)[0])
if 'rating_description_score' not in df.columns:
    df['rating_description_score'] = df['rating_description'].apply(
        lambda text: analyse_sentiment(text)[0]
    )

# One Child Appropriateness Score column per age group, each with its own weights.
# child_appropriateness_score is plain arithmetic, so whole columns can be passed at once.
age_group_weights = {'cas48': w48, 'cas911': w911, 'cas1214': w1214, 'cas1517': w1517}
for score_column, weights in age_group_weights.items():
    df[score_column] = child_appropriateness_score(
        df['sentiment'], df['unsuitable_ratio'], df['unsuitable_ratio2'],
        df['age_rating_numeric'], df['rating_description_score'], *weights
    )

In [None]:
def _rank_by_score(score_column):
    """Return the recommendation columns plus ``score_column``, highest score first."""
    columns = ['category', 'title', 'tags', 'age_rating', 'image', 'synopsis_small', score_column]
    return df[columns].sort_values(by=score_column, ascending=False)


# Each age group keeps only titles scoring at least 0.6 and writes them to its own CSV.
# NOTE(review): these files go under '../data/' while the input was read from
# 'data/all_data.csv' — confirm the intended working directory.

# recommendations for ages 4-8
recommendations4_8 = _rank_by_score('cas48')
filtered4_8 = recommendations4_8[recommendations4_8['cas48'] >= 0.6]
filtered4_8.to_csv('../data/recommendations/ages4_8.csv')

# recommendations for ages 9-11
recommendations9_11 = _rank_by_score('cas911')
filtered9_11 = recommendations9_11[recommendations9_11['cas911'] >= 0.6]
filtered9_11.to_csv('../data/recommendations/ages9_11.csv')

# recommendations for ages 12-14
recommendations12_14 = _rank_by_score('cas1214')
filtered12_14 = recommendations12_14[recommendations12_14['cas1214'] >= 0.6]
filtered12_14.to_csv('../data/recommendations/ages12_14.csv')

# recommendations for ages 15-17
recommendations15_17 = _rank_by_score('cas1517')
filtered15_17 = recommendations15_17[recommendations15_17['cas1517'] >= 0.6]
filtered15_17.to_csv('../data/recommendations/ages15_17.csv')