In [102]:
import re
from collections import Counter
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import joblib

# English stop words from the NLTK corpus, materialized once as a set so the
# feature functions below can do constant-time membership checks.
nltk_stopwords = set(stopwords.words('english'))

# Lexicon consulted by count_emotional_words(). The entries are emotion terms
# (roughly: joy, affection, hope, fear, anger, sadness, surprise, disgust and
# shame, in that order).
# NOTE(review): count_emotional_words() tokenizes with r'\b\w+\b', which splits
# on hyphens, so the 'self-conscious' entry can never match a token.
emotional_words = [
    'ecstatic', 'elated', 'blissful', 'jubilant', 'cheerful', 'thrilled', 'delight', 'exhilarated',
    'content', 'grateful', 'affection', 'adoration', 'devotion', 'tenderness', 'caring', 'compassionate',
    'empathy', 'cherish', 'admire', 'fondness', 'hopeful', 'confident', 'optimistic', 'encouraged',
    'reassured', 'inspired', 'ambitious', 'enthusiastic', 'dreamy', 'faithful',
    'terrified', 'horrified', 'anxious', 'panicked', 'frightened', 'apprehensive', 'worried', 'nervous',
    'alarmed', 'dread', 'furious', 'outraged', 'enraged', 'livid', 'infuriated', 'annoyed', 'frustrated',
    'irritated', 'exasperated', 'agitated', 'heartbroken', 'devastated', 'mournful', 'sorrowful',
    'melancholy', 'dejected', 'despair', 'loneliness', 'hopeless', 'disheartened',
    'amazed', 'stunned', 'shocked', 'astounded', 'astonished', 'startled', 'dumbfounded', 'speechless',
    'perplexed', 'bewildered',
    'disgusted', 'repulsed', 'nauseated', 'revolted', 'loathing', 'detest', 'despise', 'scorn',
    'contemptuous', 'abhor',
    'ashamed', 'embarrassed', 'guilty', 'humiliated', 'regretful', 'remorseful', 'mortified', 
    'self-conscious', 'apologetic', 'disgraced'
]

def calculate_phrase_mean_length(texts):
    """Return the average phrase length, in words, for each text.

    A phrase is whatever lies between two periods; phrases that are empty
    after stripping whitespace are ignored. Words are whitespace-separated.

    Args:
        texts: Iterable of strings.

    Returns:
        A list with one entry per input text: the mean word count of its
        phrases, or 0 when the text contains no non-empty phrase.
    """
    def _mean_words(document):
        # Word count of every non-blank, period-delimited chunk.
        word_counts = [
            len(chunk.split())
            for chunk in (piece.strip() for piece in document.split('.'))
            if chunk
        ]
        if not word_counts:
            return 0
        return sum(word_counts) / len(word_counts)

    return [_mean_words(document) for document in texts]

def count_numbers_in_texts(texts):
    """Count the runs of consecutive digits in each text.

    Every maximal digit sequence counts once, so "3.14" contributes two
    numbers ("3" and "14").

    Args:
        texts: Iterable of strings.

    Returns:
        A list of ints, one per input text.
    """
    return [len(re.findall(r'\d+', document)) for document in texts]

def count_proper_nouns(texts):
    """Estimate the number of proper nouns in each text.

    Heuristic: the text is cut into sentences wherever '.', '!' or '?' is
    followed by whitespace. Within a sentence the first word is skipped
    (it is capitalized regardless), and every later word is counted when it
    starts with one uppercase ASCII letter followed only by lowercase ASCII
    letters up to a word boundary. All-caps tokens such as "NASA" are
    therefore not counted.

    Args:
        texts: Iterable of strings.

    Returns:
        A list of ints, one per input text.
    """
    capitalized = re.compile(r'\b[A-Z][a-z]*\b')
    totals = []

    for document in texts:
        total = 0
        for chunk in re.split(r'[.!?]\s+', document):
            tokens = chunk.split()
            # tokens[1:] drops the sentence-initial word.
            total += sum(1 for tok in tokens[1:] if capitalized.match(tok))
        totals.append(total)

    return totals

def count_punctuation_signals(texts):
    """Count punctuation characters in each text.

    The characters counted are: . , ! ? ; : ( ) [ ] ' " and -

    Args:
        texts: Iterable of strings.

    Returns:
        A list of ints, one per input text.
    """
    marks = re.compile(r'[.,!?;:()\[\]\'\"-]')
    return [len(marks.findall(document)) for document in texts]

def count_stopwords(texts):
    """Count English stop words in each text.

    Each text is lowercased and tokenized with r'\\b\\w+\\b'; a token counts
    when it is a member of the module-level ``nltk_stopwords`` set. Repeated
    stop words are counted every time they occur.

    Args:
        texts: Iterable of strings.

    Returns:
        A list of ints, one per input text.
    """
    totals = []

    for document in texts:
        tokens = re.findall(r'\b\w+\b', document.lower())
        totals.append(len([tok for tok in tokens if tok in nltk_stopwords]))

    return totals

def count_emotional_words(text):
    """Count tokens of ``text`` that appear in the ``emotional_words`` lexicon.

    The text is lowercased and tokenized with r'\\b\\w+\\b'; every occurrence
    of a lexicon word counts, including repeats.

    Args:
        text: The text to scan. Anything that is not a ``str`` yields 0.

    Returns:
        The number of matching tokens as an int.
    """
    # Non-string input (None, NaN, numbers, ...) is treated as "no emotion".
    if not isinstance(text, str):
        return 0

    tokens = re.findall(r'\b\w+\b', text.lower())
    matches = [tok for tok in tokens if tok in emotional_words]
    return len(matches)

def count_repeated_non_stopwords(text, threshold=10):
    """Count distinct non-stop-words that occur more than ``threshold`` times.

    The text is lowercased and tokenized with r'\\b\\w+\\b'. Tokens found in
    the module-level ``nltk_stopwords`` set are discarded; the remaining
    tokens are tallied and every distinct word whose frequency is strictly
    greater than ``threshold`` contributes one to the result.

    Args:
        text: The text to analyse.
        threshold: Frequency a word must exceed to be counted (default 10).

    Returns:
        The number of distinct heavily repeated non-stop-words as an int.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())

    # Reuse the stop-word set built once at module load instead of re-reading
    # the NLTK corpus on every call; this also keeps the function consistent
    # with count_stopwords().
    frequencies = Counter(tok for tok in tokens if tok not in nltk_stopwords)

    return sum(1 for occurrences in frequencies.values() if occurrences > threshold)

def count_capital_letters(text):
    """Return how many characters of ``text`` are uppercase per ``str.isupper``."""
    return len([ch for ch in text if ch.isupper()])

def count_references(text):
    """Count citation-like patterns in ``text``.

    The alternatives below are joined into one regular expression and the
    non-overlapping matches are counted, scanning left to right. At any
    position the first alternative that matches wins.

    NOTE(review): the DOI alternative is case-sensitive and its suffix class
    only allows uppercase letters, so a lowercase DOI such as
    "doi:10.1234/abcd.5678" is not matched as a DOI (its four-digit groups
    are picked up by the standalone-year alternative instead).

    Args:
        text: The text to scan.

    Returns:
        The number of matches as an int.
    """
    citation_regex = '|'.join((
        r'\[\d+\]',                                    # bracketed number: [1], [23]
        r'\(\d+\)',                                    # parenthesized number: (1), (23)
        r'\([A-Za-z]+, \d{4}\)',                       # author-year: (Smith, 2020)
        r'\b\d{4}\b',                                  # bare four-digit number, e.g. a year
        r'\bdoi:?\s?10\.\d{4,9}/[-._;()/:A-Z0-9]+\b',  # DOI
        r'(https?://[^\s]+)',                          # http/https URL
        r'\bISBN[-\s]?(?:\d{9}[\dXx]|\d{13})\b',       # ISBN-10 or ISBN-13
        r'\^[1-9]\d*',                                 # caret footnote: ^1, ^2
        r'[A-Za-z]+ et al\., \d{4}',                   # Smith et al., 2020
        r'\([A-Za-z]+ et al\., \d{4}\)',               # (Smith et al., 2020)
    ))

    return sum(1 for _ in re.finditer(citation_regex, text))

def mean_word_length(text):
    """Return the average length of the words in ``text``.

    Words are the matches of r'\\b\\w+\\b', so punctuation is excluded and an
    apostrophe splits a word in two.

    Args:
        text: The text to analyse.

    Returns:
        The mean number of characters per word, or 0 when there are no words.
    """
    tokens = re.findall(r'\b\w+\b', text)

    # Guard against dividing by zero on word-less input.
    if not tokens:
        return 0

    return sum(map(len, tokens)) / len(tokens)


def get_features(title, text):
    """Build the one-row feature frame for a single article.

    Args:
        title: Article headline.
        text: Article body.

    Returns:
        A pandas DataFrame of shape (1, 13) with float values. The keys of
        the mapping below are the column names, in column order.
    """
    # Insertion order of this dict defines the column order of the result.
    feature_values = {
        # Raw character lengths.
        'text_length': len(text),
        'title_length': len(title),
        # Mean words per period-delimited phrase.
        'length_phrases_no_punctuation_text': np.mean(calculate_phrase_mean_length([text])),
        'length_phrases_no_punctuation_title': np.mean(calculate_phrase_mean_length([title])),
        # The list-based helpers take a batch; pass one item and unwrap it.
        'qtt_numbers': count_numbers_in_texts([text])[0],
        'qtt_noms_propis': count_proper_nouns([text])[0],
        'qtt_punt': count_punctuation_signals([text])[0],
        'stopwords': count_stopwords([text])[0],
        # Single-text helpers.
        'emotional': count_emotional_words(text),
        'repeated': count_repeated_non_stopwords(text, threshold=10),
        'capital_title': count_capital_letters(title),
        'reference_count': count_references(text),
        'word_length': mean_word_length(text),
    }

    row = np.array(list(feature_values.values()), dtype=float).reshape(1, -1)

    return pd.DataFrame(row, columns=list(feature_values.keys()))


# End-to-end smoke test: build the (1, 13) feature frame for a toy
# title/text pair, scale it, and run it through the persisted model.
df = get_features('e','fake new')

# Scale the data with the scaler persisted in 'robust_scaler.pkl'.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# come from a trusted source.
robust_scaler = joblib.load('robust_scaler.pkl') 

scaled_input = robust_scaler.transform(df)

# Load the trained model (presumably a logistic regression, going by the
# file name — confirm against the training notebook).
model = joblib.load('final_logistic_regression_model.pkl')

# Predict the class label for the single scaled row.
model.predict(scaled_input)

array([1])

In [94]:
# Sanity check: get_features() should return a single row with 13 columns.
df = get_features('deded', 'e')
df.shape

(1, 13)

In [96]:
# Scale the feature frame with the scaler persisted in 'robust_scaler.pkl'.
# NOTE(review): joblib.load unpickles the file, so only load trusted artifacts.
robust_scaler = joblib.load('robust_scaler.pkl') 

scaled_input = robust_scaler.transform(df)

In [98]:
# Load the trained model from disk and predict the label for the scaled row.
# NOTE(review): joblib.load unpickles the file, so only load trusted artifacts.
model = joblib.load('final_logistic_regression_model.pkl')

model.predict(scaled_input)

array([1])

# FastAPI deployment

We will do that in an `app.py` file instead of a Jupyter notebook.