### Importing libraries and donwloading resources

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from textblob import TextBlob
from textstat import flesch_reading_ease
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy
import string

nlp = spacy.load("en_core_web_sm")
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger", force=True)
nltk.download("stopwords")

[nltk_data] Downloading package punkt to C:\Users\MALHAR
[nltk_data]     PRAJAPATI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\MALHAR PRAJAPATI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to C:\Users\MALHAR
[nltk_data]     PRAJAPATI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

___
#### Lexical diversity 


Lexical diversity assesses the variety of unique words in a text, indicating vocabulary richness. A higher score suggests diverse content, while a lower score implies repetition. This metric helps analyze writing styles and enhances NLP model performance.  

**Formula:**  
$$
\text{Lexical Diversity} = \frac{\text{Unique Words}}{\text{Total Words}}
$$  

In [2]:
def lexical_diversity(text):
    """
    Computes the lexical diversity of a given text.

    Lexical diversity is the ratio of unique words to total words, indicating vocabulary richness.
    Higher values suggest diverse writing, while lower values indicate repetition.

    Parameters:
    -----------
    text : str
        Input text for lexical diversity calculation.

    Returns:
    --------
    float
        Lexical diversity score (Unique Words / Total Words). Returns 0 for empty text.
    """
    words = word_tokenize(text)
    return len(set(words)) / len(words) if len(words) > 0 else 0


In [3]:
print(lexical_diversity.__doc__)


    Computes the lexical diversity of a given text.

    Lexical diversity is the ratio of unique words to total words, indicating vocabulary richness.
    Higher values suggest diverse writing, while lower values indicate repetition.

    Parameters:
    -----------
    text : str
        Input text for lexical diversity calculation.

    Returns:
    --------
    float
        Lexical diversity score (Unique Words / Total Words). Returns 0 for empty text.
    


>Example of how the doc will look like to any other person using this function

___

#### Punctuation count

Punctuation count helps analyze writing style by measuring the frequency of punctuation marks in a text. Higher punctuation usage can indicate expressive or complex writing, while lower usage suggests simpler or more straightforward text. This feature enhances NLP models by capturing tone, structure, and readability variations.

In [4]:
def punctuation_count(text):
    """
    Computes the number of punctuation characters in a given text.

    This function counts the occurrences of punctuation marks in the input text.
    It helps in analyzing text structure and writing style.

    Parameters:
    -----------
    text : str
        Input text from which punctuation characters are counted.

    Returns:
    --------
    int
        Total count of punctuation characters in the text.
    """
    return sum(1 for char in text if char in string.punctuation)


___
#### Average word length

Average word length helps analyze writing complexity by measuring the typical length of words in a text. Longer words indicate formal or technical writing, while shorter words suggest simplicity. This feature improves NLP models by capturing variations in writing styles.  

**Formula:**  
$$
\text{Average Word Length} = \frac{\sum \text{Length of Each Word}}{\text{Total Words}}
$$  


In [5]:
def average_word_length(text):
    """
    Computes the average word length in a given text.

    This function calculates the mean length of words by dividing the total number 
    of characters in all words by the total word count. It provides insight into 
    text complexity, readability, and writing style.

    Parameters:
    -----------
    text : str
        Input text for analysis.

    Returns:
    --------
    float
        The average word length. Returns 0 if the text is empty.
    """
    words = word_tokenize(text)
    return sum(len(word) for word in words) / len(words) if words else 0

___
#### NER(Named Entity Recognition) Count

Named entity count identifies the number of proper nouns (e.g., names, places, organizations) in a text. It helps analyze text structure and detect formal or factual content, aiding in fake review detection and sentiment analysis.  

**Formula:**  
$$
\text{Named Entity Count} = \text{Number of Proper Nouns Detected}
$$  


In [6]:
def named_entity_count(text):
    """
    Counts the number of named entities in a given text.

    This function estimates the count of named entities (e.g., proper nouns) 
    by identifying words that start with an uppercase letter followed by lowercase letters. 
    While this is a simplistic approach, it can provide a rough measure of entity presence 
    in the text.

    Parameters:
    -----------
    text : str
        Input text for analysis.

    Returns:
    --------
    int
        The number of detected named entities.
    """
    doc = nlp(text)  
    return len(doc.ents)


___
#### POS(Part Of Speech) Tagging

This function counts words in a text that belong to a specific part-of-speech (POS) category. It helps analyze sentence structure and linguistic patterns, aiding in text classification and sentiment analysis.  

**Formula:**  
$$
\text{POS Tag Count} = \sum \mathbf{1} \quad \text{if tag starts with specified prefix}
$$  

In [7]:
def count_pos_tags(text, tag_prefix):
    """
    Counts the number of words in a text that belong to a specified part-of-speech (POS) category.

    This function tokenizes the input text, assigns POS tags using NLTK's `pos_tag` function, 
    and counts how many words have tags that start with the given prefix. POS prefixes include:
    - "NN" for nouns
    - "VB" for verbs
    - "JJ" for adjectives
    - "RB" for adverbs

    Parameters:
    -----------
    text : str
        The input text to be analyzed.
    tag_prefix : str
        The prefix of the POS tag to count (e.g., "NN" for nouns, "VB" for verbs).

    Returns:
    --------
    int
        The count of words matching the specified POS tag prefix.
    """
    tokens = nltk.pos_tag(word_tokenize(text))
    return sum(1 for _, tag in tokens if tag.startswith(tag_prefix))


___
#### Sentiment Polarity

Sentiment polarity determines the emotional tone of a text, indicating whether it is positive, negative, or neutral. This feature is useful in sentiment analysis, fake review detection, and opinion mining.  

**Formula:**  
$$
\text{Sentiment Polarity} \in [-1, 1]
$$  
- 1 → Positive sentiment  
- 0 → Neutral sentiment  
- -1 → Negative sentiment  


In [8]:
def sentiment_polarity(text):
    """
    Computes the sentiment polarity of a given text.

    Sentiment polarity measures the emotional tone of a text, 
    ranging from -1 (negative sentiment) to +1 (positive sentiment), 
    with 0 indicating a neutral sentiment. This function utilizes 
    TextBlob's sentiment analysis to evaluate the polarity.

    Parameters:
    -----------
    text : str
        The input text for sentiment analysis.

    Returns:
    --------
    float
        A polarity score between -1 and 1, where:
        - Negative values indicate negative sentiment.
        - Positive values indicate positive sentiment.
        - A value close to 0 suggests neutrality.
    """
    return TextBlob(text).sentiment.polarity


___
#### Subjectivity Score

Subjectivity measures the degree to which a text expresses personal opinions rather than factual statements. It is useful in sentiment analysis, fake review detection, and opinion mining.  

**Formula:**  
$$
\text{Subjectivity Score} \in [0, 1]
$$  
- **1** → Highly subjective (opinion-based)  
- **0** → Highly objective (fact-based)  

In [9]:
def subjectivity_score(text):
    """
    Computes the subjectivity score of a given text.

    Subjectivity measures the degree of personal opinion, emotion, 
    or bias in a text. The score ranges from 0 to 1, where:
    - 0 indicates an objective statement (fact-based content).
    - 1 indicates a highly subjective statement (opinion-based content).

    This function utilizes TextBlob's sentiment analysis to evaluate 
    the subjectivity of the text.

    Parameters:
    -----------
    text : str
        The input text for subjectivity analysis.

    Returns:
    --------
    float
        A subjectivity score between 0 and 1, where:
        - Scores closer to 0 suggest objective language.
        - Scores closer to 1 suggest subjective language.
    """
    return TextBlob(text).sentiment.subjectivity


___
#### Flesch Reading Ease Score

The Flesch Reading Ease score assesses the readability of a text. Higher scores indicate simpler text, while lower scores suggest more complex writing. It helps in detecting fake reviews by identifying unnatural writing patterns.

**Formula:**  
$$
\text{FRE} = 206.835 - (1.015 \times \frac{\text{Total Words}}{\text{Total Sentences}}) - (84.6 \times \frac{\text{Total Syllables}}{\text{Total Words}})
$$  


In [10]:
def flesch_reading_ease_score(text):
    """
    Computes the Flesch Reading Ease score for a given text.

    The Flesch Reading Ease score evaluates text readability based on 
    sentence length and word complexity. The score typically ranges from 0 to 100, where:
    
    - 90–100: Very easy to read (understandable by 5th graders).
    - 60–70: Standard readability (understandable by 8th-9th graders).
    - 0–30: Very difficult to read (best suited for academic or technical texts).

    A higher score indicates better readability.

    Parameters:
    -----------
    text : str
        The input text whose readability score is to be calculated.

    Returns:
    --------
    float
        The Flesch Reading Ease score, or 0 if the input is not a valid string.
    """
    return flesch_reading_ease(text) if isinstance(text, str) else 0


___
#### Sentence Length

Sentence length represents the total number of words in a text. It helps analyze writing style, as fake reviews may have distinct sentence structures compared to genuine ones. This metric is useful for linguistic pattern analysis.

**Formula:**  
$$
\text{Sentence Length} = \text{Number of Words in a Sentence}
$$  


In [11]:
def sentence_length(text):
    """
    Computes the number of words in a given text.

    This function tokenizes the input text and counts the number of words.
    It helps in analyzing sentence complexity and structure, which can be
    useful in readability assessments and linguistic studies.

    Parameters:
    -----------
    text : str
        The input text for which the word count is to be determined.

    Returns:
    --------
    int
        The total number of words in the text.
    """
    return len(word_tokenize(text))


___

#### Combinig all things together

In [None]:
import pandas as pd
from textblob import TextBlob
from nltk.tokenize import word_tokenize

def feature_engineering(df, filename):
    print(f"Extracting features for {filename}...")
    # Ensure that 'processed_text' is a string and fill missing values with an empty string
    df["processed_text"] = df["processed_text"].fillna("").astype(str)
    
    # print(df['processed_text'].head())
    
    df["lexical_diversity"] = df["processed_text"].apply(lexical_diversity)
    df["avg_word_length"] = df["processed_text"].apply(average_word_length)
    df["sentiment_polarity"] = df["processed_text"].apply(sentiment_polarity)
    df["subjectivity"] = df["processed_text"].apply(subjectivity_score)
    df["flesch_reading_ease"] = df["processed_text"].apply(flesch_reading_ease_score)
    df["sentence_length"] = df["processed_text"].apply(sentence_length)
    df["named_entity_count"] = df["processed_text"].apply(named_entity_count)
    df["punctuation_count"] = df["processed_text"].apply(punctuation_count)
    df["noun_count"] = df["processed_text"].apply(lambda x: count_pos_tags(x, "NN"))
    df["verb_count"] = df["processed_text"].apply(lambda x: count_pos_tags(x, "VB"))
    df["adj_count"] = df["processed_text"].apply(lambda x: count_pos_tags(x, "JJ"))
    df["adv_count"] = df["processed_text"].apply(lambda x: count_pos_tags(x, "RB"))
    
    filepath = f"../Data/Feature-Engineered/{filename}_features.csv"
    df.to_csv(filepath, index=False)
    print(f"Features extracted & saved: {filepath}")
    
    return df

datasets = [
    "preprocessed_lemmatization",
    "preprocessed_no_stopwords",
    "preprocessed_stemming",
    "preprocessed_stemming_no_stopwords",
    "preprocessed_no_stopwords_no_lemmatization",
]

for dataset in datasets:
    df = pd.read_csv(f"../Data/Pre-processed/{dataset}.csv")
    feature_engineering(df, dataset)

print("All feature-engineered datasets saved in '../Data/Feature-Engineered/'")


Extracting features for preprocessed_lemmatization...
Features extracted & saved: ../Data/Feature-Engineered/preprocessed_lemmatization_features.csv
Extracting features for preprocessed_no_stopwords...
Features extracted & saved: ../Data/Feature-Engineered/preprocessed_no_stopwords_features.csv
Extracting features for preprocessed_stemming...
Features extracted & saved: ../Data/Feature-Engineered/preprocessed_stemming_features.csv
Extracting features for preprocessed_stemming_no_stopwords...
Features extracted & saved: ../Data/Feature-Engineered/preprocessed_stemming_no_stopwords_features.csv
Extracting features for preprocessed_no_stopwords_no_lemmatization...
Features extracted & saved: ../Data/Feature-Engineered/preprocessed_no_stopwords_no_lemmatization_features.csv
All feature-engineered datasets saved in '../Data/Feature-Engineered/'
