<a href="https://colab.research.google.com/github/mnshakoor/ACE_Framework/blob/main/ARAC_ContentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
# Location of the document to analyse on the mounted Google Drive.
# Point this at your own .txt, .pdf or .docx file.
file_path = "/content/drive/MyDrive/Colab/Conflict_In_Cameroon.pdf"

# Step 1: Download required NLTK resources for TextBlob
import nltk

# Resources are fetched in the order listed. 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' were added after being reported as
# missing resources.
_NLTK_RESOURCES = (
    'punkt',
    'averaged_perceptron_tagger',
    'wordnet',
    'brown',
    'omw-1.4',
    'universal_tagset',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
)
for _resource in _NLTK_RESOURCES:
    nltk.download(_resource)


# Step 2: Proceed with other imports and script setup
import spacy
from textblob import TextBlob
from transformers import pipeline
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import defaultdict
import fitz  # PyMuPDF for PDFs
from docx import Document  # For Word files
import json

# Load spaCy English model with dependency parsing and NER capabilities
nlp = spacy.load("en_core_web_sm")

# Initialize transformers sentiment analysis pipeline.
# The checkpoint is named explicitly instead of relying on the library's
# implicit default for the task, so results stay reproducible if that default
# ever changes (transformers warns when no model is supplied).
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

# Function to read text from different file formats
def read_text_from_file(file_path):
    """Return the text content of a .txt, .pdf or .docx file.

    The format is selected from the file extension, compared
    case-insensitively so that names such as ``REPORT.PDF`` are accepted.

    Args:
        file_path: Path to the document, as a ``str`` or a path-like object
            such as ``pathlib.Path``.

    Returns:
        str: The file content for .txt files (decoded as UTF-8), the
        concatenated text of every page for .pdf files, or the paragraphs
        joined with newlines for .docx files.

    Raises:
        ValueError: If the extension is not .txt, .pdf or .docx.
    """
    # Normalise once so both str and Path inputs work with the checks below.
    path_str = str(file_path)
    lowered = path_str.lower()

    if lowered.endswith('.txt'):
        with open(path_str, 'r', encoding='utf-8') as file:
            return file.read()

    if lowered.endswith('.pdf'):
        with fitz.open(path_str) as pdf:
            # join() avoids rebuilding the string once per page.
            return "".join(page.get_text() for page in pdf)

    if lowered.endswith('.docx'):
        doc = Document(path_str)
        return "\n".join(para.text for para in doc.paragraphs)

    raise ValueError("Unsupported file format. Please use .txt, .pdf, or .docx files.")

# Function to perform content analysis
def content_analysis(text):
    """Summarise the vocabulary used in ``text``.

    Args:
        text: The document text to analyse.

    Returns:
        tuple: ``(top_words, emotional_words)``. ``top_words`` is a dict
        mapping the ten most frequent alphabetic spaCy tokens to their
        counts. ``emotional_words`` is a list of every word that TextBlob
        tags as ``JJ`` or ``RB``.
    """
    # Keep only purely alphabetic tokens for the frequency count.
    alpha_tokens = []
    for token in nlp(text):
        if token.is_alpha:
            alpha_tokens.append(token.text)
    top_words = pd.Series(alpha_tokens).value_counts().head(10).to_dict()

    # Collect the words carrying one of the selected part-of-speech tags.
    wanted_tags = ['JJ', 'RB']
    tagged_words = TextBlob(text).tags
    emotional_words = [word for word, tag in tagged_words if tag in wanted_tags]

    return top_words, emotional_words

# ... (previous code) ...

# Function to perform sentiment analysis with truncation
def sentiment_analysis_func(text, max_length=512):
    """Score the sentiment of ``text`` with TextBlob and the transformer pipeline.

    Args:
        text: The document text to analyse.
        max_length: Maximum number of *characters* of ``text`` handed to the
            transformer pipeline. TextBlob always sees the full text.

    Returns:
        tuple: ``(polarity, subjectivity, transformer_sentiment)`` where the
        first two come from ``TextBlob(text).sentiment`` and the last is the
        value returned by the ``sentiment_analysis`` pipeline.
    """
    blob_sentiment = TextBlob(text).sentiment
    polarity = blob_sentiment.polarity
    subjectivity = blob_sentiment.subjectivity

    # A character slice keeps the input small but does not bound the number
    # of tokens. truncation=True also lets the tokenizer cut the sequence at
    # the model's own limit, so a larger max_length or token-dense text
    # cannot overflow the model input.
    truncated_text = text[:max_length]
    transformer_sentiment = sentiment_analysis(truncated_text, truncation=True)

    return polarity, subjectivity, transformer_sentiment

# ... (rest of the code) ...

# Function to perform topic modeling
def topic_modeling(text, n_topics=1, n_top_words=10):
    """Fit an LDA model on ``text`` and list the top words of each topic.

    Args:
        text: The document text; it is treated as a single document.
        n_topics: Number of LDA components to fit (default 1).
        n_top_words: Number of highest-weighted words to report per topic
            (default 10).

    Returns:
        dict: Maps ``"Topic <i>"`` to the list of that topic's top words,
        ordered from highest to lowest weight.
    """
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform([text])

    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(X)

    # Build the vocabulary array once instead of once per selected word.
    feature_names = vectorizer.get_feature_names_out()

    topic_words = {}
    for i, topic in enumerate(lda.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:n_top_words]
        topic_words[f"Topic {i}"] = [feature_names[j] for j in top_indices]
    return topic_words

# Function to perform Named Entity Recognition (NER)
def named_entity_recognition(text):
    """Group the named entities spaCy finds in ``text`` by entity label.

    Args:
        text: The document text to analyse.

    Returns:
        dict: Maps each entity label to the list of entity texts carrying
        that label, in order of appearance (duplicates are kept).
    """
    grouped = {}
    for entity in nlp(text).ents:
        grouped.setdefault(entity.label_, []).append(entity.text)
    return grouped

# Function for dependency parsing to detect metaphors and framing structures
def dependency_parsing(text):
    """Collect sentences that look like metaphors or framing statements.

    A sentence is reported as a metaphor candidate when its text contains
    " like " or " as ". It is reported as a frame when any of its tokens has
    the lemma "freedom" or "control". A sentence can appear in both lists.

    Args:
        text: The document text to analyse.

    Returns:
        tuple: ``(metaphors, frames)``, two lists of sentence texts in
        document order.
    """
    comparison_markers = (" like ", " as ")
    frame_lemmas = ["freedom", "control"]

    metaphors, frames = [], []
    for sentence in nlp(text).sents:
        if any(marker in sentence.text for marker in comparison_markers):
            metaphors.append(sentence.text)
        # any() stops at the first matching token, so each sentence is
        # added to frames at most once.
        if any(token.lemma_ in frame_lemmas for token in sentence):
            frames.append(sentence.text)
    return metaphors, frames

# Function to save results to files
def save_results_to_files(results, output_prefix="analysis_results"):
    """Write the analysis results to text, CSV and JSON files.

    Four files are produced, all named from ``output_prefix``:
    ``<prefix>.txt`` (every section, one after another),
    ``<prefix>_word_frequency.csv``, ``<prefix>_entities.csv`` and
    ``<prefix>.json``.

    Args:
        results: Dict of section name to content. It must contain the keys
            "Word Frequency" and "Named Entities", and must be serialisable
            by ``json.dump``.
        output_prefix: Path prefix shared by all output files.
    """
    # Plain-text report: a header line, the content, then a blank line.
    with open(f"{output_prefix}.txt", "w", encoding="utf-8") as report_file:
        for section in results:
            print(f"=== {section} ===", file=report_file)
            print(f"{results[section]}", end="\n\n", file=report_file)

    # Tabular sections go to CSV, one row per dictionary key.
    word_freq_table = pd.DataFrame.from_dict(
        results["Word Frequency"], orient='index', columns=["Frequency"]
    )
    word_freq_table.to_csv(f"{output_prefix}_word_frequency.csv")

    entity_table = pd.DataFrame.from_dict(results["Named Entities"], orient='index')
    entity_table.to_csv(f"{output_prefix}_entities.csv")

    # The full structure goes to JSON.
    with open(f"{output_prefix}.json", "w", encoding="utf-8") as json_file:
        json.dump(results, json_file, indent=4)

# Running all analyses
def run_cognitive_linguistic_analysis(file_path):
    """Run every analysis on one document and save the combined results.

    The text is read with ``read_text_from_file``, passed through each
    analysis function in turn, and the collected results are written out by
    ``save_results_to_files`` using its default output prefix.

    Args:
        file_path: Path to the .txt, .pdf or .docx document to analyse.
    """
    document_text = read_text_from_file(file_path)

    # Each analysis works on the same full text.
    top_words, emotional_words = content_analysis(document_text)
    polarity, subjectivity, transformer_sentiment = sentiment_analysis_func(document_text)
    topic_words = topic_modeling(document_text)
    named_entities = named_entity_recognition(document_text)
    metaphors, frames = dependency_parsing(document_text)

    # Section order here is the order used in the saved report.
    results = {}
    results["Word Frequency"] = top_words
    results["Emotional Words"] = emotional_words
    results["Polarity"] = polarity
    results["Subjectivity"] = subjectivity
    results["Transformer Sentiment"] = transformer_sentiment
    results["Topics"] = topic_words
    results["Named Entities"] = named_entities
    results["Metaphors"] = metaphors
    results["Frames"] = frames

    save_results_to_files(results)
    print("Results saved to files.")

# Specify the path to your file (PDF, DOCX, or TXT).
# NOTE: this re-assigns the same value already given to file_path at the top
# of this cell.
file_path = "/content/drive/MyDrive/Colab/Conflict_In_Cameroon.pdf"  # Replace with the path to your file

# Run the analysis on the file. The function prints a confirmation and
# returns nothing; the results are written to files by save_results_to_files.
run_cognitive_linguistic_analysis(file_path)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_d

Results saved to files.


In [19]:
!pip install -U textblob
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


In [20]:
# Location of the document to analyse on the mounted Google Drive.
# Point this at your own .txt, .pdf or .docx file.
file_path = "/content/drive/MyDrive/Colab/Conflict_In_Cameroon.pdf"

# Step 1: Download required NLTK resources for TextBlob
import nltk

# Resources are fetched in the order listed. 'punkt_tab' was added after
# being reported as a missing resource.
_NLTK_RESOURCES = (
    'punkt',
    'averaged_perceptron_tagger',
    'wordnet',
    'brown',
    'omw-1.4',
    'universal_tagset',
    'punkt_tab',
)
for _resource in _NLTK_RESOURCES:
    nltk.download(_resource)


# Step 2: Proceed with other imports and script setup
import spacy
from textblob import TextBlob
from transformers import pipeline
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import defaultdict
import fitz  # PyMuPDF for PDFs
from docx import Document  # For Word files
import json

# ... (rest of the code remains the same)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [13]:
python -m textblob.download_corpora

SyntaxError: invalid syntax (<ipython-input-13-bec4ba3f7ac1>, line 1)

In [17]:
# Run TextBlob's corpus downloader as a shell command. The recorded output
# shows it fetching NLTK packages such as brown, punkt, wordnet, conll2000
# and movie_reviews.
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [14]:
!pip uninstall -y textblob
!pip install textblob



Found existing installation: textblob 0.17.1
Uninstalling textblob-0.17.1:
  Successfully uninstalled textblob-0.17.1
Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.3/626.3 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: textblob
Successfully installed textblob-0.18.0.post0


In [3]:
import spacy
# Download and install the en_core_web_sm model package. The recorded output
# advises restarting the runtime afterwards so the package's dependencies are
# loaded.
spacy.cli.download("en_core_web_sm")


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
import spacy
# Load the en_core_web_sm model into `nlp`, the name the analysis functions
# in this notebook call to parse text.
nlp = spacy.load("en_core_web_sm")
