## 3

In [7]:
import os
import string
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure the NLTK resources used below are available locally.
# Newer NLTK releases load the 'punkt_tab' data for word_tokenize, while older
# releases load 'punkt'. Fetching both keeps the notebook runnable on either;
# downloading a package that is already present is a no-op.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Set of English stopwords (a set gives fast membership tests during filtering)
stop_words = set(stopwords.words('english'))

def preprocess_review(text):
    """
    Turn raw review text into a list of cleaned, lower-cased tokens.

    The text is split with NLTK's ``word_tokenize``. A token is kept only if
    it consists entirely of alphabetic characters (``str.isalpha``), which
    discards punctuation as well as tokens containing digits or apostrophes.
    Kept tokens are lower-cased, and those found in the module-level
    ``stop_words`` set are dropped.

    Returns the surviving tokens in their original order.
    """
    cleaned_tokens = []
    for token in word_tokenize(text):
        # Skip anything that is not purely alphabetic.
        if not token.isalpha():
            continue
        lowered = token.lower()
        # Stopwords are compared after lower-casing.
        if lowered in stop_words:
            continue
        cleaned_tokens.append(lowered)
    return cleaned_tokens

def load_and_preprocess_reviews(directory, preprocess=None):
    """
    Loads and preprocesses all review files in a specified directory.

    Files are visited in sorted file-name order so the result is reproducible
    (``os.listdir`` returns entries in arbitrary order). Entries that are not
    regular files, such as sub-directories, are skipped rather than passed to
    ``open``.

    Args:
        directory: Path of the folder whose files are read as UTF-8 text.
        preprocess: Optional callable applied to each file's text. When
            omitted, ``preprocess_review`` is used.

    Returns:
        A list with one preprocessed result per file, in sorted file-name
        order.
    """
    if preprocess is None:
        preprocess = preprocess_review

    reviews = []
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Preprocess after the file is closed so the handle is not held open
        # for the duration of tokenization.
        reviews.append(preprocess(content))
    return reviews

def create_term_document_matrix(reviews):
    """
    Creates a term count matrix from preprocessed, labelled reviews.

    Args:
        reviews: Iterable of ``(tokens, label)`` pairs, where ``tokens`` is an
            iterable of words and ``label`` is ``'POS'`` or ``'NEG'``.

    Returns:
        A pandas DataFrame indexed by term (in first-seen order) with the
        columns ``'POS'`` and ``'NEG'``. Each cell holds the total number of
        occurrences of the term across all reviews carrying that label, so
        counts are aggregated per label rather than kept per review.

    Raises:
        ValueError: If a label is neither ``'POS'`` nor ``'NEG'``.
    """
    labels = ['POS', 'NEG']
    column_of = {name: position for position, name in enumerate(labels)}
    term_doc_dict = {}

    for review, label in reviews:
        # Reject unknown labels explicitly. Previously such a label produced a
        # [0, 0] row on a term's first occurrence but was added to the 'NEG'
        # column on later occurrences, silently skewing the counts.
        if label not in column_of:
            raise ValueError(
                f"Unknown review label {label!r}; expected one of {labels}"
            )
        column = column_of[label]

        # Counter tallies the occurrences of each word within this review.
        for word, count in Counter(review).items():
            term_doc_dict.setdefault(word, [0] * len(labels))[column] += count

    term_document_matrix = pd.DataFrame.from_dict(term_doc_dict, orient='index', columns=labels)
    return term_document_matrix

# Root folder that holds the 'pos' and 'neg' review directories.
# Set the REVIEWS_DATA_DIR environment variable to point at your own copy of
# the data; when it is unset or empty, the original local download location
# is used.
data_directory = os.environ.get("REVIEWS_DATA_DIR") or r"C:\Users\navna\Downloads\AIM-5001-main"

# Paths to the directories containing positive and negative reviews
pos_directory = os.path.join(data_directory, "pos")
neg_directory = os.path.join(data_directory, "neg")

# Load and preprocess all reviews, then tag each token list with its label
pos_reviews = load_and_preprocess_reviews(pos_directory)
neg_reviews = load_and_preprocess_reviews(neg_directory)
all_reviews = [(review, 'POS') for review in pos_reviews] + [(review, 'NEG') for review in neg_reviews]

# Create the term-document matrix
term_document_matrix = create_term_document_matrix(all_reviews)

# Display part of the matrix
term_document_matrix.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\navna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,POS,NEG
films,877,643
adapted,28,17
comic,209,166
books,49,27
plenty,73,58


## 4

In [11]:
import numpy as np

# Convert the term matrix DataFrame to a NumPy array through its `.values` attribute.
# The bare expression on the last line displays the array as the cell output.
numpy_array = term_document_matrix.values
numpy_array


array([[877, 643],
       [ 28,  17],
       [209, 166],
       ...,
       [  0,   1],
       [  0,   1],
       [  0,   1]], dtype=int64)

In [12]:
# More explicit way to convert the DataFrame to a NumPy array: the `to_numpy()` method.
# The bare expression on the last line displays the array as the cell output.
numpy_array = term_document_matrix.to_numpy()
numpy_array


array([[877, 643],
       [ 28,  17],
       [209, 166],
       ...,
       [  0,   1],
       [  0,   1],
       [  0,   1]], dtype=int64)

In [13]:
# Report the dimensions of the array built from the term matrix.
print(f"Shape of the array: {numpy_array.shape}")

# Add up every cell: the number of tokens that survived preprocessing,
# summed over both label columns.
total_count = np.sum(numpy_array)
print(f"Total count of all words in all reviews: {total_count}")


Shape of the array: (37964, 2)
Total count of all words in all reviews: 678727


## 5

In [14]:
# Rebuild the array from the DataFrame so this cell starts from the matrix itself.
numpy_array = term_document_matrix.to_numpy()

# Number of cells in the array, and how many of them hold a zero count.
total_entries = numpy_array.size
zero_count = int(np.sum(numpy_array == 0))

# Sparsity = share of zero-valued cells, expressed as a percentage.
sparsity_percentage = (zero_count / total_entries) * 100

print(f"Sparsity of the matrix: {sparsity_percentage:.2f}%")

Sparsity of the matrix: 25.88%


In [None]:
# import zipfile
# import os

# # Path to the ZIP file
# zip_path = '/mnt/data/AIM-5001-main (2).zip'
# extract_folder = '/mnt/data/movie_reviews'

# # Extract the ZIP file
# with zipfile.ZipFile(zip_path, 'r') as zip_ref:
#     zip_ref.extractall(extract_folder)

# # List the contents of the extracted directory to understand its structure
# os.listdir(extract_folder)


In [None]:
# # Checking the further contents within the extracted main directory to find the review directories
# os.listdir(f"{extract_folder}/AIM-5001-main")


In [None]:
# # Function to read reviews from a directory and preprocess them
# def load_and_preprocess_reviews(directory):
#     reviews = []
#     file_names = os.listdir(directory)
#     for file_name in file_names:
#         file_path = os.path.join(directory, file_name)
#         with open(file_path, 'r', encoding='utf-8') as file:
#             content = file.read()
#             preprocessed_content = preprocess_review(content)
#             reviews.append(preprocessed_content)
#     return reviews

# # Load and preprocess a few reviews from both directories to test
# pos_reviews_sample = load_and_preprocess_reviews(f"{extract_folder}/AIM-5001-main/pos")[:2]  # Load first 2 for testing
# neg_reviews_sample = load_and_preprocess_reviews(f"{extract_folder}/AIM-5001-main/neg")[:2]  # Load first 2 for testing

# pos_reviews_sample, neg_reviews_sample


In [None]:
# import pandas as pd

# # Function to load all reviews and return them with labels
# def load_all_reviews(pos_directory, neg_directory):
#     pos_reviews = load_and_preprocess_reviews(pos_directory)
#     neg_reviews = load_and_preprocess_reviews(neg_directory)
#     # Create a list of tuples (review, label)
#     labeled_reviews = [(review, 'POS') for review in pos_reviews] + [(review, 'NEG') for review in neg_reviews]
#     return labeled_reviews

# # Load all reviews from the directories
# all_reviews = load_all_reviews(f"{extract_folder}/AIM-5001-main/pos", f"{extract_folder}/AIM-5001-main/neg")

# # Construct a dictionary to count occurrences of each word in each review
# def create_term_document_matrix(reviews):
#     # Create a dictionary to hold the term-document data
#     term_doc_dict = {}
    
#     for review, label in reviews:
#         # Initialize a temporary dictionary to count words in the current review
#         word_count = {}
#         for word in review:
#             if word not in word_count:
#                 word_count[word] = 1
#             else:
#                 word_count[word] += 1
        
#         # Add the word counts to the main dictionary, handling multiple reviews
#         for word, count in word_count.items():
#             if word not in term_doc_dict:
#                 term_doc_dict[word] = [count if label == lbl else 0 for lbl in ['POS', 'NEG']]
#             else:
#                 term_doc_dict[word][0 if label == 'POS' else 1] += count
    
#     # Convert dictionary to DataFrame
#     term_document_matrix = pd.DataFrame.from_dict(term_doc_dict, orient='index', columns=['POS', 'NEG'])
    
#     return term_document_matrix

# # Generate the term-document matrix
# term_document_matrix = create_term_document_matrix(all_reviews)
# term_document_matrix.head()
