In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import os
import pandas as pd
from nltk.tokenize import word_tokenize,sent_tokenize
import string
import re

In [14]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:


# Path to your project folder in Google Drive
project_folder = '/content/drive/My Drive/NLP Project/NLP Test Assignment'

# List all files in the folder
for filename in os.listdir(project_folder):
    print(filename)

Output Data Structure.xlsx
Input.xlsx
Text Analysis.docx
Objective.docx
StopWords
MasterDictionary
Extracted_text


In [4]:
# Load stop words
def load_stop_words(stopwords_dir):
    stopwords = set()
    for filename in os.listdir(stopwords_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(stopwords_dir, filename), 'r', encoding='iso-8859-1') as file:
                for line in file:
                    stopwords.add(line.strip().lower())
    return stopwords

# Path to the StopWords folder
stopwords_dir = os.path.join(project_folder, 'StopWords')
stopwords = load_stop_words(stopwords_dir)

In [5]:
# load master dictionary
def load_master_dictionary(master_dict_dir):
    positive_words = set()
    negative_words = set()

    with open(os.path.join(master_dict_dir, 'positive-words.txt'), 'r', encoding='iso-8859-1') as file:
        for line in file:
            if line.strip() and not line.startswith(';'):
                positive_words.add(line.strip().lower())

    with open(os.path.join(master_dict_dir, 'negative-words.txt'), 'r', encoding='iso-8859-1') as file:
        for line in file:
            if line.strip() and not line.startswith(';'):
                negative_words.add(line.strip().lower())

    return positive_words, negative_words


master_dict_dir = os.path.join(project_folder, 'MasterDictionary')
positive_words, negative_words = load_master_dictionary(master_dict_dir)

In [6]:
def clean_text(text, stopwords):
    tokens = word_tokenize(text)
    cleaned_tokens = [token.lower() for token in tokens if token.lower() not in stopwords and token.isalpha()]
    return cleaned_tokens

In [7]:
def calculate_scores(tokens, positive_words, negative_words):
    positive_score = sum(1 for token in tokens if token in positive_words)
    negative_score = sum(1 for token in tokens if token in negative_words)

    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(tokens) + 0.000001)

    return positive_score, negative_score, polarity_score, subjectivity_score

In [8]:
def syllable_count(word):
    word = word.lower()
    count = len(re.findall(r'[aeiouy]', word))
    if word.endswith(('es', 'ed')) and len(word) > 2:
        count -= 1
    return count if count > 0 else 1

In [9]:
def analyze_readability(text):
    sentences = sent_tokenize(text)
    words = [word for word in word_tokenize(text) if word.isalpha()]

    avg_sentence_length = len(words) / len(sentences)
    complex_word_count = sum(1 for word in words if syllable_count(word) > 2)
    percentage_complex_words = complex_word_count / len(words)
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    return avg_sentence_length, percentage_complex_words, fog_index, complex_word_count, len(words)


In [10]:
def count_personal_pronouns(text):
    pronouns = re.findall(r'\b(I|we|my|ours|us)\b', text, re.IGNORECASE)
    return len(pronouns)


In [11]:
def average_word_length(text):
    words = [word for word in word_tokenize(text) if word.isalpha()]
    return sum(len(word) for word in words) / len(words)

In [12]:
# Output DataFrame
output_columns = [
    'URL_ID','POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX','AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
    'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
]
output_df = pd.DataFrame(columns=output_columns)

In [15]:
extracted_text_dir = os.path.join(project_folder,'Extracted_text')

# List all files in the folder
for filename in os.listdir(extracted_text_dir):
  url_id = filename[:-4]
  text_path = os.path.join(extracted_text_dir,filename)
  with open(text_path,'r',encoding='iso-8859-1') as text:
    text_content = text.read()
  cleaned_tokens = clean_text(text_content,stopwords)
  positive_score, negative_score, polarity_score, subjectivity_score = calculate_scores(cleaned_tokens, positive_words, negative_words)
  avg_sentence_length, perc_complex_words, fog_index, complex_word_count, word_count = analyze_readability(text_content)
  personal_pronouns = count_personal_pronouns(text_content)
  avg_word_length = average_word_length(text_content)

  # creat new row with data
  new_row = pd.DataFrame({
            'URL_ID': [url_id],
            'POSITIVE SCORE': [positive_score],
            'NEGATIVE SCORE': [negative_score],
            'POLARITY SCORE': [polarity_score],
            'SUBJECTIVITY SCORE': [subjectivity_score],
            'AVG SENTENCE LENGTH': [avg_sentence_length],
            'PERCENTAGE OF COMPLEX WORDS': [perc_complex_words],
            'FOG INDEX': [fog_index],
            'AVG NUMBER OF WORDS PER SENTENCE': [avg_sentence_length],
            'COMPLEX WORD COUNT': [complex_word_count],
            'WORD COUNT': [word_count],
            'SYLLABLE PER WORD': [sum(syllable_count(word) for word in cleaned_tokens)],
            'PERSONAL PRONOUNS': [personal_pronouns],
            'AVG WORD LENGTH': [avg_word_length]
             })

  # Append the new row to the DataFrame
  output_df = pd.concat([output_df, new_row], ignore_index=True)





In [16]:
output_df

Unnamed: 0,URL_ID,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,bctech2011,130,43,0.502890,0.102976,13.915254,0.454730,5.747994,13.915254,1120,2463,5158,2,6.395859
1,bctech2012,19,5,0.583333,0.070796,9.038462,0.376596,3.766023,9.038462,177,470,998,1,6.527660
2,bctech2013,17,9,0.307692,0.062500,16.371429,0.349040,6.688187,16.371429,200,573,1122,1,6.153578
3,bctech2014,11,5,0.375000,0.047478,8.603774,0.410088,3.605545,8.603774,187,456,962,1,6.377193
4,bctech2015,15,1,0.875000,0.046243,19.137931,0.317117,7.782019,19.137931,176,555,993,1,5.821622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,bctech2153,12,9,0.142857,0.040698,25.411765,0.290509,10.280910,25.411765,251,864,1298,3,5.358796
143,bctech2154,17,17,0.000000,0.044271,23.224138,0.279881,9.401608,23.224138,377,1347,1977,7,5.122494
144,bctech2155,7,10,-0.176471,0.088542,24.937500,0.192982,10.052193,24.937500,77,399,462,14,4.759398
145,bctech2156,1,0,0.999999,0.011111,122.000000,0.319672,48.927869,122.000000,39,122,207,0,5.918033
