In [23]:
import json

with open("words_dictionary.json", "r") as words_file:
    most_words = json.load(words_file)

most_words = set(most_words.keys())
most_words

{'absohm',
 'childes',
 'misapprehension',
 'demure',
 'cicatriser',
 'prosecting',
 'supertemptation',
 'cimbri',
 'slicer',
 'footfault',
 'recrudescence',
 'homostylic',
 'congratulator',
 'cervelases',
 'subtotalled',
 'underclay',
 'chlorite',
 'oilberries',
 'radiolysis',
 'bouquetin',
 'rebaptismal',
 'recovered',
 'unnomadic',
 'melissa',
 'plashing',
 'adenin',
 'unfurnish',
 'antiradically',
 'reub',
 'dy',
 'billitonite',
 'aviatress',
 'rupturing',
 'hemorrhodin',
 'intrenchant',
 'potatory',
 'guepard',
 'decentralizing',
 'artocarpus',
 'mancipative',
 'taxying',
 'deliquesced',
 'zipa',
 'sodalites',
 'desi',
 'smoodge',
 'zoogamy',
 'truantcy',
 'imbordure',
 'enhancive',
 'grun',
 'nawab',
 'bloodshot',
 'implicative',
 'bedirter',
 'gerunds',
 'yellower',
 'halogenoid',
 'matrilineage',
 'autonomize',
 'sich',
 'haikai',
 'cryophorus',
 'shillyshallyer',
 'taurobolium',
 'commissioners',
 'hemiataxy',
 'bismark',
 'empathizes',
 'summitry',
 'trameled',
 'biajaiba',
 

In [36]:
with open("words.txt", "r") as words_file:
    common_words = words_file.read().splitlines()
common_words

['the',
 'of',
 'and',
 'to',
 'a',
 'in',
 'for',
 'is',
 'on',
 'that',
 'by',
 'this',
 'with',
 'i',
 'you',
 'it',
 'not',
 'or',
 'be',
 'are',
 'from',
 'at',
 'as',
 'your',
 'all',
 'have',
 'new',
 'more',
 'an',
 'was',
 'we',
 'will',
 'home',
 'can',
 'us',
 'about',
 'if',
 'page',
 'my',
 'has',
 'search',
 'free',
 'but',
 'our',
 'one',
 'other',
 'do',
 'no',
 'information',
 'time',
 'they',
 'site',
 'he',
 'up',
 'may',
 'what',
 'which',
 'their',
 'news',
 'out',
 'use',
 'any',
 'there',
 'see',
 'only',
 'so',
 'his',
 'when',
 'contact',
 'here',
 'business',
 'who',
 'web',
 'also',
 'now',
 'help',
 'get',
 'pm',
 'view',
 'online',
 'c',
 'e',
 'first',
 'am',
 'been',
 'would',
 'how',
 'were',
 'me',
 's',
 'services',
 'some',
 'these',
 'click',
 'its',
 'like',
 'service',
 'x',
 'than',
 'find',
 'price',
 'date',
 'back',
 'top',
 'people',
 'had',
 'list',
 'name',
 'just',
 'over',
 'state',
 'year',
 'day',
 'into',
 'email',
 'two',
 'health',
 '

In [43]:
def parse_text_words(text: str, stop_words: set[str] = {}) -> list[str]:
    words = (
        text.lower()
        .replace(".", "")
        .replace(",", "")
        .replace("!", "")
        .replace("?", "")
        .replace("-", " ")
        .replace("'s", "")
        .replace("'", "")
        .split()
    )
    
    final_words = []
    for word in words:
        if word not in stop_words:
            final_words.append(word)
    
    return final_words

In [94]:
texts = [
    "The study's limitations are openly discussed, recognizing potential shortcomings.",
    "i dont care about participant responses, just give me the data",
    "The research findings contribute to the theoretical foundation of the field.",
    "lol ur jokes r so lame"
]

In [78]:
def has_perfect_vocabulary(text: str, words: list[str]):
    local_words = parse_text_words(text)
    for word in local_words:
        if word not in words:
            return False
    
    return True

In [101]:
outputs = [f"{text} -> {has_perfect_vocabulary(text, most_words)}" for text in texts]
print("\n".join(outputs))

The study's limitations are openly discussed, recognizing potential shortcomings. -> True
i dont care about participant responses, just give me the data -> True
The research findings contribute to the theoretical foundation of the field. -> True
lol ur jokes r so lame -> False


In [69]:
def complexity_index(text: str, common_words: list[str], shift_right: int = 30):
    stop_words, words = common_words[:shift_right], common_words[shift_right:]
    local_words = parse_text_words(
        text, stop_words=stop_words
    )  # Ignore the 30 most common words which are mostly prepositions

    words_hash = set(words)
    words_size = len(words)

    total_score = 0
    not_appeared = 0
    for local_word in local_words:
        score = 0
        if local_word in words_hash:
            # 0 if this word is the most common of all
            # 1 if it is the rarest
            score = common_words.index(local_word) / words_size
        else:
            not_appeared += 1

        total_score += score
    
    # We are ignoring words which did not appear because they could be
    # a vocabulary mistake
    return total_score / (len(local_words) - not_appeared)


In [102]:
outputs = [f"{text} -> {complexity_index(text, common_words)}" for text in texts]
print("\n".join(outputs))

The study's limitations are openly discussed, recognizing potential shortcomings. -> 0.2493410381184104
i dont care about participant responses, just give me the data -> 0.16391817608362622
The research findings contribute to the theoretical foundation of the field. -> 0.2517910246012436
lol ur jokes r so lame -> 0.3321573398215734


### Sentiment Analysis

In [84]:
import nltk

nltk.download(["vader_lexicon"])

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mateusoliveirasantos/nltk_data...


True

In [85]:
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

In [104]:
outputs = [f"{text} -> {sia.polarity_scores(text)['neg']}" for text in texts]
print("\n".join(outputs))

The study's limitations are openly discussed, recognizing potential shortcomings. -> 0.0
i dont care about participant responses, just give me the data -> 0.226
The research findings contribute to the theoretical foundation of the field. -> 0.0
lol ur jokes r so lame -> 0.347


In [105]:
outputs = [f"{text} -> {sia.polarity_scores(text)['neu']}" for text in texts]
print("\n".join(outputs))

The study's limitations are openly discussed, recognizing potential shortcomings. -> 1.0
i dont care about participant responses, just give me the data -> 0.774
The research findings contribute to the theoretical foundation of the field. -> 1.0
lol ur jokes r so lame -> 0.192


In [106]:
outputs = [f"{text} -> {sia.polarity_scores(text)['pos']}" for text in texts]
print("\n".join(outputs))

The study's limitations are openly discussed, recognizing potential shortcomings. -> 0.0
i dont care about participant responses, just give me the data -> 0.0
The research findings contribute to the theoretical foundation of the field. -> 0.0
lol ur jokes r so lame -> 0.461
