In [None]:
import csv
import os
from collections import Counter
from urllib.parse import quote

import nltk
import numpy as np
import pandas as pd
import requests
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Load the spaCy `en_core_web_sm` pipeline once for the whole notebook.
nlp = spacy.load('en_core_web_sm')

# WordsAPI (served through RapidAPI) endpoint template; `{word}` is filled in
# per lookup by `get_word_info`.
url = "https://wordsapiv1.p.rapidapi.com/words/{word}"

# The API key is a secret and must not be committed with the notebook, so it is
# read from the RAPIDAPI_KEY environment variable. When the variable is unset
# an empty key is sent; presumably the API then answers with a non-200 status,
# which `get_word_info` maps to `{}`.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked/rotated in the RapidAPI dashboard.
headers = {
    'x-RapidAPI-Key': os.environ.get("RAPIDAPI_KEY", ""),
    'x-RapidAPI-Host': "wordsapiv1.p.rapidapi.com"
}


def get_word_info(word, timeout=10):
    """Fetch the WordsAPI record for a single word.

    Args:
        word: The word to look up. It is percent-encoded before being placed
            in the URL path, so characters such as ``/`` or ``?`` cannot
            change which endpoint is requested.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body when the API answers with HTTP 200; otherwise an
        empty dict. An empty dict is also returned when the request fails at
        the network level or the body is not valid JSON, so one bad lookup
        does not abort a long batch of lookups.
    """
    lookup_url = url.format(word=quote(word, safe=''))
    try:
        response = requests.get(lookup_url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError):
        # Connection errors, timeouts and undecodable bodies are reported the
        # same way as a non-200 status: "no information for this word".
        return {}
    return {}


In [None]:
def analyze_sentence(sentence):
    """Score a sentence by the average word frequency of its content words.

    The sentence is tokenized, English stopwords and non-alphabetic tokens are
    dropped, and each remaining token is looked up with `get_word_info`. The
    `frequency` field of each non-empty result is collected (the original
    author treats it as a Zipf score: higher means a more common word).

    Args:
        sentence: The text to analyze.

    Returns:
        A dict with three keys:
            "avg_zipf_score": mean of the collected frequency values, or 0
                when none were collected.
            "commonality_score": ``1 / (1 + avg_zipf_score)``, or 0 when the
                average is not positive.
            "adjusted_score": ``1 - commonality_score``. Note that this is 1
                when no frequency data was found at all, because the
                commonality score is forced to 0 in that case.
    """
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [
        word for word in word_tokenize(sentence)
        if word.isalpha() and word.lower() not in stop_words
    ]

    # Each lookup is a network round trip, so remember results per distinct
    # token and reuse them when a token repeats within the sentence. Tokens
    # are keyed exactly as written (case-sensitive), matching what is sent to
    # the API.
    info_by_word = {}
    zipf_scores = []
    for word in filtered_tokens:
        if word not in info_by_word:
            info_by_word[word] = get_word_info(word)
        word_info = info_by_word[word]
        if word_info:
            # A word the API knows but that has no `frequency` field counts
            # as 0; a word with no API result at all is skipped entirely.
            zipf_scores.append(word_info.get('frequency', 0))

    # Average frequency over the words that returned data.
    avg_zipf_score = sum(zipf_scores) / len(zipf_scores) if zipf_scores else 0
    commonality_score = 1 / (1 + avg_zipf_score) if avg_zipf_score > 0 else 0
    adjusted_score = 1 - commonality_score

    return {
        "avg_zipf_score": avg_zipf_score,
        "commonality_score": commonality_score,
        "adjusted_score": adjusted_score
    }

In [None]:
# Read the per-message analysis table from disk (decoded as latin-1) and
# preview its first five rows.
spam_ham = pd.read_csv("spam_ham_subset_analysis.csv", encoding="latin-1")
spam_ham.head(5)

Unnamed: 0,v1,v2,filtered_tokens,zipf_scores,avg_zipf_score,commonality_score,adjusted_score,lexical_diversity,sentence_length,combined_score,combined_adjusted_score
0,0,"Go until jurong point, crazy.. Available only ...","Go, jurong, point, crazy, Available, bugis, n,...","[6.62, 5.33, 5.47, 4.4, 5.14, 5.89, 5.74, 4.91...",4.96,0.17,0.83,0.67,24,0.57,0.7
1,0,Ok lar... Joking wif u oni...,"Ok, lar, Joking, wif, u, oni","[5.86, 2.5, 4.55, 5.09, 2.77]",4.15,0.19,0.81,0.75,8,0.64,0.76
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"Free, entry, wkly, comp, win, FA, Cup, final, ...","[5.3, 4.08, 0.0, 3.05, 5.16, 3.58, 4.73, 4.73,...",4.15,0.19,0.81,0.49,37,0.43,0.55
3,0,U dun say so early hor... U c already then say...,"U, dun, say, early, hor, U, c, already, say","[5.09, 2.91, 6.21, 5.07, 2.08, 5.09, 5.37, 5.6...",4.85,0.17,0.83,0.54,13,0.46,0.6
4,0,"Nah I don't think he goes to usf, he lives aro...","Nah, think, goes, usf, lives, around, though","[4.41, 6.38, 5.33, 5.17, 5.79, 5.28]",5.39,0.16,0.84,0.47,15,0.4,0.54


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# columns of the loaded table.
spam_ham.describe()

Unnamed: 0,v1,avg_zipf_score,commonality_score,adjusted_score,lexical_diversity,sentence_length,combined_score,combined_adjusted_score
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,0.2,4.9442,0.1696,0.8304,0.4266,21.84,0.375,0.5076
std,0.404061,0.506461,0.014978,0.014978,0.107658,11.564125,0.086714,0.08513
min,0.0,3.49,0.15,0.78,0.2,5.0,0.19,0.33
25%,0.0,4.57,0.16,0.82,0.3625,11.25,0.3225,0.4525
50%,0.0,4.955,0.17,0.83,0.43,22.0,0.37,0.51
75%,0.0,5.3775,0.18,0.84,0.4775,31.0,0.4175,0.54
max,1.0,5.7,0.22,0.85,0.75,48.0,0.64,0.76


In [None]:
# Print summary statistics for every column, computed separately for each
# value of the `v1` label (presumably 0 = ham, 1 = spam; confirm against the
# dataset). One table is printed per column to keep the output readable.
spam_ham_by_v1 = spam_ham.groupby('v1')

for column in spam_ham.columns:
    print(f"Column: {column}")
    print(spam_ham_by_v1[column].describe())
    print("\n")

Column: v1
    count  mean  std  min  25%  50%  75%  max
v1                                           
0    40.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0
1    10.0   1.0  0.0  1.0  1.0  1.0  1.0  1.0


Column: v2
   count unique                                                top freq
v1                                                                     
0     40     40  Go until jurong point, crazy.. Available only ...    1
1     10     10  Free entry in 2 a wkly comp to win FA Cup fina...    1


Column: filtered_tokens
   count unique                                                top freq
v1                                                                     
0     40     40  Go, jurong, point, crazy, Available, bugis, n,...    1
1     10     10  Free, entry, wkly, comp, win, FA, Cup, final, ...    1


Column: zipf_scores
   count unique                                                top freq
v1                                                                     
0     40     40  [6.62, 