In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# read from this Google Colab session.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
n = 100 # number of words per output segment

# Read the whole book as one string, turning line breaks into spaces.
with open('/content/drive/MyDrive/75texts/Οι κόρες της Αφροδίτης - Μιχάλης Πιτένης.pdf.v2.txt', "r", encoding='utf8') as file:
  single_line = file.read().replace("\n", " ")

# str.split() without an argument splits on any run of whitespace and never
# yields empty strings. Splitting on a single " " would turn every double space
# or blank line into an empty "word" that still counts towards the n-word limit.
words = single_line.split()

# Consecutive chunks of n words; only the final chunk can be shorter than n.
parts = [words[i:i+n] for i in range(0, len(words), n)]

# Quick sanity checks (guarded so an empty input file does not raise IndexError).
print("Initial split:", parts[0] if parts else [])
print("All parts:", len(parts))
print("Last part:", len(parts[-1]) if parts else 0)

# Keep only complete segments of exactly n words. The final chunk is dropped
# only when it is actually incomplete, not when the word count happens to be an
# exact multiple of n.
all_files = [" ".join(part) for part in parts if len(part) == n]


In [None]:
import os

# Folder (relative to the current working directory) that will hold one .txt
# file per text segment.
path = "./aphrodite_text_splits/"
# exist_ok=True makes the cell safe to re-run: no error if the folder already exists.
os.makedirs(path, exist_ok=True)

In [None]:
path = "./aphrodite_text_splits/"

# Write every segment created previously to its own text file.
for i, f in enumerate(all_files):
  file_name = path+"file_"+ str(i) +".txt" # e.g. ./aphrodite_text_splits/file_0.txt
  # The context manager closes (and therefore flushes) each file as soon as it
  # is written, so the folder is complete on disk before it is zipped later.
  # The encoding is explicit because the text is Greek.
  with open(file_name, "w", encoding="utf8") as file:
    file.write(f)
  print(f"Saved:{file_name}")

In [None]:
# Calculate h-point from a list of word frequencies
def h_point(frequencies):
  """Return the h-point of a rank-frequency distribution.

  The frequencies are ranked in descending order (rank 1 = most frequent).
  The h-point is the rank r whose frequency equals r. When no rank matches
  exactly, it is interpolated between the last rank whose frequency is above
  the rank and the first rank whose frequency is below it.

  Args:
    frequencies: iterable of word frequencies (expected to be positive
      counts), in any order. The input is not modified.

  Returns:
    The matching rank (int) on an exact match, otherwise the interpolated
    value (float). Returns 0 for an empty input or when even the largest
    frequency is below 1.
  """
  # Rank on a copy so the caller's list keeps its original order.
  ranked = sorted(frequencies, reverse=True)

  # No rank can reach its own frequency: there is nothing to interpolate from.
  if not ranked or ranked[0] < 1:
    return 0

  # A frequency of 0 is appended at rank V+1 so that the rank always overtakes
  # the frequency eventually. Without it, a list in which every frequency is
  # larger than its rank would leave the result undefined.
  extended = ranked + [0]

  for rank, freq in enumerate(extended, start=1):
    # frequency equal to rank: exact h-point
    if rank == freq:
      return rank

    # frequency dropped below rank: interpolate between the two neighbours
    if rank > freq:
      fr1 = extended[rank - 2]  # previous frequency
      r1 = rank - 1             # previous rank
      fr2 = freq                # current frequency
      r2 = rank                 # current rank
      return ((fr1 * r2) - (fr2 * r1)) / ((r2 - r1) + (fr1 - fr2))

# calculating H-Index
def H_index(frequencies):
  """Return the h-index of a list of frequencies.

  The h-index is the largest integer h such that at least h of the
  frequencies are greater than or equal to h.

  Args:
    frequencies: iterable of frequencies, in any order. The input is not
      modified, so a list the caller has sorted in descending order stays
      that way for later computations.

  Returns:
    The h-index as an int; 0 when no such h exists (including empty input).
  """
  # Sort a copy in ascending order instead of sorting the caller's list.
  ascending = sorted(frequencies)
  total = len(ascending)

  for i, freq in enumerate(ascending):
    # number of frequencies that are >= the current one
    result = total - i

    # the first position where that count fits under the frequency is the h-index
    if result <= freq:
      return result

  return 0

# calculate R1
def R1(frequencies, N, h_index, h_point):
  """Compute the R1 index from the top-ranked frequencies.

  R1 = 1 - (F / N - h_point**2 / (2 * N)), where F is the sum of the
  h_index largest frequencies.

  Args:
    frequencies: word frequencies, in any order.
    N: total number of tokens.
    h_index: how many of the top-ranked frequencies are summed.
    h_point: h-point value used in the correction term. Inside this function
      the parameter shadows the h_point() function.

  Returns:
    The R1 value as a float.
  """
  # Rank the frequencies here so the result does not depend on the order the
  # caller left the list in. The cumulative sum must cover the most frequent
  # words, not whichever items happen to come first.
  top_ranked = sorted(frequencies, reverse=True)[:h_index]
  cumsum = sum(top_ranked)  # sum of frequencies up to h-index

  # apply the R1 formula using total tokens N and h_point
  R1_res = 1 - ((cumsum/N) - (pow(h_point,2) / (2*N)))

  return R1_res


import math

# calculate entropy of word frequency distribution
def Entropy(frequencies, N):
  """Compute the Shannon entropy (in bits) of a word frequency distribution.

  Args:
    frequencies: iterable of word frequencies.
    N: total number of tokens; each frequency is divided by N to obtain the
      probability of the corresponding word.

  Returns:
    -sum(p_i * log2(p_i)) over all frequencies. Zero frequencies contribute
    nothing to the sum.
  """
  entr_sum = 0  # running sum of p_i * log2(p_i)

  for freq in frequencies:
    # p * log2(p) tends to 0 as p tends to 0, and math.log2(0) raises a
    # ValueError, so zero counts are skipped.
    if freq == 0:
      continue
    p_i = freq / N  # probability of this word
    entr_sum += p_i * math.log2(p_i)

  return -entr_sum  # entropy is the negative of the sum

# calculate the Lambda - measures variation in word usage frequency
def Lambda_func(frequencies, N):
  """Compute the Lambda index of a frequency sequence.

  For every pair of neighbouring frequencies, the value
  sqrt((f_i - f_{i+1})**2 + 1) is added up; the total is then multiplied by
  log10(N) and divided by N.

  Args:
    frequencies: sequence of word frequencies.
    N: total number of tokens.

  Returns:
    The Lambda value as a float.
  """
  arc_total = 0

  # walk over neighbouring pairs (f_i, f_{i+1})
  for current, following in zip(frequencies, frequencies[1:]):
    arc_total += ((current - following) ** 2 + 1) ** (1/2)

  # scale the accumulated length by log10(N) / N
  return (arc_total * math.log10(N)) / N

# calculate the average token length weighted by frequency
def ATL(dictionary, N):
  """Compute the average token length, weighted by frequency.

  Args:
    dictionary: mapping of word -> number of occurrences.
    N: total number of tokens.

  Returns:
    Sum of len(word) * occurrences over all words, divided by N.
  """
  weighted_length = 0

  # every occurrence of a word contributes that word's length
  for token, count in dictionary.items():
    weighted_length += len(token) * count

  return weighted_length / N

In [None]:
import re
all_indeces = []  # one row [no, TTR, h-point, R1, entropy, lambda, ATL] per text segment

rr_data = {} # file name -> (V, N), kept for the Repeat Rate computed in a later cell

# loop over each text segment and its index
for no, line in enumerate(all_files):
  # NOTE(review): this label is zero-padded (file_007.txt) while the files saved
  # to disk earlier in the notebook are named "file_<i>.txt" without padding.
  file_name = f"file_{no:03d}.txt"

  # word -> number of occurrences in this segment
  d = dict()

  # Normalise the text: trim it, lowercase it to avoid case mismatches, and
  # turn apostrophes and hyphens into spaces.
  line = line.strip().lower()
  line = line.replace("\'", " ").replace("-", " ")

  # Split the line into words
  words = line.split(" ")

  for word in words:
    if word == "":
      continue

    # Strip one surrounding bracket / parenthesis
    if word.startswith('[') or word.startswith('('):
      word = word[1:]
    if word.endswith(']') or word.endswith(')'):
      word = word[:-1]

    # Strip one trailing punctuation mark
    if word.endswith('.') or word.endswith(',') or word.endswith(';') or word.endswith(':'):
      word = word[:-1]

    # A token that consisted only of punctuation (e.g. "(" or ",") is empty by
    # now; counting it would add a bogus empty-string word type.
    if word == "":
      continue

    d[word] = d.get(word, 0) + 1

  # TTR and the other indices divide by N, so a segment without any word token
  # cannot be measured; skip it instead of failing with a ZeroDivisionError.
  if not d:
    print(f"{file_name} → skipped: no word tokens")
    continue

  # Sorting dictionary by word frequency in descending order
  sortedDict = dict(sorted(d.items(), key=lambda item: item[1], reverse=True))

  frequencies = list(sortedDict.values())  # frequencies in rank order (descending)

  V = len(frequencies)  # number of unique words (types)
  N = sum(frequencies)  # total number of words (tokens)
  TTR = V/N  # Type-Token Ratio

  # h_point and H_index receive copies so that `frequencies` is still in
  # descending rank order when R1 sums the top-ranked frequencies.
  h_p = h_point(list(frequencies)) # compute h-point

  h_i = H_index(list(frequencies))  # compute h-index

  r1 = R1(frequencies, N, h_i, h_p) # compute R1

  entr = Entropy(frequencies, N) # compute entropy

  lamb = Lambda_func(frequencies, N) # compute lambda

  atl = ATL(sortedDict, N) # compute average token length

  rr_data[file_name] = (V, N) # save (V, N) for this file to calculate Repeat Rate later in the question 9

  # print all the index values for this file
  print(f"{file_name} → TTR: {TTR:}, H_point: {h_p:}, R1: {r1:}, Entropy: {entr:}, Lambda: {lamb:}, ATL: {atl:}")

  # store the results in a list for this file
  file_index = [no, TTR, h_p, r1, entr, lamb, atl]
  all_indeces.append(file_index) # add this file's results to the main list

In [None]:
import shutil

# Create "aphrodite.zip" from the "aphrodite_text_splits" folder to give it to the professor (requirement).
# The folder is addressed with the same relative path that was used to create
# and fill it, so the archive does not depend on the working directory being /content.
shutil.make_archive("aphrodite", 'zip', "./aphrodite_text_splits")

In [None]:
import pandas as pd
# Column names shared by the CSV header and the DataFrame
labels = ["File_name", "TTR", "H_point", "R1", "Entropy", "Lambda", "ATL"]
print(labels) # column headers
print(all_indeces[2]) # preview: index values of the 3rd file

import csv  # CSV module used to write the results file

csv_filename = "indices_results.csv"

# Write the header row followed by one row per file
with open(csv_filename, 'w', newline="") as csv_file:
    csv.writer(csv_file).writerows([labels] + all_indeces)

# Same rows as a pandas DataFrame, with the labels as column names
df = pd.DataFrame(all_indeces, columns=labels)
print(df)

In [None]:
file_path = "indices_results.csv"

# Read the CSV file back into a DataFrame.
# The file was written by csv.writer from Python floats, so the decimal mark is
# "." (the pandas default). Passing decimal="," would clash with the ","
# column separator and would not match the numbers in the file.
texts_results = pd.read_csv(file_path, sep=',')

# print the shape of the DataFrame (rows, columns)
print("shape:\t", texts_results.shape)
texts_results.head() # show only the first 5 rows, as a compact preview of the table

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Indices to summarise and plot
indicators = ["TTR", "H_point", "R1", "Entropy", "Lambda", "ATL"]

# For every index: print summary statistics, then draw two boxplots
for col in indicators:
  series = df[col]
  print(f"\n-Δείκτης: {col}")

  # smallest value, largest value and median
  print("min\t", series.min())
  print("max\t", series.max())
  print("median\t", series.median())

  # quartiles and the interquartile range between them
  q25, q75 = np.percentile(series, [25, 75])
  iqr = q75 - q25

  print("q25\t", q25)
  print("q75\t", q75)
  print("iqr\t", iqr)

  # fences at 1.5 * IQR beyond the quartiles (used to detect outliers)
  lower_fence = q25 - 1.5 * iqr
  upper_fence = q75 + 1.5 * iqr
  print("‘minimum’\t", lower_fence)
  print("‘maximum’\t", upper_fence)

  # vertical boxplot
  df.boxplot(column=[col])
  plt.title(f"Κάθετο Boxplot για τον δείκτη {col}")
  plt.show()

  # horizontal boxplot of the same index
  df.boxplot(column=[col], vert=False)
  plt.title(f"Οριζόντιο Boxplot για τον δείκτη {col}")
  plt.show()

In [None]:
# Most frequent value(s) of the H_point column
hpoint_values = df["H_point"]
mode_hpoint = hpoint_values.mode()
print("Επικρατούσα τιμή (mode) για τον δείκτη h-point:", mode_hpoint.values)

# Number of occurrences of each distinct H_point value
hpoint_counts = hpoint_values.value_counts()
print(hpoint_counts)

# Bar chart of those counts (optional preview)
hpoint_counts.plot(kind='bar', grid=True)

In [None]:
# Reload all index results from the CSV file
df = pd.read_csv("indices_results.csv")

# Columns that are checked for outliers
columns = ["TTR", "H_point", "R1", "Entropy", "Lambda", "ATL"]

for col in columns:
  print(f"\n-Outliers για τον δείκτη: {col}")

  # quartiles and interquartile range of this column
  q1 = df[col].quantile(0.25)
  q3 = df[col].quantile(0.75)
  iqr = q3 - q1

  # values beyond 1.5 * IQR from the quartiles are treated as outliers
  minimum = q1 - 1.5 * iqr
  maximum = q3 + 1.5 * iqr

  print(f"Q1: {q1:}, Q3: {q3:}, IQR: {iqr:}")
  print(f"Κατώτερο όριο: {minimum:}")
  print(f"Ανώτερο όριο: {maximum:}")

  # rows whose value lies below the lower bound or above the upper bound
  too_low = df[col] < minimum
  too_high = df[col] > maximum
  outliers = df[too_low | too_high]

  # report what was found
  if len(outliers) > 0:
    print(f"Βρέθηκαν {len(outliers)} outliers:")
    print(outliers[['File_name', col]])
  else:
    print("Δεν βρέθηκαν outliers για αυτόν τον δείκτη.")

In [None]:
# load the CSV file with the calculated indices
df = pd.read_csv("indices_results.csv")

indicators = ["TTR", "H_point", "R1", "Entropy", "Lambda", "ATL"]

# one dictionary of summary statistics per indicator
results = []

# loop for each indicator
for col in indicators:
  q1 = df[col].quantile(0.25)
  q3 = df[col].quantile(0.75)
  iqr = q3 - q1

  # bounds at 1.5 * IQR beyond the quartiles
  lower = q1 - 1.5 * iqr
  upper = q3 + 1.5 * iqr

  # keep only the rows whose value lies inside the bounds (outliers excluded)
  no_outliers = df[(df[col] >= lower) & (df[col] <= upper)]

  # calculate statistics on the cleaned data
  mean_value = no_outliers[col].mean()
  median_value = no_outliers[col].median()
  stand_dev_value = no_outliers[col].std()

  # All three statistics are converted to plain Python floats so that
  # print(results) shows them uniformly instead of mixing floats with NumPy
  # scalar representations.
  results.append({
      "Δείκτης": col,
      "Μέση τιμή": float(mean_value),
      "Διάμεση τιμή": float(median_value),
      "Τυπική απόκλιση": float(stand_dev_value)
      })

print(results)

for result in results:
  print(
      f"\nΔείκτης: {result['Δείκτης']}\n"
      f"Μέση τιμή: {result['Μέση τιμή']}\n"
      f"Διάμεση τιμή: {result['Διάμεση τιμή']}\n"
      f"Τυπική απόκλιση: {result['Τυπική απόκλιση']}\n"
      )

In [None]:
# In this method I used sns.histplot with kde=True to plot both the histogram and the KDE together.
import matplotlib.pyplot as plt
import seaborn as sns

columns = ["TTR", "H_point", "R1", "Entropy", "Lambda", "ATL"]

# loop for each index
for col in columns:
  plt.figure(figsize=(10, 5))   # create a new figure with specified size
  # stat="density" puts the bars on a density scale, which is what the y-axis
  # label below ("Πυκνότητα") announces; the default would plot raw counts.
  sns.histplot(df[col], bins=20, kde=True, stat="density")   # histogram with a KDE (Kernel Density Estimation) curve
  plt.title(f"Ιστόγραμμα και συνάρτηση πυκνότητας πιθανότητας: {col}")
  plt.xlabel(col)
  plt.ylabel("Πυκνότητα")
  plt.show()

In [None]:
# (another method - optional) In this method I first plot the histogram and then add the KDE line using sns.kdeplot (I used documentation from https://seaborn.pydata.org/generated/seaborn.kdeplot.html)
import matplotlib.pyplot as plt
import seaborn as sns

columns = ["TTR", "H_point", "R1", "Entropy", "Lambda", "ATL"]

# One figure per index: density histogram with a separate KDE line on top
for col in columns:
    series = df[col]
    plt.figure(figsize=(10, 5))

    # bars scaled as a density rather than as counts
    sns.histplot(x=series, bins=20, stat="density")

    # kernel density estimate drawn in orange
    sns.kdeplot(x=series, color="orange")

    # title and axis labels
    plt.title(f"Ιστόγραμμα και συνάρτηση πυκνότητας πιθανότητας: {col}")
    plt.xlabel(col)
    plt.ylabel("Πυκνότητα")
    plt.show()

Based on the calculated **skewness**, the indices show values between -0.5 and 0.5, which, according to the rule of thumb, indicates fairly symmetrical distributions. Specifically, TTR, H-point, R1, Entropy, Lambda and Average Token Length (ATL) all have skewness values within this range, which means that their distributions are approximately symmetric, with no significant skew to the right or left. The positive skewness observed for H-point, R1 and ATL implies a slightly right-skewed distribution, while the negative skewness of TTR, Entropy and Lambda implies a slight left skew. Overall, the data are fairly symmetrical, without significant asymmetry.

In terms of **kurtosis**, most indices have kurtosis close to zero and do not significantly exceed the threshold defined by the rule of thumb (|kurtosis| > 4 × √(6/703) ≈ 0.3695). This shows that the distributions are mainly mesokurtic, which means that their tails are similar to those of a normal distribution. Specifically, TTR, R1, Entropy, Lambda and ATL have mesokurtic distributions, meaning that there is a standard concentration of values without extreme outliers. However, H-point has a platykurtic distribution, meaning that it has lighter tails than a normal distribution, with fewer extreme values. Overall, the kurtosis results confirm that the data do not show heavy-tailed or extremely light-tailed distributions.

Regarding the **KDE** plots, most indices (TTR, Entropy, Lambda and ATL) display a single peak and a smooth curve, which indicates an approximately **normal distribution.** However, H-point and R1 show a more irregular, multimodal shape, which means that they deviate from the normal distribution. Overall, the indices approximate the normal distribution, with minor deviations in specific cases (H-point, R1).

In conclusion, the majority of the indices approximate normality, with fairly symmetrical shapes, mesokurtic tails and smooth KDE curves. Minor deviations (H-point and R1) were noticed, but overall the data are consistent with a normal distribution.

**See the code results and analysis above and below (questions 7 and 8) to validate this conclusion:**

In [None]:
from scipy.stats import kurtosis, skew    # I followed your code (professor's code) in Descriptive_Stats_lab.ipynb
import seaborn as sns

columns = ["TTR", "H_point", "R1", "Entropy", "Lambda", "ATL"]

# Descriptive statistics and skewness for every index
for col in columns:
  print(f" - {col} ")

  # values of the current index and their summary statistics
  values = df[col]
  summary = values.describe()
  print(summary)

  # divide every value by the number of observations before measuring skewness
  scaled = values / summary.get('count')

  skewness = skew(scaled)
  print("\nskewness:", skewness)

  # sign of the skewness tells the direction of the longer tail
  if skewness < 0:
    print("****left-skewed distribution (negatively skewed)****\n")
  elif skewness > 0:
    print("****right-skewed distribution (positively skewed)****\n")

In [None]:
from scipy.stats import kurtosis, skew

columns = ["TTR", "H_point", "R1", "Entropy", "Lambda", "ATL"] # define the index names

# Loop for finding kurtosis for each column
for col in columns:
  print(f"- {col} ")

  part1 = df[col] # take the data for each column
  res = part1.describe() # compute basic descriptive statistics
  print(res)

  # Kurtosis is computed directly on the column values. The previously
  # computed "normalized" copy was never used, so it has been removed.
  kurtosis_value = kurtosis(part1) # compute the kurtosis
  print("\nkurtosis:\t", kurtosis_value)

  # rule-of-thumb threshold: 4 * sqrt(6 / number of observations)
  thr = 4 * (6 / res.get('count'))**0.5
  print("rule of thumb threshold:\t", thr)

  if abs(kurtosis_value) > thr:  # if the absolute kurtosis exceeds the threshold
    if kurtosis_value > 0:
      print("****leptokurtic(heavy tails)****\n")
    else:
      print("****platykurtic(light tails)****\n")
  else:
    print("****mesokurtic (normal tails)****\n")

In [None]:
# file name -> Repeat Rate
rr_results = {}

# Each entry of rr_data holds the (V, N) pair of one file: types and tokens
for file_name, counts in rr_data.items():
    V, N = counts
    # Repeat Rate as used in this notebook: tokens beyond the first occurrence
    # of each type, relative to the number of types
    RR = (N - V) / V
    rr_results[file_name] = RR
    print(f"{file_name} - Repeat Rate (RR): {RR:}")