#Setup

In [1]:
import pandas as pd
import numpy as np

In [2]:
!pip install -U pip setuptools wheel
!pip install -U spacy

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Collecting setuptools
  Downloading setuptools-69.1.1-py3-none-any.whl (819 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m819.3/819.3 kB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 67.7.2
    Uninstalling setuptools-67.7.2:
      Successfully uninstalled setuptools-67.7.2
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m
[0m

[0m

In [3]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.7.3
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-curated-transformers<0.3.0,>=0.2.0 (from en-core-web-trf==3.7.3)
  Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting curated-transformers<0.2.0,>=0.1.0 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_transformers-0.1.1-py2.py3-none-any.whl.metadata (965 bytes)
Collecting curated-tokenizers<0.1.0,>=0.0.9 (from spacy-curated-transformers<0.3.0,>=0.2.0->en-core-web-trf==3.7.3)
  Downloading curated_tokenizers-0.0.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading spacy_curated_transformers-0.2.2-py2.py3-none-any.whl (236 kB)
[2K   [90m━━━━━━━━━━

In [4]:
import spacy

#**Make df from sample text**

In [5]:
nlp =spacy.load('en_core_web_trf')

In [6]:
%%writefile english_demo_text.txt
"okay there is a flag. and it's young man trying to fly a kite. and this sailboat with two people. and one's waving. and someone's fishing off the pier. and puppy dog. and couple has in front of his house. picnicking in front of his house. and there's a car parked in front of the house. and they're in the garage. there's a tree in front of the house. there's small trees leading down to the way way way out."

Writing english_demo_text.txt


# General Methods

In [7]:
def filterText(text):
  import nltk
  # this tokeniser takes care of contractions nicely
  from nltk.tokenize import WhitespaceTokenizer
  # Create a reference variable for Class WhitespaceTokenizer
  tk = WhitespaceTokenizer()
  trackWords = tk.tokenize(text)
  #removing unwanted characters, excluding contractions
  bad_chars = [',', ':', '!', '\"', '?']
  filteredTrackWords = [''.join(filter(lambda i: i not in bad_chars, word)) for word in trackWords]
  #remove empty words
  filteredTrackWords = [i for i in filteredTrackWords if i]
  return " ".join(filteredTrackWords)

In [8]:
f = open("english_demo_text.txt","r")
doc = nlp(filterText(f.read()))

In [9]:
def setupSpacyDF(doc):
  import pandas as pd
  cols = ("text",  "POS", "STOP", "DEP")
  rows = []
  for t in doc:
    if t.pos_ == 'PUNCT':
      #not considering punctuation
      continue
    row = [t.text, t.pos_, t.is_stop, t.dep_]
    rows.append(row)
  df = pd.DataFrame(rows, columns=cols)
  return df

In [10]:
print(list(setupSpacyDF(doc)['text']))

['okay', 'there', 'is', 'a', 'flag', 'and', 'it', "'s", 'young', 'man', 'trying', 'to', 'fly', 'a', 'kite', 'and', 'this', 'sailboat', 'with', 'two', 'people', 'and', 'one', "'s", 'waving', 'and', 'someone', "'s", 'fishing', 'off', 'the', 'pier', 'and', 'puppy', 'dog', 'and', 'couple', 'has', 'in', 'front', 'of', 'his', 'house', 'picnicking', 'in', 'front', 'of', 'his', 'house', 'and', 'there', "'s", 'a', 'car', 'parked', 'in', 'front', 'of', 'the', 'house', 'and', 'they', "'re", 'in', 'the', 'garage', 'there', "'s", 'a', 'tree', 'in', 'front', 'of', 'the', 'house', 'there', "'s", 'small', 'trees', 'leading', 'down', 'to', 'the', 'way', 'way', 'way', 'out']


#Easy feats

In [11]:
def posTagCounts(df, linguisticFeatures):
  # 5.20.22: This line is problematic
  #tag_types = list(df['POS'].unique())
  #solution: scroll to bottom of https://stackoverflow.com/questions/58215855/how-to-get-full-list-of-pos-tag-and-dep-in-spacy
  tag_types = ["ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "SCONJ", "SYM", "VERB", "X", "SPACE"]
  for category in tag_types:
    tag_description = "POS_TAG:" + category
    linguisticFeatures[tag_description] = (df['POS'] == category).sum()

In [12]:
def openClosedRatio(df, linguisticFeatures):
  open_class = {'VERB', 'NOUN', 'ADJ', 'ADV'}
  tag_types = list(df['POS'].unique())
  open_words = 0
  closed_words = 0
  for category in tag_types:
    if category in open_class:
      open_words += (df['POS'] == category).sum()
    else:
      closed_words += (df['POS'] == category).sum()
  ratio = open_words/closed_words if closed_words else "NaN"
  linguisticFeatures["Open to Closed Words Ratio"] = ratio
# CHECK EDGE CASE


POS tags are consistent across languages

In [13]:
def posNormalizedCounts(df, linguisticFeatures):
  #tag_types = ["ADJ", "ADP", "ADV", "AUX", "CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "SCONJ", "SYM", "VERB", "X", "SPACE"]
  filtered_tag_types = ["ADJ", "ADP", "ADV", "AUX","CONJ", "CCONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PRON", "PROPN", "SCONJ", "VERB"]
  #for category in tag_types:
  #DECIDE
  for category in filtered_tag_types:
    tag_description = "POS_TAG:" + category
    linguisticFeatures[tag_description] = (df['POS'] == category).sum()/df.shape[0]

#Dependency tags

In [14]:
#source: https://spacy.io/models/en#en_core_web_trf-labels
englishDepTags = ['ROOT', 'acl', 'acomp','advcl','advmod','agent','amod','appos','attr','aux','auxpass','case','cc','ccomp','compound','conj','csubj','csubjpass','dative','dep','det','dobj','expl','intj','mark','meta','neg','nmod','npadvmod','nsubj','nsubjpass','nummod','oprd','parataxis','pcomp','pobj','poss','preconj','predet','prep','prt','punct','quantmod','relcl','xcomp']

In [15]:
#source: https://spacy.io/models/zh#zh_core_web_trf-labels
chineseDepTags = ["ROOT", "acl", "advcl:loc", "advmod", "advmod:dvp", "advmod:loc","advmod:rcomp", "amod", "amod:ormod", "appos", "aux:asp", "aux:ba", "aux:modal", "aux:prtmod", "auxpass", "case", "cc", "ccomp", "compound:nn", "compound:vc", "conj", "cop", "dep", "det", "discourse", "dobj", "etc","mark","mark:clf", "name", "neg","nmod","nmod:assmod","nmod:poss","nmod:prep", "nmod:range", "nmod:tmod", "nmod:topic", "nsubj", "nsubj:xsubj", "nsubjpass","nummod", "parataxis:prnmod", "punct", "xcomp"]

In [16]:
advmodSpecificChineseTags = ["advmod:dvp", "advmod:loc","advmod:rcomp"]
amodSpecificChineseTags = ["amod:ormod"]
auxSpecificChineseTags = ["aux:asp", "aux:ba", "aux:modal", "aux:prtmod","auxpass"]
compoundSpecificChineseTags = ["compound:nn", "compound:vc"]
markSpecificChineseTags = ["mark:clf"]
nmodSpecificChineseTags = ["nmod:assmod","nmod:poss","nmod:prep", "nmod:range", "nmod:tmod", "nmod:topic"]
nsubjSpecificChineseTags = ["nsubj:xsubj"]
nummodSpecificChineseTags = ["nummod"]

In [17]:
chineseSimplifiedDepTags = []
for tag in chineseDepTags:
  if tag in advmodSpecificChineseTags:
    chineseSimplifiedDepTags.append("advmod")
  elif tag in amodSpecificChineseTags:
    chineseSimplifiedDepTags.append("amod")
  elif tag in auxSpecificChineseTags:
    chineseSimplifiedDepTags.append("aux")
  elif tag in compoundSpecificChineseTags:
    chineseSimplifiedDepTags.append("compound")
  elif tag in markSpecificChineseTags:
    chineseSimplifiedDepTags.append("mark")
  elif tag in nmodSpecificChineseTags:
    chineseSimplifiedDepTags.append("nmod")
  elif tag in nsubjSpecificChineseTags:
    chineseSimplifiedDepTags.append("nsubj")
  elif tag in nummodSpecificChineseTags:
    chineseSimplifiedDepTags.append("nummod")
  else:
    chineseSimplifiedDepTags.append(tag)

In [18]:
chineseSimplifiedDepTags = set(chineseSimplifiedDepTags)
print(sorted(chineseSimplifiedDepTags))

['ROOT', 'acl', 'advcl:loc', 'advmod', 'amod', 'appos', 'aux', 'case', 'cc', 'ccomp', 'compound', 'conj', 'cop', 'dep', 'det', 'discourse', 'dobj', 'etc', 'mark', 'name', 'neg', 'nmod', 'nsubj', 'nsubjpass', 'nummod', 'parataxis:prnmod', 'punct', 'xcomp']


In [19]:
print(sorted(englishDepTags))

['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'amod', 'appos', 'attr', 'aux', 'auxpass', 'case', 'cc', 'ccomp', 'compound', 'conj', 'csubj', 'csubjpass', 'dative', 'dep', 'det', 'dobj', 'expl', 'intj', 'mark', 'meta', 'neg', 'nmod', 'npadvmod', 'nsubj', 'nsubjpass', 'nummod', 'oprd', 'parataxis', 'pcomp', 'pobj', 'poss', 'preconj', 'predet', 'prep', 'prt', 'punct', 'quantmod', 'relcl', 'xcomp']


In [20]:
overlapChineseEnglishDepTags = chineseSimplifiedDepTags.intersection(set(englishDepTags))
print(overlapChineseEnglishDepTags)

{'punct', 'nsubj', 'nmod', 'neg', 'nsubjpass', 'conj', 'det', 'dobj', 'dep', 'ccomp', 'advmod', 'ROOT', 'appos', 'amod', 'cc', 'xcomp', 'compound', 'mark', 'nummod', 'acl', 'aux', 'case'}


In [21]:
def depTagCounts(df, linguisticFeatures):
  overlapChineseEnglishDepTags = chineseSimplifiedDepTags.intersection(set(englishDepTags))
  #English:tag_types = ["acl", "acomp", "advcl", "advmod", "agent", "amod", "appos", "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "cop", "csubj", "csubjpass", "dative", "dep", "det", "dobj", "expl", "intj", "mark", "meta", "neg", "nn", "npmod", "nsubj", "nsubjpass", "oprd", "obj", "obl", "pcomp", "pobj", "poss", "preconj", "prep", "prt", "punct",  "quantmod", "relcl", "root", "xcomp"]
  for category in overlapChineseEnglishDepTags:
    depTagDescription = "GrammarRelation:" + category
    linguisticFeatures[depTagDescription] = (df['DEP'] == category).sum()/df.shape[0]

In [22]:
complexTagsInEnglishGCI = ['csubj', 'csubjpass', 'ccomp', 'xcomp', 'acomp', 'pobj', 'advcl', 'mark', 'acl', 'nounmod', 'complm', 'infmod', 'partmod', 'nmod']

In [23]:
chineseSimplifiedDepTags.intersection(set(complexTagsInEnglishGCI))

{'acl', 'ccomp', 'mark', 'nmod', 'xcomp'}

In [24]:
overlapChineseEnglishDepTags.intersection(set(complexTagsInEnglishGCI))

{'acl', 'ccomp', 'mark', 'nmod', 'xcomp'}

In [25]:
def grammarComplexityIndex(df, linguisticFeatures):
  complexTags = chineseSimplifiedDepTags.intersection(set(complexTagsInEnglishGCI))
  numComplexTags = 0
  for tag in complexTags:
    numComplexTags += (df['DEP'] == tag).sum()
  totalNumTags = len(df['DEP'])
  grammarComplexity = numComplexTags/totalNumTags
  linguisticFeatures['Grammar Complexity Index'] = grammarComplexity

#propositional density

In [26]:
def propositionalDensity(df, linguisticFeatures):
  pdTags = ['VERB', 'ADJ', 'ADV', 'ADP', 'CONJ', 'CCONJ', 'SCONJ']
  numPDTags = 0
  for tag in pdTags:
    numPDTags += (df['POS'] == tag).sum()
  totalNumPDTags = len(df['POS'])
  propositionalDensity = numPDTags/totalNumPDTags
  linguisticFeatures['Propositional Density'] = propositionalDensity

#Type token ration

In [27]:
def typeTokenRatio(df, linguisticFeatures):
  numTokens = len(df)
  numTypes = len(set(df['text'].str.lower()))
  linguisticFeatures["# of unique words"] = numTypes
  typeTokenRatio = numTypes/numTokens
  linguisticFeatures["Type Token Ratio"] = typeTokenRatio

In [28]:
def mov_avg_type_token_ratio(df, window_size=50):
  sum_type_token_ratio = 0
  numWindows = 0
  window_start = 0
  window_end = window_size
  while(window_end <= len(df)):
    currentWindow = df['text'][window_start:window_end]
    numTypes = len(set(df['text'][window_start:window_end].str.lower()))
    window_type_token_ratio = numTypes/window_size
    sum_type_token_ratio += window_type_token_ratio
  #update variables for next iteration
    numWindows +=1
    window_start +=1
    window_end +=1
  if numWindows == 0:
    return pd.NA
  moving_average_type_token_ratio = sum_type_token_ratio/numWindows
  return moving_average_type_token_ratio

In [29]:
def determine_repetition(currentWord, words):
  for word in words:
      if (word != currentWord):
        return False
  return True

In [30]:
def repetition_of_word_n(n,df):
  numRepetitions = 0
  first_word = df['text'][0]
  current_word = first_word
  current_word_index = 0
  next_window_start_index = 1
  next_window_end_index = n
  next_words = None
  while(next_window_end_index < len(df)):
    nextWords = list(df['text'][next_window_start_index:next_window_end_index])
    #print(nextWords)
    if (determine_repetition(current_word, nextWords)):
      numRepetitions += 1
    #update for next iteration
    current_word = nextWords[0]
    current_word_index +=1
    next_window_start_index +=1
    next_window_end_index +=1
  return numRepetitions

#Stopwords

In [31]:
!pip install stopwordsiso


Collecting stopwordsiso
  Downloading stopwordsiso-0.6.1-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stopwordsiso
Successfully installed stopwordsiso-0.6.1
[0m

In [32]:
import stopwordsiso
from stopwordsiso import stopwords
englishStopWords = stopwords("en")  # Chinese
print(englishStopWords)
len(englishStopWords)

{'whole', 'particular', 'reserved', 'evenly', 'respectively', "wasn't", 'various', 'sometimes', 'numbers', 'plus', 'furthers', 'kr', 'mh', 'today', 'thence', 'inner', 'np', 'during', 'ar', 'system', 'dear', 'sincere', 'somewhat', 'z', 'ai', 'call', 'other', 'zero', 'ring', 'just', 'its', "wouldn't", 'u', 'unfortunately', 'empty', 'about', "why'll", 'ableabout', 'opened', 'thoughh', 'hither', 'big', 'ups', 'widely', 'with', 'behind', 'itd', 'longer', 'farther', 'novel', 'move', 'related', 'looks', 'accordance', 'allow', 'ls', 'sa', 'cm', 'contain', 'none', 'ordered', 'sk', 'b', 'former', "needn't", 'off', 'interesting', "that've", 'away', 'af', 'lc', 'seven', 'refs', 'suggest', 'himself', 'greetings', 'lt', 'higher', 'ca', 'need', 'possible', 'homepage', "why's", 'anywhere', "'ll", 'but', "she'll", 'of', 'fify', 'abroad', 'til', 'welcome', 'has', "mustn't", 'ki', 'specified', 'inasmuch', 'mm', 'looking', 'cn', 'says', 'part', 'around', 'others', 'unlikely', 'youd', 'whereby', 'never', '

1298

In [33]:
[len(stopword) for stopword in englishStopWords]

[5,
 10,
 8,
 6,
 12,
 6,
 7,
 9,
 7,
 4,
 8,
 2,
 2,
 5,
 6,
 5,
 2,
 6,
 2,
 6,
 4,
 7,
 8,
 1,
 2,
 4,
 5,
 4,
 4,
 4,
 3,
 8,
 1,
 13,
 5,
 5,
 6,
 9,
 6,
 7,
 6,
 3,
 3,
 6,
 4,
 6,
 3,
 6,
 7,
 5,
 4,
 7,
 5,
 10,
 5,
 2,
 2,
 2,
 7,
 4,
 7,
 2,
 1,
 6,
 7,
 3,
 11,
 7,
 4,
 2,
 2,
 5,
 4,
 7,
 7,
 9,
 2,
 6,
 2,
 4,
 8,
 8,
 5,
 8,
 3,
 3,
 6,
 2,
 4,
 6,
 3,
 7,
 3,
 7,
 2,
 9,
 8,
 2,
 7,
 2,
 4,
 4,
 6,
 6,
 8,
 4,
 7,
 5,
 11,
 6,
 8,
 4,
 6,
 3,
 4,
 4,
 5,
 4,
 7,
 8,
 6,
 5,
 5,
 2,
 11,
 2,
 3,
 9,
 6,
 5,
 8,
 2,
 3,
 6,
 6,
 5,
 2,
 9,
 7,
 7,
 2,
 2,
 5,
 7,
 6,
 4,
 2,
 4,
 2,
 2,
 7,
 3,
 2,
 6,
 5,
 8,
 3,
 4,
 2,
 5,
 5,
 6,
 5,
 4,
 4,
 5,
 2,
 10,
 4,
 8,
 2,
 4,
 4,
 6,
 7,
 7,
 5,
 6,
 3,
 6,
 7,
 5,
 2,
 2,
 7,
 2,
 4,
 2,
 9,
 4,
 8,
 9,
 2,
 4,
 2,
 7,
 4,
 4,
 4,
 4,
 5,
 5,
 4,
 2,
 7,
 4,
 10,
 2,
 7,
 5,
 7,
 7,
 3,
 8,
 8,
 6,
 7,
 4,
 2,
 4,
 6,
 7,
 6,
 2,
 5,
 4,
 6,
 2,
 6,
 4,
 1,
 6,
 6,
 6,
 9,
 6,
 2,
 7,
 2,
 6,
 4,
 3,
 2,
 2,
 5,
 4,
 6,
 6,

In [34]:
englishStopWords = stopwords("en")
print(englishStopWords)
len(englishStopWords)

{'whole', 'particular', 'reserved', 'evenly', 'respectively', "wasn't", 'various', 'sometimes', 'numbers', 'plus', 'furthers', 'kr', 'mh', 'today', 'thence', 'inner', 'np', 'during', 'ar', 'system', 'dear', 'sincere', 'somewhat', 'z', 'ai', 'call', 'other', 'zero', 'ring', 'just', 'its', "wouldn't", 'u', 'unfortunately', 'empty', 'about', "why'll", 'ableabout', 'opened', 'thoughh', 'hither', 'big', 'ups', 'widely', 'with', 'behind', 'itd', 'longer', 'farther', 'novel', 'move', 'related', 'looks', 'accordance', 'allow', 'ls', 'sa', 'cm', 'contain', 'none', 'ordered', 'sk', 'b', 'former', "needn't", 'off', 'interesting', "that've", 'away', 'af', 'lc', 'seven', 'refs', 'suggest', 'himself', 'greetings', 'lt', 'higher', 'ca', 'need', 'possible', 'homepage', "why's", 'anywhere', "'ll", 'but', "she'll", 'of', 'fify', 'abroad', 'til', 'welcome', 'has', "mustn't", 'ki', 'specified', 'inasmuch', 'mm', 'looking', 'cn', 'says', 'part', 'around', 'others', 'unlikely', 'youd', 'whereby', 'never', '

1298

Stopwords: words that are commonly used in an language (https://kavita-ganesan.com/what-are-stop-words/)

In [35]:
def numStopWords(df, stopWords=englishStopWords):
  result = 0
  for word in list(df['text']):
    if word in stopWords:
      result+=1
  return result/len(df)

In [36]:
def averageSentenceLength(df, doc, linguisticFeatures):
  numSentences = len(list(doc.sents))
  #numWords = len(df)
  #both implementations achieve same result for demo sample
  averageSentenceLength = len(df)/numSentences#numSentences/len(df)

  # for sent in doc.sents:
  #   numWords += len(sent)
  #avgSentenceLength = numWords/numSentences
  linguisticFeatures['# of words'] = len(df['POS'])#, corrected on 6.25.2022
  #linguisticFeatures['# of words'] = numWords
  linguisticFeatures["Average sentence length"] = averageSentenceLength

#**Word Length by Morphemes**

In [37]:
!pip install polyglot
#!sudo apt-get install python-numpy libicu-dev
!pip install pyicu #problematic on terminal: use conda install -c conda-forge pyicu
!pip install pycld2
!pip install morfessor
from polyglot.downloader import downloader
from polyglot.text import Text, Word
!polyglot download morph2.en

Collecting polyglot
  Downloading polyglot-16.7.4.tar.gz (126 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m122.9/126.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.3/126.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: polyglot
  Building wheel for polyglot (setup.py) ... [?25l[?25hdone
  Created wheel for polyglot: filename=polyglot-16.7.4-py2.py3-none-any.whl size=52558 sha256=cc6f96846704d46bfcd6ece126c85e5157eef96e0095623f39a13aae65af2a15
  Stored in directory: /root/.cache/pip/wheels/aa/92/4a/b172589446ba537db3bdb9a1f2204f27fe71217981c14ac368
Successfully built polyglot
Installing collected packages: polyglot
Successfully installed polyglot-16.7.4
[0mCollecting

In [38]:
def wordLengthByMorphemes(df,linguisticFeatures):
  wordsLengthByMophemes = [len(Word(word, language="en").morphemes) for word in list(df['text'])]
  linguisticFeatures["Word Length by Morphemes Mean"] = np.mean(wordsLengthByMophemes)
  linguisticFeatures["Word Length by Morphemes Standard Deviation"] = np.std(wordsLengthByMophemes)
  #linguisticFeatures["Max Word Length by Morphemes"] = np.max(wordsLengthByMophemes)
  #linguisticFeatures["Min Word Length by Morphemes"] = np.min(wordsLengthByMophemes)

#Word length by phoneme

In [39]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

model = T5ForConditionalGeneration.from_pretrained('charsiu/g2p_multilingual_byT5_small_100') # 'charsiu/g2p_multilingual_byT5_tiny_16_layers_100'

tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

In [40]:
# tokenized English words
words = ['Char', 'siu', 'is', 'a', 'Cantonese', 'style', 'of', 'barbecued', 'pork']
words = ['<eng-us>: '+i for i in words]
out = tokenizer(words,padding=True,add_special_tokens=False,return_tensors='pt')
preds = model.generate(**out,num_beams=1,max_length=50) # We do not find beam search helpful. Greedy decoding is enough.
phones = tokenizer.batch_decode(preds.tolist(),skip_special_tokens=True)
print(phones)
# Output: ['ˈtʃɑɹ', 'ˈsiw', 'ˈɪs', 'ˈɑ', 'ˈkæntəˌniz', 'ˈstaɪɫ', 'ˈɑf', 'ˈbɑɹbɪkˌjud', 'ˈpɔɹk']


['ˈtʃɑɹ', 'ˈʃu', 'ˈɪs', 'ˈeɪ', 'ˌkæntəˈniz', 'ˈstaɪɫ', 'ˈəf', 'ˈbɑɹbɪkˌjud', 'ˈpɔɹk']


In [41]:
def wordLengthByPhonemes(df,linguisticFeatures):
  words = list(df['text'])
  words = ['<eng-us>: '+i for i in words]
  out = tokenizer(words,padding=True,add_special_tokens=False,return_tensors='pt')
  preds = model.generate(**out,num_beams=1,max_length=50) # We do not find beam search helpful. Greedy decoding is enough.
  phones = tokenizer.batch_decode(preds.tolist(),skip_special_tokens=True)
  linguisticFeatures["Word Length by Phonemes Mean"] = np.mean([len(i) for i in phones])
  linguisticFeatures["Word Length by Phonemes Standard Deviation"] = np.std([len(i) for i in phones])
  #linguisticFeatures["Max Word Length by Morphemes"] = np.max(wordsLengthByMophemes)
  #linguisticFeatures["Min Word Length by Morphemes"] = np.min(wordsLengthByMophemes)

#**Word Frequency**

In [42]:
import pandas as pd

In [43]:
wordFreqDF = pd.read_csv('SUBTLEX-EN-WF.csv')
wordFreqDF.head()

Unnamed: 0,Word,FREQcount,CDcount,FREQlow,Cdlow,SUBTLWF,Lg10WF,SUBTLCD,Lg10CD
0,the,1501908,8388,1339811,8388,29449.18,6.1766,100.0,3.9237
1,to,1156570,8383,1138435,8380,22677.84,6.0632,99.94,3.9235
2,a,1041179,8382,976941,8380,20415.27,6.0175,99.93,3.9234
3,you,2134713,8381,1595028,8376,41857.12,6.3293,99.92,3.9233
4,and,682780,8379,515365,8374,13387.84,5.8343,99.89,3.9232


In [44]:
englishWordFreqProbs = dict(zip(wordFreqDF['Word'], wordFreqDF['Lg10WF']))

In [45]:
englishWordContextualDiversity = dict(zip(wordFreqDF['Word'], wordFreqDF['Lg10CD']))

In [46]:
def wordFreqMetrics(df,linguisticFeatures):
  openClassPOSTags = ['NOUN', 'VERB', 'ADJ', 'ADV']
  openClassDF = df[df.POS.isin(openClassPOSTags)]
  wordFreqVals = []
  wordCDVals = []
  for word in list(openClassDF['text']):
    wordFreqVal = englishWordFreqProbs.get(word)
    if wordFreqVal != None:
      wordFreqVals.append(wordFreqVal)
    wordCD = englishWordContextualDiversity.get(word)
    if wordCD != None:
      wordCDVals.append(wordCD)
  linguisticFeatures["Word Frequency Mean"] = np.nanmean(wordFreqVals)
  linguisticFeatures["Word Frequency Standard Deviation"] = np.nanstd(wordFreqVals)
  linguisticFeatures["Word Contextual Diversity Mean"] = np.nanmean(wordCDVals)
  linguisticFeatures["Word Contextual Diversity Standard Deviation"] = np.nanstd(wordCDVals)

#**Age of Acquisition**

In [47]:
import pandas as pd

Source: https://link.springer.com/article/10.3758/s13428-020-01455-8

In [48]:
ageOfAcquistionDF = pd.read_csv('AoA_ratings_Kuperman_et_al_BRM.csv')
ageOfAcquistionDF.head()

Unnamed: 0,Word,OccurTotal,OccurNum,Freq_pm,Rating.Mean,Rating.SD,Dunno
0,a,22.0,22.0,20415.27,2.89,1.21,1.0
1,aardvark,18.0,18.0,0.41,9.89,3.66,1.0
2,abacus,20.0,13.0,0.24,8.69,3.77,0.65
3,abalone,18.0,13.0,0.51,12.23,3.54,0.72
4,abandon,19.0,19.0,8.1,8.32,2.75,1.0


In [49]:
ageOfAcquistionMeanMapping = dict(zip(list(ageOfAcquistionDF['Word']), list(ageOfAcquistionDF['Rating.Mean'])))

In [50]:
ageOfAcquistionSDMapping = dict(zip(list(ageOfAcquistionDF['Word']), list(ageOfAcquistionDF['Rating.SD'])))

In [51]:
def compute_AoA_MappingForWordList(df,linguisticFeatures):
  wordsAOAMeanList = []
  wordsAOASDList = []
  openClassPOSTags = ['NOUN', 'VERB', 'ADJ', 'ADV']
  openClassDF = df[df.POS.isin(openClassPOSTags)]
  for word in list(openClassDF['text']):
    wordMeanAOA = ageOfAcquistionMeanMapping.get(word.lower())
    if wordMeanAOA == None:
      wordsAOAMeanList.append(np.nan)
      continue
    wordsAOAMeanList.append(float(wordMeanAOA))
    wordAOASD = ageOfAcquistionSDMapping.get(word.lower())
    if wordAOASD == None:
      wordsAOASDList.append(np.nan)
      continue
    wordsAOASDList.append(float(wordAOASD))
  linguisticFeatures["Age of Acquistion Mean Average"] = np.nanmean(wordsAOAMeanList)
  linguisticFeatures["Age of Acquistion SD Average"] = np.nanmean(wordsAOASDList)

#Yngve and frazier

In [52]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.8.0-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.10.1-py2.py3-none-any.whl.metadata (5.3 kB)
Downloading stanza-1.8.0-py3-none-any.whl (970 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m970.4/970.4 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.10.1-py2.py3-none-any.whl (421 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.5/421.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.10.1 stanza-1.8.0
[0m

In [53]:
import stanza

In [54]:
stanza.download('en') # download Chinese simplified
pipe = stanza.Pipeline(lang='en', processors='tokenize,  pos, lemma, depparse, constituency') # initialize Chinese neural pipeline

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| mwt          | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


In [55]:
def get_yngve_score(TreeNode, score, leaf_num, total_score):
  if TreeNode.is_leaf():
    total_score += score
  else:
    num_children = len(TreeNode.children)
    for i in range(0, num_children):
      total_score = get_yngve_score(TreeNode.children[i], score + (num_children - 1) - i, i, total_score)
  return total_score

In [56]:
def get_frazier_score(node, my_score, total_score, sent_parent):
  if node.is_leaf():
    sent_val = 1.5 if sent_parent else 0
    #print("Leaf Node: ", node, " Gets Score: ", my_score + sent_val)
    total_score = total_score + my_score + sent_val
  else:
    num_children = len(node.children)
    for i in range(0, num_children):
      sent_bool = True if (node.label == "S" and i == 0) or sent_parent else False
      sibling_num = 0 if ((i == (num_children - 1)) and (num_children != 1)) else 1
      total_score = get_frazier_score(node.children[i], sibling_num, total_score, sent_bool)
  return total_score

In [57]:
def yngve_fraz_score_normalized(text,linguisticFeatures):
  numWords = 0
  textYngveScore = 0
  textFrazierScore = 0
  info_map = {}
  for sent in text.sentences:
    stripped_sent = [word.text for word in sent.words if word.upos != "PUNCT"]
    preprocessed = ' '.join(stripped_sent)
    if len(pipe(preprocessed).sentences) ==0:
      continue
    clean_sent = pipe(preprocessed).sentences[0];
    sent_tree_root = clean_sent.constituency.children[0]
    score = 0
    yngve_score = get_yngve_score(sent_tree_root, score, 0, total_score=0)
    #print("Yngve score:", yngve_score)
    fraz_score = get_frazier_score(sent_tree_root, 0, total_score=0, sent_parent=False)
    #print("Frazier score:", fraz_score)
    info_map[sent.text] = [clean_sent.constituency, yngve_score, fraz_score]
    textYngveScore += yngve_score
    textFrazierScore += fraz_score
    numWords += len(stripped_sent)
  linguisticFeatures["Yngve"] = textYngveScore/numWords
  linguisticFeatures["Frazier"] = textFrazierScore/numWords

#Run everything

In [58]:
def extractLinguisticFeatures(transcription, sampleName):
  import spacy
  nlp =spacy.load('en_core_web_trf')
  doc = nlp(filterText(transcription))
  df = setupSpacyDF(doc)
  linguisticFeatures = {}
  posNormalizedCounts(df, linguisticFeatures)
  linguisticFeatures['Noun to Verb ratio'] = (df['POS'] == 'NOUN').sum()/(df['POS'] == 'VERB').sum()
  depTagCounts(df, linguisticFeatures)
  grammarComplexityIndex(df, linguisticFeatures)
  propositionalDensity(df, linguisticFeatures)
  typeTokenRatio(df, linguisticFeatures)
  wordLengthByMorphemes(df,linguisticFeatures)
  wordLengthByPhonemes(df,linguisticFeatures)
  linguisticFeatures["Moving Average Type Token Ratio"] = mov_avg_type_token_ratio(df, window_size=50)
  linguisticFeatures["Repetitions of word twice"] = repetition_of_word_n(2,df)
  linguisticFeatures["StopwordsProportion"] = numStopWords(df)
  averageSentenceLength(df, doc, linguisticFeatures)
  wordFreqMetrics(df,linguisticFeatures)
  compute_AoA_MappingForWordList(df,linguisticFeatures)
  yngve_fraz_score_normalized(pipe(transcription),linguisticFeatures)
  return linguisticFeatures

In [59]:
f = open("english_demo_text.txt","r")
transcription = f.read()
feats = extractLinguisticFeatures(transcription, 'Demo')
print(len(feats))
feats

59


{'POS_TAG:ADJ': 0.022988505747126436,
 'POS_TAG:ADP': 0.13793103448275862,
 'POS_TAG:ADV': 0.022988505747126436,
 'POS_TAG:AUX': 0.04597701149425287,
 'POS_TAG:CONJ': 0.0,
 'POS_TAG:CCONJ': 0.09195402298850575,
 'POS_TAG:DET': 0.11494252873563218,
 'POS_TAG:INTJ': 0.011494252873563218,
 'POS_TAG:NOUN': 0.27586206896551724,
 'POS_TAG:NUM': 0.011494252873563218,
 'POS_TAG:PART': 0.011494252873563218,
 'POS_TAG:PRON': 0.11494252873563218,
 'POS_TAG:PROPN': 0.0,
 'POS_TAG:SCONJ': 0.0,
 'POS_TAG:VERB': 0.13793103448275862,
 'Noun to Verb ratio': 2.0,
 'GrammarRelation:punct': 0.0,
 'GrammarRelation:nsubj': 0.05747126436781609,
 'GrammarRelation:nmod': 0.0,
 'GrammarRelation:neg': 0.0,
 'GrammarRelation:nsubjpass': 0.0,
 'GrammarRelation:conj': 0.0,
 'GrammarRelation:det': 0.11494252873563218,
 'GrammarRelation:dobj': 0.011494252873563218,
 'GrammarRelation:dep': 0.0,
 'GrammarRelation:ccomp': 0.0,
 'GrammarRelation:advmod': 0.022988505747126436,
 'GrammarRelation:ROOT': 0.13793103448275862,

In [60]:
!unzip EnglishTestTranscripts.zip

Archive:  EnglishTestTranscripts.zip
  inflating: EnglishTestTranscripts/trimmed_taukdial-010-1.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-010-2.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-010-3.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-012-1.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-012-2.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-012-3.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-013-1.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-013-2.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-013-3.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-016-1.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-016-2.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-016-3.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-017-1.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-017-2.txt  
  inflating: EnglishTestTranscripts/trimmed_taukdial-

In [61]:
import os
inputDirectory = "EnglishTestTranscripts/"
fileAddresses = sorted(os.listdir(inputDirectory))
linguisticFeats = []
for file in fileAddresses:
  sampleID = file.split('_')[-1].split('.')[0]
  print("Working on", sampleID)
  f = open(inputDirectory+file,"r")
  transcription = f.read()
  fileLinguisticFeats = extractLinguisticFeatures(transcription, sampleID)
  linguisticFeats.append(fileLinguisticFeats)

Working on taukdial-010-1
Working on taukdial-010-2
Working on taukdial-010-3
Working on taukdial-012-1
Working on taukdial-012-2
Working on taukdial-012-3
Working on taukdial-013-1
Working on taukdial-013-2
Working on taukdial-013-3
Working on taukdial-016-1
Working on taukdial-016-2
Working on taukdial-016-3
Working on taukdial-017-1
Working on taukdial-017-2
Working on taukdial-017-3
Working on taukdial-029-1
Working on taukdial-029-2
Working on taukdial-029-3
Working on taukdial-043-1
Working on taukdial-043-2
Working on taukdial-043-3
Working on taukdial-058-1
Working on taukdial-058-2
Working on taukdial-058-3
Working on taukdial-059-1
Working on taukdial-059-2
Working on taukdial-059-3
Working on taukdial-075-1
Working on taukdial-075-2
Working on taukdial-075-3
Working on taukdial-084-1
Working on taukdial-084-2
Working on taukdial-084-3
Working on taukdial-087-1
Working on taukdial-087-2
Working on taukdial-087-3
Working on taukdial-119-1
Working on taukdial-119-2
Working on t

In [62]:
fileAddresses = sorted(os.listdir(inputDirectory))
sampleNames = [file.split('_')[-1].split('.')[0] for file in fileAddresses]
sampleNames

['taukdial-010-1',
 'taukdial-010-2',
 'taukdial-010-3',
 'taukdial-012-1',
 'taukdial-012-2',
 'taukdial-012-3',
 'taukdial-013-1',
 'taukdial-013-2',
 'taukdial-013-3',
 'taukdial-016-1',
 'taukdial-016-2',
 'taukdial-016-3',
 'taukdial-017-1',
 'taukdial-017-2',
 'taukdial-017-3',
 'taukdial-029-1',
 'taukdial-029-2',
 'taukdial-029-3',
 'taukdial-043-1',
 'taukdial-043-2',
 'taukdial-043-3',
 'taukdial-058-1',
 'taukdial-058-2',
 'taukdial-058-3',
 'taukdial-059-1',
 'taukdial-059-2',
 'taukdial-059-3',
 'taukdial-075-1',
 'taukdial-075-2',
 'taukdial-075-3',
 'taukdial-084-1',
 'taukdial-084-2',
 'taukdial-084-3',
 'taukdial-087-1',
 'taukdial-087-2',
 'taukdial-087-3',
 'taukdial-119-1',
 'taukdial-119-2',
 'taukdial-119-3',
 'taukdial-123-1',
 'taukdial-123-2',
 'taukdial-123-3',
 'taukdial-135-1',
 'taukdial-135-2',
 'taukdial-135-3',
 'taukdial-146-1',
 'taukdial-146-2',
 'taukdial-146-3',
 'taukdial-147-1',
 'taukdial-147-2',
 'taukdial-147-3',
 'taukdial-152-1',
 'taukdial-1

In [63]:
# Convert list of dictionaries to DataFrame
df = pd.DataFrame(linguisticFeats)
df.insert(0, 'SampleID', sampleNames)
# Display the DataFrame
df.head()

Unnamed: 0,SampleID,POS_TAG:ADJ,POS_TAG:ADP,POS_TAG:ADV,POS_TAG:AUX,POS_TAG:CONJ,POS_TAG:CCONJ,POS_TAG:DET,POS_TAG:INTJ,POS_TAG:NOUN,...,# of words,Average sentence length,Word Frequency Mean,Word Frequency Standard Deviation,Word Contextual Diversity Mean,Word Contextual Diversity Standard Deviation,Age of Acquistion Mean Average,Age of Acquistion SD Average,Yngve,Frazier
0,taukdial-010-1,0.092857,0.071429,0.064286,0.092857,0.0,0.092857,0.128571,0.007143,0.192857,...,140,70.0,4.182455,0.74032,3.590402,0.431976,4.346604,1.83566,4.007143,1.942857
1,taukdial-010-2,0.025316,0.113924,0.012658,0.037975,0.0,0.101266,0.164557,0.0,0.21519,...,79,26.333333,3.563613,0.855218,3.192142,0.645295,4.218182,1.747273,2.696203,1.797468
2,taukdial-010-3,0.046875,0.125,0.046875,0.078125,0.0,0.09375,0.078125,0.015625,0.171875,...,64,12.8,3.927846,0.94072,3.401054,0.560403,4.482778,1.879444,1.71875,2.101562
3,taukdial-012-1,0.063265,0.093878,0.07551,0.136735,0.0,0.061224,0.1,0.0,0.159184,...,490,10.425532,4.110599,0.863024,3.525867,0.51588,4.790613,1.855031,1.555102,1.479592
4,taukdial-012-2,0.062201,0.052632,0.08134,0.119617,0.0,0.038278,0.129187,0.0,0.133971,...,209,11.611111,4.107627,0.791883,3.519467,0.456063,4.629506,1.890741,1.435407,1.808612


In [64]:
from google.colab import files
df.to_csv("test_taukadialEnglish_AllLinguisticFeatures.csv",index=None)
files.download("test_taukadialEnglish_AllLinguisticFeatures.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#blabla feats

In [None]:
import stanza

# Download the English model for Stanza
stanza.download('en')

# Initialize the pipeline
nlp = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse', use_gpu=False)

# Process text and extract features
def extract_features(text):
    doc = nlp(text)

    inflected_verbs = 0
    auxiliary_verbs = 0
    gerund_verbs = 0
    participles = 0
    clauses = 0
    noun_phrases = 0
    verb_phrases = 0
    infinitive_phrases = 0
    prepositional_phrases = 0
    dependent_clauses = 0
    discourse_markers = 0

    for sent in doc.sentences:
        # Count verbs and their types
        for word in sent.words:
            if 'VB' in word.upos:  # VB: Verb, base form
                inflected_verbs += 1
            elif 'VBD' in word.upos or 'VBN' in word.upos:  # VBD: Verb, past tense, VBN: Verb, past participle
                participles += 1
            elif 'VBG' in word.upos:  # VBG: Verb, gerund or present participle
                gerund_verbs += 1
            elif 'MD' in word.upos:  # MD: Modal auxiliary
                auxiliary_verbs += 1

            # Count discourse markers
            if word.deprel == 'discourse':
                discourse_markers += 1

        # Count clauses
        clauses += len(sent.dependencies)

        # Count noun phrases, verb phrases, infinitive phrases, prepositional phrases, and dependent clauses
        for token in sent.tokens:
            if token.misc is not None:
                if 'NP' in token.misc:
                    noun_phrases += 1
                elif 'VP' in token.misc:
                    verb_phrases += 1
                elif 'VBG' in token.misc:  # VB or VBG might indicate infinitive phrase
                    infinitive_phrases += 1
                elif 'PP' in token.misc:
                    prepositional_phrases += 1
                elif 'SBAR' in token.misc:
                    dependent_clauses += 1

    total_tokens = sum(len(sent.words) for sent in doc.sentences)
    clause_rate = clauses / total_tokens
    noun_phrase_rate = noun_phrases / total_tokens
    verb_phrase_rate = verb_phrases / total_tokens
    infinitive_phrase_rate = infinitive_phrases / total_tokens
    prepositional_phrase_rate = prepositional_phrases / total_tokens
    dependent_clause_rate = dependent_clauses / total_tokens
    discourse_marker_rate = discourse_markers / total_tokens

    features = {
        'Proportion of inflected verbs': inflected_verbs / total_tokens,
        'Proportion of auxiliary verbs': auxiliary_verbs / total_tokens,
        'Proportion of gerund verbs': gerund_verbs / total_tokens,
        'Proportion of participles': participles / total_tokens,
        'Number of clauses': clauses,
        'Clause rate': clause_rate,
        'Noun phrase rate': noun_phrase_rate,
        'Verb phrase rate': verb_phrase_rate,
        'Infinitive phrase rate': infinitive_phrase_rate,
        'Prepositional phrase rate': prepositional_phrase_rate,
        'Dependent clause rate': dependent_clause_rate,
        'Discourse marker rate': discourse_marker_rate
    }

    return features

f = open("english_demo_text.txt","r")
text = f.read()

# Extract features
features = extract_features(text)

# Print features
for feature, value in features.items():
    print(f'{feature}: {value}')


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources.
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


Proportion of inflected verbs: 0.0
Proportion of auxiliary verbs: 0.0
Proportion of gerund verbs: 0.0
Proportion of participles: 0.0
Number of clauses: 101
Clause rate: 1.0
Noun phrase rate: 0.0
Verb phrase rate: 0.0
Infinitive phrase rate: 0.0
Prepositional phrase rate: 0.0
Dependent clause rate: 0.0
Discourse marker rate: 0.009900990099009901
