### Extracting linguistic features using spaCy
Assignment 1, Language analytics, CDS 2024
Laura Givskov Rahbek 

#### Set up and import packages

In [2]:
import spacy
import os 
import pandas as pd
import glob
import re;

In [3]:
# Load the spaCy pipeline from the en_core_web_sm package. The resulting
# `nlp` object is called on every text below to obtain the tokens'
# part-of-speech tags and the document's named entities.
# Alternative model, left disabled: nlp = spacy.load("en_core_web_md")
import en_core_web_sm
nlp = en_core_web_sm.load()

#### Define functions

##### Relative frequency of word type in a text

In [4]:
def rel_freq(count, len_doc):
    """Return the relative frequency of a word type per 10,000 tokens.

    Args:
        count: Number of tokens of the word type in the text.
        len_doc: Total number of tokens in the text.

    Returns:
        ``count / len_doc * 10000`` rounded to two decimals, or ``0.0`` when
        ``len_doc`` is 0 (an empty text has nothing to normalise by).
    """
    if len_doc == 0:
        # An empty document would otherwise raise ZeroDivisionError.
        return 0.0
    return round((count / len_doc * 10000), 2)

##### Retrieving number of unique persons, locations and organisations in a text

In [5]:
def unique_NE(doc):
    """Count the unique persons, locations and organisations in a document.

    Entities are de-duplicated on their (text, label) pair, so an entity
    mentioned several times with the same label is counted once.

    Args:
        doc: Object exposing ``ents``, an iterable of entities that each have
            ``text`` and ``label_`` attributes (e.g. a spaCy ``Doc``).

    Returns:
        A list ``[n_person, n_gpe, n_org]`` holding the number of unique
        entities labelled ``PERSON``, ``GPE`` and ``ORG`` respectively. A
        label that does not occur in the document contributes 0.
    """
    # A set drops repeated mentions of the same (text, label) pair.
    unique_entities = {(ent.text, ent.label_) for ent in doc.ents}
    found_labels = [label for _, label in unique_entities]

    # Fixed label order so callers can unpack the result positionally.
    return [found_labels.count(label) for label in ("PERSON", "GPE", "ORG")]

#### Extract linguistic features for each text

In [6]:
# For every subfolder of the corpus, extract per-text linguistic features
# (relative POS frequencies and unique named-entity counts) and write them
# to ../out/<subfolder>.csv, one row per text file.
filepath = os.path.join("..", "in", "USEcorpus")

out_dir = os.path.join("..", "out")
os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing folders

columns = ["Filename", "RelFreq NOUN", "RelFreq VERB", "RelFreq ADJ", "RelFreq ADV",
           "Unique PER", "Unique LOC", "Unique ORG"]
pos_tags = ("NOUN", "VERB", "ADJ", "ADV")  # same order as the RelFreq columns

for subfolder in sorted(os.listdir(filepath)):
    subfolder_path = os.path.join(filepath, subfolder)

    # Skip stray files (e.g. hidden system files) sitting next to the subfolders.
    if not os.path.isdir(subfolder_path):
        continue

    outpath = os.path.join(out_dir, f"{subfolder}.csv")

    # Collect rows in a list and build the DataFrame once per subfolder.
    rows = []

    for file in sorted(glob.glob(os.path.join(subfolder_path, "*.txt"))):

        # Read the whole text, then release the file handle before the NLP work.
        with open(file, "r", encoding="latin-1") as f:
            text = f.read()

        # Remove metadata tags such as <doc id=...>. The "." is required:
        # without it the pattern only strips ">" characters and leaves the
        # tag contents in the text.
        text = re.sub(r'<.*?>', '', text)
        doc = nlp(text)  # make text into doc

        len_doc = len(doc)  # total token count

        # basename works regardless of the platform's path separator
        text_name = os.path.basename(file)

        # Count the tokens of each part of speech of interest.
        pos_counts = dict.fromkeys(pos_tags, 0)
        for token in doc:
            if token.pos_ in pos_counts:
                pos_counts[token.pos_] += 1

        noun_rel, verb_rel, adj_rel, adv_rel = (
            rel_freq(pos_counts[tag], len_doc) for tag in pos_tags
        )

        # unique_NE returns the counts for PERSON, GPE and ORG, in that order.
        per, loc, org = unique_NE(doc)

        rows.append([text_name, noun_rel, verb_rel, adj_rel, adv_rel, per, loc, org])

    out_df = pd.DataFrame(rows, columns=columns)
    out_df.to_csv(outpath)