In [1]:
import csv
import pandas as pd
import spacy
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

In [2]:
import json

In [3]:
nlp = spacy.load('en_core_web_sm')
stops = stopwords.words('english')
stops.extend(["i'm", "he's", "i've", "i'll"])

In [4]:
def parse(rows):
    for row in rows:
        doc = nlp(row['Text'])
        row['tokens'] = []
        for token in doc:
            if token.pos_ != 'PUNCT' and token.is_stop is False:
                tokens = dict()
                tokens['word'] = token.text
                tokens['pos'] = token.pos_
                tokens['lemma'] = token.lemma_
                row['tokens'].append(tokens)
    return rows

In [5]:
def pos(rows):
    rows = parse(rows)
    for row in rows:
        pos_distribution = dict()
        for token in row['tokens']:
            if token['pos'] in pos_distribution.keys():
                pos_distribution[token['pos']] += 1
            else:
                pos_distribution[token['pos']] = 1
        row['pos'] = pos_distribution
    return rows

In [6]:
def phrase_len(rows):
    for row in rows:
        phrase = row['Text']
        words = [w.lower() for w in word_tokenize(phrase) if w.isalpha()]
        row['len'] = len(words)
    return rows

In [7]:
def word_order(sent):
    d = nlp(sent)
    roots = [token for token in d if token.head == token]
    orders = []
    for root in roots:
        d = {}
        d['V'] = root.i
        for child in root.children:
            if child.dep_ == 'nsubj':
                d['S'] = child.i
            if child.dep_== 'dobj':
                d['O'] = child.i
        listt = sorted(d.keys(),key=d.get)
        order = ''.join(listt)
        orders.append(order)
    return orders

In [8]:
def quotes_order(rows):
    for row in rows:
        row['word_order'] = word_order(row['Text'])
    return rows

In [9]:
punct_list = '!"«»“”#$%&\–-–—()*+,./\:;<=>?@[]^_`{|}~1234567890'

def clean(text):
    text = text.lower()
    for char in text:
        if char in punct_list:
            text = text.replace(char, '')
    return text

In [18]:
# функция находящая полезные слова
def extract_words(text):
    words = clean(text).split()
    good_words = []
    for word in words:
        if word not in stops:
            good_words.append(word) 
    return good_words

In [10]:
def popular_words(rows):
    txt_ps = []
    txt_pe = []
    txt_np = []
    
    populars = {'pricesses':{}, 'princes':{}, 'non-p':{}}

    for row in rows:
        # принцессы
        if row['Speaker_Status'] == 'PRINCESS':
            txt_ps.extend(extract_words(row['Text']))
            
        freq_ps = Counter(txt_ps)
        freq_psSorted = sorted(freq_ps.items(), key = lambda x: x[1], reverse=True)
        for i in freq_psSorted[:50]:
            populars['pricesses'][i[0]] = freq_ps[i[0]]

        # принцы
        if row['Speaker_Status'] == 'PRINCE':
            txt_pe.extend(extract_words(row['Text']))
            
            freq_pe = Counter(txt_pe)
            freq_peSorted = sorted(freq_pe.items(), key = lambda x: x[1], reverse=True)
            for i in freq_peSorted[:50]:
                populars['princes'][i[0]] = freq_ps[i[0]]

        # челядь
        if row['Speaker_Status'] == 'NON-P':
            txt_np.extend(extract_words(row['Text']))
            
            freq_np = Counter(txt_np)
            freq_npSorted = sorted(freq_np.items(), key = lambda x: x[1], reverse=True)
            for i in freq_npSorted[:50]:
                populars['non-p'][i[0]] = freq_ps[i[0]]
                
    return populars

In [12]:
rows = []

with open('princess_corpus.csv', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        row = dict(row)
        rows.append(row)

In [None]:
data = pos(parse(rows))

In [None]:
data = phrase_len(rows)

In [None]:
data = quotes_order(rows)

In [None]:
with open('new_dataset.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

In [19]:
populars = popular_words(rows)

In [21]:
with open('popular_words.json', 'w', encoding='utf-8') as f:
    json.dump(populars, f, indent=2, ensure_ascii=False)

In [20]:
print(populars)

{'pricesses': {'wishing': 6, 'find': 25, 'ah': 4, 'today': 3, 'ahahahahahh': 4, 'well': 55, 'wish': 17, 'one': 51, 'love': 48, 'hoping': 2, 'nice': 2, 'things': 7, "he'll": 3, 'say': 34, 'want': 52, 'know': 104, 'secret': 1, 'promise': 1, 'tell': 28, 'standing': 1, 'make': 3, "that's": 39, 'hear': 1, 'echoing': 1, 'soon': 1, 'come': 74, 'true': 27, 'dreaming': 1, 'ahh': 1, 'oh': 188, 'song': 9, 'mama': 2, 'papa': 21, 'goodbye': 16, 'hello': 2, "what's": 14, 'matter': 4, "where's": 1, 'believe': 1, 'lost': 1, 'please': 68, 'cry': 1, 'perk': 1, 'smile': 5, 'better': 4, "can't": 57, 'far': 1, 'fly': 1, 'understand': 13, 'butbut': 1, 'away': 30, 'sing': 18, 'life': 23, 'world': 15, 'along': 4, 'like': 72, 'sure': 10, 'sleep': 4, "there's": 9, 'little': 39, 'seven': 4, 'children': 4, 'look': 35, 'tsk': 12, 'would': 37, 'maybe': 16, 'stay': 7, 'house': 3, 'mother': 25, 'use': 2, 'fireplace': 2, 'broom': 2, 'room': 2, 'clean': 2, 'happy': 3, 'adorable': 2, 'guess': 2, 'sleepy': 3, 'mean': 31,