In [77]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter


In [28]:
from nltk.stem.porter import PorterStemmer
import re
import nltk
from nltk.tokenize import word_tokenize
import pkg_resources
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from symspellpy import SymSpell, Verbosity

# Shared SymSpell instance used by func_spell for typo correction.
sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
# English frequency dictionary file looked up inside the symspellpy package.
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
# Only load when the instance holds no words yet.
if not sym_spell.word_count:
    # load_dictionary signals a missing file through its boolean return value
    # instead of raising; fail loudly here, because with an empty dictionary
    # func_spell would drop every token that does not contain 'covid'.
    if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
        raise FileNotFoundError(
            "SymSpell dictionary could not be loaded from {}".format(dictionary_path))

# normalization
def func_norm(s):
    """
    Lower-case the text and run a fixed sequence of regex clean-ups on it.

    Parameters
    ----------
    s:  str
        text to operate on

    Returns
    -------
        Normalised string with leading/trailing whitespace stripped

    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'([a-z])\1{2,}', r'\1'),     # a letter repeated 3+ times -> one letter
        (r'([\W+])\1{1,}', r'\1'),     # a repeated non-word character -> one
        (r' ing ', ' '),               # stand-alone " ing " treated as noise
        (r'(.{2,}?)\1{1,}', r'\1'),    # an immediately repeated 2+ char run -> one copy
    )
    text = s.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def func_punc(w_list):
    """
    Keep only the tokens that consist entirely of alphabetic characters.

    Tokens containing punctuation, digits or any other non-letter
    character are dropped as a whole.

    Parameters
    ----------
    w_list: list
        list of tokens to be processed

    Returns
    -------
        list containing only the purely alphabetic tokens, in input order
    """
    alphabetic = []
    for token in w_list:
        if not token.isalpha():
            continue
        alphabetic.append(token)
    return alphabetic

def func_stopf(w_list):
    """
    Filter out English stop words.

    Parameters
    ----------
    w_list: list
        list of tokens to be processed

    Returns
    -------
        list of the tokens that are not in NLTK's English stop word list
    """
    # The stop word set is rebuilt from the NLTK corpus on every call.
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in w_list:
        if token in english_stops:
            continue
        kept.append(token)
    return kept

# stemming
# Single Porter stemmer instance created at module level; func_stem below
# reuses it for every token.
pstem = PorterStemmer()

def func_stem(w_list):
    """
    Stem every word with the module-level Porter stemmer.

    Parameters
    ----------
    w_list: list
        word list for stemming

    Returns
    -------
        list of stemmed words, same length and order as the input
    """
    stemmed = []
    for word in w_list:
        stemmed.append(pstem.stem(word))
    return stemmed

# selecting nouns
def func_noun(w_list):
    """
    Keep only the words that NLTK's POS tagger labels as nouns.

    in: word list to be processed
    out: list of the words whose POS tag starts with 'NN'
    """
    nouns = []
    for token, tag in nltk.pos_tag(w_list):
        if tag.startswith('NN'):
            nouns.append(token)
    return nouns

# spell checker/typo correction
def func_spell(w_list):
    """
    Correct typos with the module-level SymSpell instance.

    in: word list to be processed
    out: list where each word is replaced by its first SymSpell suggestion;
         words containing 'covid' are kept unchanged, and words for which
         SymSpell returns no suggestion are dropped
    """
    corrected = []
    for token in w_list:
        if 'covid' not in token:
            matches = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=3)
            if not matches:
                # no suggestion -> the word is left out of the result
                continue
            token = matches[0].term
        corrected.append(token)
    return corrected

def preprocess_sent(rw):
    """
    Sentence-level preprocessing of a raw text.

    Currently this is only the normalisation done by func_norm.

    Parameters
    ----------
        rw: str
            sentence to be processed

    Returns
    -------
        sentence level pre-processed text
    """
    return func_norm(rw)



def preprocess_word(s):
    """
    Word-level preprocessing of an already normalised sentence.

    The sentence is tokenised and the tokens are passed through the
    filtering steps in a fixed order.

    Parameters
    ----------
    s:  str
        sentence to be processed

    Returns
    -------
        list of processed tokens, or None when `s` is empty/falsy
    """
    if not s:
        return None

    # Order matters: each step receives the previous step's output.
    # NOTE(review): stop words are filtered after stemming, so they are
    # compared in stemmed form - confirm this is intended.
    pipeline = (func_punc, func_noun, func_spell, func_stem, func_stopf)
    tokens = word_tokenize(s)
    for step in pipeline:
        tokens = step(tokens)
    return tokens

def preprocess(docs):
    """
    Preprocess the data.

    Each document is normalised with preprocess_sent and tokenised/filtered
    with preprocess_word. Documents whose token list comes back empty (or
    None) are left out of BOTH returned lists, so the results can be shorter
    than `docs`.

    Parameters
    ----------
    docs: list or pd.Series
        documents (str) to be preprocessed, processed in positional order

    Returns
    -------
    sentences: list
        sentence level preprocessed text of every kept document
    token_lists: list
        word level preprocessed tokens of every kept document
    """
    print('Preprocessing raw texts ...')
    # Progress is measured against the documents actually passed in,
    # not against any notebook-level global.
    n_docs = len(docs)
    sentences = []  # sentence level preprocessed
    token_lists = []  # word level preprocessed
    # Plain iteration visits a list and a pandas Series alike in positional order.
    for i, doc in enumerate(docs):
        sentence = preprocess_sent(doc)
        token_list = preprocess_word(sentence)
        if token_list:
            sentences.append(sentence)
            token_lists.append(token_list)
        # '\r' rewrites the same output line on every iteration
        print('{} %'.format(str(np.round((i + 1) / n_docs * 100, 2))), end='\r')
    print('Preprocessing raw texts. Done!')
    return sentences, token_lists

## Utils: group-comparison statistics and sentiment scoring

In [176]:
import scikit_posthocs as sp
from scipy import stats
from textblob import TextBlob

def kruskal_wallis_func(in_df, group_col, test_col):
    """
    Kruskal Wallis test and
    post-hoc Dunn's.

    Works for any number of groups (two or more). The post-hoc Dunn's test
    (Bonferroni adjusted) is only run when there are more than two groups.

    Parameters
    ----------
    in_df:  pd DataFrame
        input dataframe
    group_col:  str
        name of group column
    test_col:   str
        name of column containing
        relevant values

    Returns
    -------
    statistic: float
        Kruskal-Wallis H statistic
    pval: float
        p value of the Kruskal-Wallis test
    posthoc: pd DataFrame or None
        pairwise Dunn's p values (groups labelled 1..k in the order of
        `key`); None when there are only two groups
    key: list or None
        group names in the order used for `posthoc`; None when there are
        only two groups

    Raises
    ------
    ValueError
        if fewer than two groups are present
    """
    # One column per group; entries are NaN where a row belongs to another group.
    data = in_df.pivot(columns=group_col, values=test_col)
    n_groups = data.shape[1]
    if n_groups < 2:
        raise ValueError(
            "Kruskal-Wallis test needs at least two groups, got {}.".format(n_groups))

    samples = [data.iloc[:, i] for i in range(n_groups)]
    # nan_policy='omit' drops the NaN padding introduced by the pivot
    statistic, pval = stats.kruskal(*samples, nan_policy='omit')

    if n_groups > 2:
        posthoc = sp.posthoc_dunn(
            [sample.dropna() for sample in samples],
            p_adjust='bonferroni'
        )
        key = list(data.columns)
    else:
        posthoc = None
        key = None
    return statistic, pval, posthoc, key


def get_sentiment(in_df, in_col):
    """
    Get subjectivity
    and polarity scores.

    Each entry of `in_col` is converted to str and scored once with
    TextBlob; both scores are taken from that single result.

    Parameters
    ----------
    in_df:  pd DataFrame
        DataFrame to operate on
    in_col: str
        column holding text data
        to operate on

    Returns
    -------
    Copy of the input DataFrame with
    `polarity` and `subjectivity` columns
    added (the input is not modified).

    """
    print("Getting sentiment scores...")
    # Score every text exactly once; the plain lists below are assigned
    # positionally, i.e. row by row in the frame's existing order.
    sentiments = [TextBlob(text).sentiment for text in in_df[in_col].astype('str')]
    in_df = in_df.assign(
        polarity=[sentiment.polarity for sentiment in sentiments],
        subjectivity=[sentiment.subjectivity for sentiment in sentiments],
    )
    print("Done!")
    return in_df


In [1]:
# Machine switch: a truthy `home` selects the C: drive directories,
# a falsy value selects the P: drive directories.
home = 1
if not home:
    infiledir = r"P:\EW_analysis\analysis\Processed_2"
    writing_dir = r"P:\EW_analysis\analysis\writing\writing_data"
else:
    infiledir = r"C:\Users\Luzia T\UCL\WorkingFromHome\Possible_online_studies\NLP_expressive_writing\analysis\Processed_2"
    writing_dir = r"C:\Users\Luzia T\UCL\WorkingFromHome\Possible_online_studies\NLP_expressive_writing\analysis\writing_data\statements"

In [72]:
# Load the writing samples from writing_df.csv inside writing_dir.
# Later cells read the `writing` and `Group` columns of this frame.
writing_df = pd.read_csv(os.path.join(writing_dir, 'writing_df.csv'))

In [179]:
# Sentence- and token-level preprocessing of the raw writing samples.
sentences, tokens = preprocess(writing_df.writing)
writing_df = writing_df.assign(
    writing_tokens=tokens,
    writing_sents=sentences,
)
writing_df = writing_df.assign(
    # number of tokens left after preprocessing
    word_count=writing_df.writing_tokens.apply(len),
    # whitespace-delimited word count of the raw text
    word_count_raw=writing_df.writing.apply(lambda x: len(x.split())),
)
writing_df = get_sentiment(writing_df, 'writing')

# Check each measure for significant differences between conditions.
for val in ['word_count', 'word_count_raw', 'polarity', 'subjectivity']:
    _, pval, posthoc, key = kruskal_wallis_func(writing_df, 'Group', val)
    print(f"\nP value ({val}) is {pval}.")
    if pval < 0.05:
        print(f"Conditions differ significantly on {val}.")
        # kruskal_wallis_func returns posthoc/key as None when no post-hoc
        # test was run (two groups), so only report them when present.
        if posthoc is not None:
            print(f"Posthoc ({val}) is:\n{posthoc}.")
            key_text = ", ".join(
                f"{number} = {name}" for number, name in enumerate(key, start=1)
            )
            print(f"The key is {key_text}")
    else:
        print(f"No significant between group differences on {val}.")


P value (word_count) is 0.08764873320117955.
No significant between group differences on word_count.

P value (word_count_raw) is 0.0029349359121382448.
Conditions differ significantly on word_count_raw.
Posthoc (word_count_raw) is:
          1         2         3
1  1.000000  0.083669  0.002315
2  0.083669  1.000000  0.748153
3  0.002315  0.748153  1.000000.
The key is 1 = CTR, 2 = EW, 3 = EWRE

P value (polarity) is 0.03688590519079081.
Conditions differ significantly on polarity.
Posthoc (polarity) is:
          1        2         3
1  1.000000  1.00000  0.124654
2  1.000000  1.00000  0.053490
3  0.124654  0.05349  1.000000.
The key is 1 = CTR, 2 = EW, 3 = EWRE

P value (subjectivity) is 1.3978564535849075e-31.
Conditions differ significantly on subjectivity.
Posthoc (subjectivity) is:
              1             2             3
1  1.000000e+00  1.510864e-25  1.931276e-23
2  1.510864e-25  1.000000e+00  1.000000e+00
3  1.931276e-23  1.000000e+00  1.000000e+00.
The key is 1 = CTR, 2 