In [69]:
import pandas as pd
import numpy as np
import nltk
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords

survey_data = pd.read_excel('data/raw_data.xlsx')


def clean_data(data):
    # This function takes an array of strings and returns an array of cleaned up strings
    cleaned_data = []
    for row,texts in enumerate(data):
        texts = str(texts)
        texts = texts.lower()
        # remove special characters
        texts = texts.replace(r"(http|@)\S+", "")
        texts = texts.replace(r"::", " ")
        texts = texts.replace(r"’", "")
        texts = texts.replace(r",", " ")
        texts = texts.replace(r"[^a-z\':_]", " ")
        # remove repetition
        #pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
        #texts = texts.replace(pattern, r"\1")
        # Transform short negation form
        texts = texts.replace(r"(can't|cannot)", 'can not')
        texts = texts.replace(r"n't", ' not')
        # Remove stop words
        stopwords = nltk.corpus.stopwords.words('english')
        stopwords.remove('not')
        stopwords.remove('nor')
        stopwords.remove('no')
        cleaned_line = ''
        for word in texts.split(" "):
            if word not in stopwords:
                cleaned_line = cleaned_line + " " + word
        cleaned_data.append(cleaned_line)
    return cleaned_data

columns_with_open_responses = ['Q02', 'Q04', 'Q06', 'Q08', 'Q10', 'Q12', 'Q14', 'Q16', 'Q18']
data = survey_data[columns_with_open_responses]
data = data.reset_index()
data = data[1:]
data = data.drop(columns = ['index'])
all_values = []
for column in data:
    this_column_values = data[column].tolist()
    all_values += this_column_values
one_column_df = pd.DataFrame(all_values)
one_column_df = one_column_df.dropna()
data = one_column_df.reset_index()
data = data.drop(columns = ['index'])
data = np.array(data)
data = clean_data(data)

In [70]:
data

[' ["i not provided info yet"]',
 " ['not able access ato site complete basic reporting  pathetic waste time.repeatedly denied access shut   .. truly waste time utter garbag!!! someone fet paid  taxpayers money put crap together joke  think people hours waste trying sort shit out??!! stipped sending bas forms? dont want paet lame malfunctioning computer system ! regulwr paper bas forms remove worthless shit computer system!!e!!']",
 ' ["the operator great i\'d code pkr040"]',
 " ['as initiatives  concerns not handle technology well ']",
 " ['cant login ']",
 " ['information requests keep piling due date!']",
 " ['i writing behalf partner parkinsons disease. according ndis must access centrelink himself. neurological problems make difficult. frozen account three times. trying open another new account  site wont allow this. easier way/']",
 " ['seems except layout first glance']",
 " ['im trying create account months always come back wrong attempts im lock ']",
 " ['due fact gov login pa

In [81]:
def clean_up(data):
    return_data = []
    for line in data:
        new_line = ""
        for word in line.split(" "):
            if "\n" in word:
                new_word = word.replace("\n"," ")
            else:
                new_word = word
            new_line = new_line + " " + new_word
        return_data.append(new_line)
    return return_data

In [98]:
import nltk

text = clean_up(data)
all_string = ''
for sentences in text:
    sentences = sentences.replace("[","")
    sentences = sentences.replace("]","")
    sentences = sentences.replace("\'","")
    sentences = sentences.replace("!","")
    sentences = sentences.replace("\\","")
    all_string = all_string + sentences

text

['  ["i not provided info yet"]',
 "  ['not able access ato site complete basic reporting  pathetic waste time.repeatedly denied access shut   .. truly waste time utter garbag!!! someone fet paid  taxpayers money put crap together joke  think people hours waste trying sort shit out??!! stipped sending bas forms? dont want paet lame malfunctioning computer system ! regulwr paper bas forms remove worthless shit computer system!!e!!']",
 '  ["the operator great i\'d code pkr040"]',
 "  ['as initiatives  concerns not handle technology well ']",
 "  ['cant login ']",
 "  ['information requests keep piling due date!']",
 "  ['i writing behalf partner parkinsons disease. according ndis must access centrelink himself. neurological problems make difficult. frozen account three times. trying open another new account  site wont allow this. easier way/']",
 "  ['seems except layout first glance']",
 "  ['im trying create account months always come back wrong attempts im lock ']",
 "  ['due fact go

In [92]:
stopwords = nltk.corpus.stopwords.words("english")

words = [w.lower() for w in nltk.word_tokenize(all_string) if w.lower().isalpha() if w.lower() not in stopwords]

#words: list[str] = nltk.word_tokenize(all_string)

# frequency distribution
fd = nltk.FreqDist(words)
df = pd.DataFrame(list(fd.items()), columns = ["Word","Frequency"]) 
df.to_csv('data/output/frequency_distribution.csv')
fd

FreqDist({'mygov': 175, 'information': 172, 'get': 149, 'use': 130, 'account': 116, 'access': 109, 'help': 108, 'login': 98, 'site': 88, 'could': 86, ...})

In [93]:
# Trigram finder
finder = nltk.collocations.TrigramCollocationFinder.from_words(words)

finder.ngram_fd.most_common(5)
finder.ngram_fd.tabulate(5)

('information', 'curious', 'website')          ('code', 'generator', 'app')        ('name', 'frank', 'qualtrics')    ('frank', 'qualtrics', 'canberra')   ('qualtrics', 'canberra', 'please') 
                                    8                                     8                                     8                                     8                                     8 


In [94]:
from nltk.sentiment import SentimentIntensityAnalyzer
from statistics import *
sia = SentimentIntensityAnalyzer()

def extract_features(text):
    features = dict()
    compound_scores = list()
    positive_scores = list()
    negative_scores = list()

    for sentence in nltk.sent_tokenize(text):
        compound_scores.append(sia.polarity_scores(sentence)["compound"])
        positive_scores.append(sia.polarity_scores(sentence)["pos"])
        negative_scores.append(sia.polarity_scores(sentence)["neg"])

    # Adding 1 to the final compound score to always have positive numbers
    # since some classifiers you'll use later don't work with negative numbers.
    features["mean_compound"] = mean(compound_scores)
    features["mean_positive"] = mean(positive_scores)
    features["mean_negative"] = mean(negative_scores)
    return features
features = extract_features(all_string)

In [100]:
# Calling the polarity_scores method on sid and passing in the message_text outputs a dictionary with negative, neutral, positive, and compound scores for the input text
scores = sia.polarity_scores(all_string)
scores

{'neg': 0.109, 'neu': 0.709, 'pos': 0.182, 'compound': 1.0}

In [96]:
all_scores = (features.update(scores))
df = pd.DataFrame(list(features.items()), columns = ["Metric","Score"]) 
df.to_csv('data/output/vader_polarity_scores.csv')

In [97]:
df

Unnamed: 0,Metric,Score
0,mean_compound,0.033472
1,mean_positive,0.128755
2,mean_negative,0.125654
3,neg,0.109
4,neu,0.709
5,pos,0.182
6,compound,1.0
