In [1]:
# nltk.download()

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import string
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud, STOPWORDS

### Plot year

In [12]:
#function to display values on barcharts
def show_values(axs, orient="v", space=.01):
    """Write each bar's value next to it on one or more bar-chart axes.

    Parameters
    ----------
    axs : Axes-like object or numpy.ndarray of such objects
        Object(s) exposing ``patches`` (the bars) and ``text``. An array is
        labelled element by element.
    orient : {"v", "h"}, default "v"
        ``"v"`` labels each bar with its height, centred horizontally and
        placed 1% of the bar height above its top. ``"h"`` labels each bar
        with its width, placed ``space`` to the right of its end. Any other
        value leaves the axes untouched.
    space : float, default 0.01
        Gap added to the x position of the label. Only used when
        ``orient == "h"``.

    Returns
    -------
    None
        Labels are added to the axes in place, formatted with no decimals
        and drawn at font size 12.
    """
    def _single(ax):
        for p in ax.patches:
            if orient == "v":
                _x = p.get_x() + p.get_width() / 2
                _y = p.get_y() + p.get_height() + (p.get_height() * 0.01)
                size, align = p.get_height(), "center"
            elif orient == "h":
                _x = p.get_x() + p.get_width() + float(space)
                _y = p.get_y() + p.get_height() - (p.get_height() * 0.5)
                size, align = p.get_width(), "left"
            else:
                # Unsupported orientation: nothing to label on this axes.
                return

            # A bar with a NaN/inf position or size has no meaningful value
            # to show and no finite place to put it, so it gets no label.
            if not (np.isfinite(_x) and np.isfinite(_y)):
                continue

            ax.text(_x, _y, '{:.0f}'.format(size), ha=align, fontsize=12)

    if isinstance(axs, np.ndarray):
        for _, ax in np.ndenumerate(axs):
            _single(ax)
    else:
        _single(axs)

## II. Feature Engineering

- Word count 

- Character count 

- Sentence count 

- Key word / unique word count 

- Cybersecurity word count 

- Numerical terms 

- Average word length 

- Average sentence length 


In [None]:
# Length of the first statement in characters (len() of the raw string, not a token count)
len(df['Statement'][0])

### 1. Word count

In [None]:
df.head()

In [None]:
df['WordCount'] = df['Statement'].apply(lambda x: len(x.split()))

In [None]:
# Average word count per statement
df['WordCount'].mean()

In [None]:
df['Statement'][0]

In [None]:
# Character count???
def character_count(text):
    """Return the number of non-whitespace characters in ``text``.

    The text is split on whitespace and the lengths of the resulting words
    are added up, so spaces, tabs and newlines are not counted.
    """
    return sum(len(word) for word in text.split())

In [None]:
saying = ['After', 'all', 'is', 'said', 'and', 'done',
        'more', 'is', 'said', 'than', 'done']
tokens = set(saying)
print(tokens)

tokens = sorted(tokens)
tokens[-2:]

### Text Preprocessing

In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def preprocess(text):
    """Normalise raw English text into a space-separated string of lemmas.

    The text is tokenised into words, lower-cased, stripped of every
    character in ``string.punctuation``, filtered down to purely alphabetic
    tokens, cleared of NLTK's English stop words and finally lemmatised.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (empty string when
        nothing survives the filters).
    """
    # Tokenise into words and lower-case them.
    tokens = [w.lower() for w in word_tokenize(text)]

    # Delete punctuation characters inside each token ("don't" -> "dont").
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]

    # Drop tokens that are empty or contain anything other than letters.
    words = [word for word in stripped if word.isalpha()]

    # Filter out English stop words.
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]

    # Lemmatise; no POS tag is supplied, so the lemmatizer's default
    # part of speech is used for every word.
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

In [None]:
df['CleanStatement'] = df['Statement'].apply(lambda x: preprocess(x))

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df[df['NotificationDate'].isnull()]

### Get sentiment using SiEBERT - English-Language Sentiment Classification

In [None]:
# Ref: https://huggingface.co/siebert/sentiment-roberta-large-english

from transformers import pipeline
sentiment_analysis = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")
# print(sentiment_analysis("I love this!"))

In [None]:
# Create a function to get sentiment
def get_sentiment(text):
    """Classify a statement by majority vote over its sentences.

    Each sentence of ``text`` is passed to ``sentiment_analysis`` on its
    own. A sentence labelled ``'NEGATIVE'`` counts as negative; any other
    label counts as positive.

    Returns
    -------
    int
        1 when positive sentences strictly outnumber negative ones,
        otherwise 0 (this includes ties and text with no sentences).
    """
    positive_votes = 0
    negative_votes = 0

    for sentence in sent_tokenize(text):
        # The pipeline returns a list; the first entry holds the label.
        label = sentiment_analysis(sentence)[0]['label']
        if label == 'NEGATIVE':
            negative_votes += 1
        else:
            positive_votes += 1

    return 1 if positive_votes > negative_votes else 0

In [None]:
df['Siebert_Sentiment'] = df['Statement'].apply(lambda x: get_sentiment(x))

In [None]:
df['Siebert_Sentiment'].value_counts(normalize=True)

In [None]:
df.head()

In [None]:
value_counts = df['Siebert_Sentiment'].value_counts()

# Create bar plot
plt.figure(figsize=(6, 4))
p = sns.barplot(x=value_counts.index, y=value_counts.values, color='#035397')
show_values(p)
plt.xlabel('Sentiment')
plt.ylabel('Frequency')
plt.title('Sentiment Analysis of Statements')
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.show()


In [None]:
df['Length'].max()

### Import data breaches CSV file

In [None]:
# Import dataframe with Description split into 2 columns: DescriptionClean (text only - need more cleaning) & WebsiteLink

columns_to_read = ['Name', 'Day', 'Month', 'Year', 'NotificationDate', 'WebpageTitle', 'Author',
                   'DetailedExplanation', 'Whitewashing', 'Apology', 'Compensation',
                  'ResponsiveAction', 'ValueCommitment', 'CustomerFocus', 'OpenDisclosure', 'CustomerAdvice']
df_st = pd.read_csv('ausdatabreach2018-23v1.csv', usecols=columns_to_read, encoding='latin1')
df_st.info()

In [None]:
df_st.head()

In [None]:
df_st.tail()

In [None]:
df['Name'].equals(df_st['Name'])

In [None]:
comparison = df['Name'] != df_st['Name']

# Get the indices of the differing rows
differing_rows = comparison[comparison].index

# Print the differing rows
print(differing_rows)

In [None]:
df_st['Name'].iloc[39:]

In [None]:
df['Name'].iloc[39:]

### EDA

In [None]:
df = df_st

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
import seaborn as sns

# Calculate value counts
value_counts = df['Year'].value_counts()

# Create bar plot
p = sns.barplot(x=value_counts.index, y=value_counts.values, color='#035397')
show_values(p)
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.title('Number of breaches by year')
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.show()

### Percentage of statements showing each characteristic

In [None]:
df['DetailedExplanation'].value_counts(normalize=True)[1]*100

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Set up the figure and axes
fig, ax = plt.subplots()

boolean_col = ['DetailedExplanation', 'Whitewashing', 'Apology', 
               'Compensation', 'ResponsiveAction', 'ValueCommitment',
                'CustomerFocus', 'OpenDisclosure', 'CustomerAdvice']
sns.countplot(data=df, x="DetailedExplanation")
plt.show()

In [None]:
# Coded (0/1) response characteristics to summarise.
boolean_col = ['DetailedExplanation', 'Whitewashing', 'Apology', 
               'Compensation', 'ResponsiveAction', 'ValueCommitment',
                'CustomerFocus', 'OpenDisclosure', 'CustomerAdvice']

# Percentage of non-missing rows coded 1 for each characteristic.
# .get(1, 0.0) yields 0% instead of raising KeyError when a characteristic
# is never coded 1 in the data.
values = []
for col in boolean_col:
    pct = df[col].value_counts(normalize=True).get(1, 0.0) * 100
    values.append(pct)

In [None]:
# Create bar plot

plt.figure(figsize=(10,7))
p = sns.barplot(x=boolean_col, y=values, color='#035397')
show_values(p)
plt.xlabel('Characteristics')
plt.ylabel('Percentages')
plt.title('Percentages of Characteristics')
plt.xticks(fontsize=7)
plt.yticks(fontsize=5)
plt.show()

### Response Grouping

In [None]:
# FullTransparency
def type1(row):
    """Return 1 if the row matches the FullTransparency pattern, else 0.

    The pattern is DetailedExplanation, ResponsiveAction, ValueCommitment
    and OpenDisclosure all equal to 1, with Whitewashing equal to 0.
    """
    explains = row['DetailedExplanation'] == 1
    no_whitewash = row['Whitewashing'] == 0
    acts = row['ResponsiveAction'] == 1
    commits = row['ValueCommitment'] == 1
    discloses = row['OpenDisclosure'] == 1
    return 1 if (explains & no_whitewash & acts & commits & discloses) else 0

# Apply the custom function to create a new column
df['FullTransparency'] = df.apply(type1, axis=1)

In [None]:
df[df['FullTransparency']==1].iloc[:,7:-1]

In [None]:
# Guarded
def type2(row):
    """Return 1 if the row matches the Guarded pattern, else 0.

    The pattern is Whitewashing, ResponsiveAction and ValueCommitment all
    equal to 1, with DetailedExplanation and OpenDisclosure equal to 0.
    """
    no_explanation = row['DetailedExplanation'] == 0
    whitewashes = row['Whitewashing'] == 1
    acts = row['ResponsiveAction'] == 1
    commits = row['ValueCommitment'] == 1
    no_disclosure = row['OpenDisclosure'] == 0
    return 1 if (no_explanation & whitewashes & acts & commits & no_disclosure) else 0

# Apply the custom function to create a new column
df['Guarded'] = df.apply(type2, axis=1)

In [None]:
# Opacity
def type3(row):
    """Return 1 if the row matches the Opacity pattern, else 0.

    The pattern is Whitewashing and ResponsiveAction equal to 1, with
    DetailedExplanation, ValueCommitment and OpenDisclosure equal to 0.
    """
    no_explanation = row['DetailedExplanation'] == 0
    whitewashes = row['Whitewashing'] == 1
    acts = row['ResponsiveAction'] == 1
    no_commitment = row['ValueCommitment'] == 0
    no_disclosure = row['OpenDisclosure'] == 0
    return 1 if (no_explanation & whitewashes & acts & no_commitment & no_disclosure) else 0

# Apply the custom function to create a new column
df['Opacity'] = df.apply(type3, axis=1)

In [None]:
# CustomerInterest
def type4(row):
    """Return 1 if the row matches the CustomerInterest pattern, else 0.

    The pattern is Apology, Compensation, CustomerFocus and CustomerAdvice
    all equal to 1.
    """
    apologises = row['Apology'] == 1
    compensates = row['Compensation'] == 1
    customer_focused = row['CustomerFocus'] == 1
    advises = row['CustomerAdvice'] == 1
    return 1 if (apologises & compensates & customer_focused & advises) else 0

# Apply the custom function to create a new column
df['CustomerInterest'] = df.apply(type4, axis=1)

In [None]:
# BalancedInterest
def type5(row):
    """Return 1 if the row matches the BalancedInterest pattern, else 0.

    The pattern is Apology equal to 1 with Compensation equal to 0; no
    other column is consulted.
    """
    apologises = row['Apology'] == 1
    no_compensation = row['Compensation'] == 0
    return 1 if (apologises & no_compensation) else 0

# Apply the custom function to create a new column
df['BalancedInterest'] = df.apply(type5, axis=1)

In [None]:
# CompanyInterest
def type6(row):
    """Return 1 if the row matches the CompanyInterest pattern, else 0.

    The pattern is Apology, Compensation, CustomerFocus and CustomerAdvice
    all equal to 0.
    """
    no_apology = row['Apology'] == 0
    no_compensation = row['Compensation'] == 0
    no_customer_focus = row['CustomerFocus'] == 0
    no_advice = row['CustomerAdvice'] == 0
    return 1 if (no_apology & no_compensation & no_customer_focus & no_advice) else 0

# Apply the custom function to create a new column
df['CompanyInterest'] = df.apply(type6, axis=1)

In [None]:
df.iloc[:5,7:]

### Percentage of statements in each response type

In [None]:
# Response-type indicator columns (each holds 0 or 1 per statement).
response_col = ['FullTransparency', 'Guarded', 'Opacity', 
               'CustomerInterest', 'BalancedInterest', 'CompanyInterest']

values_pct = []
for col in response_col:
    # Share of rows flagged 1. Looking the label up with .get() covers every
    # case: 0% when no row carries the flag, 100% when every row does (the
    # column then has a single distinct value, which is 1), and the plain
    # proportion otherwise.
    pct = df[col].value_counts(normalize=True).get(1, 0.0) * 100
    values_pct.append(pct)
    
# Create bar plot

plt.figure(figsize=(7,5))
p = sns.barplot(x=response_col, y=values_pct, color='#035397')
show_values(p)
plt.xlabel('Response Types')
plt.ylabel('Percentages')
plt.title('Percentages of Response Types')
plt.xticks(fontsize=5)
plt.yticks(fontsize=5)
plt.show()

In [None]:
df['BalancedInterest'].value_counts(normalize=True)[1]*100

### Bigrams and trigrams for titles

In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Ref: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

# Lemmatize with POS Tag
from nltk.corpus import wordnet
# nltk.download('averaged_perceptron_tagger')

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word`` to pass to lemmatize().

    The word is tagged on its own with ``nltk.pos_tag`` and the upper-cased
    first letter of its tag selects the constant: J -> adjective,
    V -> verb, R -> adverb. N and every other letter fall back to noun.
    """
    initial = nltk.pos_tag([word])[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and any unmapped tag are both treated as nouns.
    return wordnet.NOUN

In [None]:
def preprocess(text):
    """Normalise raw English text into a space-separated string of lemmas.

    The text is tokenised into words, lower-cased, stripped of every
    character in ``string.punctuation``, filtered down to purely alphabetic
    tokens, cleared of NLTK's English stop words and lemmatised using the
    part of speech reported by ``get_wordnet_pos`` for each word.

    Parameters
    ----------
    text : str or any
        Raw text to clean. Anything that is not a string (for example a
        missing value) is treated as having no text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; ``""`` for non-string
        input or when nothing survives the filters.
    """
    # Non-text cells cannot be tokenised; report them as empty text instead
    # of catching whatever TypeError the pipeline below might raise.
    if not isinstance(text, str):
        return ""

    # Tokenise into words and lower-case them.
    tokens = [w.lower() for w in word_tokenize(text)]

    # Delete punctuation characters inside each token.
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]

    # Drop tokens that are empty or contain anything other than letters.
    words = [word for word in stripped if word.isalpha()]

    # Filter out English stop words.
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]

    # Lemmatise each word with its own POS tag.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]

    # Join the tokens back into a string.
    return ' '.join(lemmatized_tokens)

In [None]:
df['Title_nltk'] = df['WebpageTitle'].apply(lambda x: preprocess(x))

In [None]:
df['Title_nltk'].head()

In [None]:
from nltk.util import ngrams

def _ngram_lists(column, n):
    """Return, for each row of ``df[column]``, the list of its word n-grams."""
    def to_ngrams(sentence):
        # Split on whitespace and materialise the n-gram generator.
        return list(ngrams(sentence.split(), n))

    return df[column].apply(to_ngrams)


def bigrams_convert(column, n=2):
    """Store the n-grams (default: bigrams) of ``df[column]`` in ``df``.

    The result is written to the column ``'bigrams_' + column`` of the
    notebook-level ``df``; nothing is returned.
    """
    grams = _ngram_lists(column, n)
    df['bigrams_' + column] = grams


def trigrams_convert(column, n=3):
    """Store the n-grams (default: trigrams) of ``df[column]`` in ``df``.

    The result is written to the column ``'trigrams_' + column`` of the
    notebook-level ``df``; nothing is returned.
    """
    grams = _ngram_lists(column, n)
    df['trigrams_' + column] = grams

In [None]:
bigrams_convert('Title_nltk')
trigrams_convert('Title_nltk')

In [None]:
df.head()

In [None]:
from collections import Counter

bigrams = []
for row in df['bigrams_Title_nltk']:
    bigrams.extend(row)
    
print(f'Count of bigrams: {len(bigrams)}')
print("\n")
    
# Count the frequency of each bigram
bigram_frequency = Counter(bigrams)

# Print the frequency of each bigram
# for bigram, frequency in bigram_frequency.items():
#     print(bigram, frequency)
    
# Organize elements by frequency using most_common()
bigrams_organized_by_frequency = bigram_frequency.most_common()

# Print the elements organized by frequency
for element, frequency in bigrams_organized_by_frequency:
    print(element, frequency)

In [None]:
import seaborn as sns

bigrams_organized_by_frequency_10 = bigram_frequency.most_common(5)

# Print the elements organized by frequency
# print('Top 10 trigrams')
bi = []
freq = []

for element, frequency in bigrams_organized_by_frequency_10:
    bigram = ' '.join(element)
    print(f'Bigram: {bigram} - Frequency: {frequency}')
    bi.append(bigram)
    freq.append(frequency)


df_bi = pd.DataFrame({'Bigram': bi, 'Count': freq})

plt.figure(figsize=(7, 5))
ax = sns.barplot(x=df_bi['Bigram'].values, y=df_bi['Count'].values)

# Display the count on each bar
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 5), textcoords = 'offset points')

plt.ylabel('Count')
plt.title('Top 5 Most Frequently Occuring Bigrams')

plt.xticks(rotation=70)
plt.tight_layout()
plt.show()

In [None]:
trigrams = []
for row in df['trigrams_Title_nltk']:
    trigrams.extend(row)
    
print(f'Count of trigrams: {len(trigrams)}')
print("\n")
    
# Count the frequency of each trigram
trigram_frequency = Counter(trigrams)

# Organize elements by frequency using most_common()
trigrams_organized_by_frequency = trigram_frequency.most_common()

# Print the elements organized by frequency
for element, frequency in trigrams_organized_by_frequency:
    print(element, frequency)

In [None]:
trigrams_organized_by_frequency_10 = trigram_frequency.most_common(5)

# Print the elements organized by frequency
# print('Top 10 trigrams')
tri = []
freq = []

for element, frequency in trigrams_organized_by_frequency_10:
    trigram = ' '.join(element)
#     print(f'Trigram: {trigram} - Frequency: {frequency}')
    tri.append(trigram)
    freq.append(frequency)
    
df_tri = pd.DataFrame({'Trigram': tri, 'Count': freq})

plt.figure(figsize=(7, 5))
ax = sns.barplot(x=df_tri['Trigram'].values, y=df_tri['Count'].values)

# Display the count on each bar
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'center', xytext = (0, 5), textcoords = 'offset points')

plt.ylabel('Count')
plt.title('Top 5 Most Frequently Occuring Trigrams')

plt.xticks(rotation=70)
plt.tight_layout()
plt.show()

In [None]:
# Calculate value counts
value_counts = df['Author'].value_counts(dropna=False)

# Create bar plot
plt.figure(figsize=(9, 5))
p = sns.barplot(x=value_counts.index, y=value_counts.values, color='#035397')
show_values(p)
plt.xlabel('Author')
plt.ylabel('Frequency')
plt.title('Authors of Statements')
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.show()

In [None]:
df['Author'].value_counts(dropna=False)

In [None]:
# df.to_excel('output.xlsx', index=False)

In [None]:
# df.drop(['CleanStatement2'], axis=1, inplace=True)

In [None]:
# initialize NLTK sentiment analyzer
# nltk.download('vader_lexicon')
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# analyzer = SentimentIntensityAnalyzer()

# # create get_sentiment function
# def get_sentiment(text):
#     scores = analyzer.polarity_scores(text)
#     sentiment = 1 if scores['pos'] > 0 else 0
#     return sentiment