In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import re
# only need to do these one time
# nltk.download("punkt")
# nltk.download("stopwords")

In [43]:
# Load the whole document as one string.
# NOTE(review): no encoding is passed to open(), so the platform default is used —
# confirm that it matches how the file was written.
with open("preprocessed/social media.txt", "r") as file:
    raw_text = file.read()
# The with-block has already closed the file, so no explicit close() is needed.

# Pre-processing before sentence tokenizing (more can be done after tokenizing).
# All patterns are raw strings so that regex escapes such as \. and \w are not
# treated as (invalid) Python string escapes.

# Remove any instance of [<number>], as some papers use this as a way of citing.
clean_text = re.sub(r"\[\d+\]", "", raw_text)

# Remove in-text citations following any of these formats:
# (Smith & Johnson, 2019), (Smith, 2019), (Smith et al., 2019),
# (Smith & Johnson, 2019; James, 2019)
clean_text = re.sub(r"\s\((?:(?:[\w \.&]+\, )+[0-9]{4}[;|:]*\s*)+\)", "", clean_text)

# Drop the trailing period of common abbreviations (e.g. "Smith et al. (2018) said ..."),
# because those periods cause sentences to be split where they shouldn't be.
clean_text = re.sub(r"(et al\.)", "et al", clean_text)
clean_text = re.sub(r"(e\.g\.)", "e.g", clean_text)
clean_text = re.sub(r"(i\.e\.)", "i.e", clean_text)
clean_text = re.sub(r"(etc\.)", "etc", clean_text)
clean_text = re.sub(r"(Fig\.)", "Fig", clean_text)
# "Table <word>" loses the word that follows it (typically the table number).
clean_text = re.sub(r"(Table \w+)", "Table", clean_text)

# Remove grouped e-mail addresses such as {a.b, c.d}@uni.edu.
# The braces are matched non-greedily (no nested braces, no line breaks) so two
# groups on the same line do not swallow the text between them.
clean_text = re.sub(r"\{[^{}\n]+\}@\S+", "", clean_text)

# Remove URLs. This runs before number removal so that URLs containing digits
# (for example numeric hosts) are still intact when the pattern is applied.
clean_text = re.sub(r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#“”"-]*[\w@?^=%&\/~+“”"#-])', " ", clean_text)
clean_text = re.sub(r"www\.\S+", "", clean_text)

# Remove numbers, including decimals.
clean_text = re.sub(r"\d+(\.[0-9]+)*", "", clean_text)

# Collapse any additional white space (e.g: "I like      cats   .").
clean_text = re.sub(" +", " ", clean_text)

# Turn the chunk of text into a list of sentences.
# Some sentences aren't quite right: if text got extracted that wasn't supposed to
# be there, it may get squished into a neighbouring sentence. That is the nature of
# a trained (ML) sentence splitter; it is not always going to be 100% accurate.
clean_sentences = sent_tokenize(clean_text)
print(len(clean_sentences))


319


In [70]:
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# Build a Punkt sentence tokenizer whose parameters carry a hand-picked set of
# abbreviation types (written lower-case and without the final period).
punkt_param = PunktParameters()
punkt_param.abbrev_types = {
    'dr', 'vs', 'mr', 'mrs', 'prof', 'inc',
    'al', 'etc', 'e.g', 'i.e', 'fig',
}
sentence_splitter = PunktSentenceTokenizer(punkt_param)

# Read the whole document into a single string; the with-block closes the file.
with open("preprocessed/variations.txt", "r") as file:
    raw_text = file.read()

# Collapse runs of spaces, then split the text into sentences.
raw_sentences = sentence_splitter.tokenize(
    re.sub(" +", " ", raw_text)
)
print(len(raw_sentences))

85


In [45]:
# Report every cleaned sentence that does not appear verbatim among the raw
# sentences, next to the raw sentence at the same position.
# NOTE(review): the positional comparison assumes both lists were produced from the
# same document — confirm, since the cells above read different files.
raw_sentence_lookup = set(raw_sentences)  # constant-time membership checks
for i, clean_sentence in enumerate(clean_sentences):
    if clean_sentence in raw_sentence_lookup:
        continue
    print("Sentence", i)
    print("Clean sentence:", clean_sentence)
    print()
    # The two lists can differ in length, so guard the positional lookup instead
    # of raising IndexError part-way through the report.
    if i < len(raw_sentences):
        print("Raw sentence:", raw_sentences[i])
    else:
        print("Raw sentence:", "<no raw sentence at this index>")
    print("-----\n")


Sentence 1
Clean sentence: * Department of Humanities, Social Sciences and Cultural Industries, University of Parma, Borgo Carissimi , , Parma, Italy > Institute of Psychology, University of Lausanne, Géopolis, CH-, Lausanne, Switzerland © Centre for Excessive Gambling, Addiction Medicine, Lausanne University Hospitals (CHUV), CH-, Lausanne, Switzerland University of Sydney, Faculty of Medicine and Health, Sydney Medical School, Nepean Clinical School, PO Box , Penrith, NSW, , Australia © Faculty of Human and Social Sciences, UKE - Kore University of Enna, Cittadella Universitaria, , Enna, Italy meyers.

Raw sentence: * Department of Humanities, Social Sciences and Cultural Industries, University of Parma, Borgo Carissimi 10, 43121, Parma, Italy > Institute of Psychology, University of Lausanne, Géopolis, CH-1015, Lausanne, Switzerland © Centre for Excessive Gambling, Addiction Medicine, Lausanne University Hospitals (CHUV), CH-1004, Lausanne, Switzerland 4 University of Sydney, Facult

In [50]:
# Optional step (currently disabled): remove punctuation and make all letters lowercase
#clean_sentences = [re.sub(r'[^\w\s]','',sentence.lower()) for sentence in raw_sentences]

stop_words = stopwords.words('english')

# Remove stop words (using the list of stop words from NLTK) and return a list of
# lists, with each inner list containing the remaining tokens of one sentence.
# NLTK's English stop words are lower-case, so each token is lower-cased for the
# comparison only; otherwise capitalised stop words ("The", "It") would be kept.
# The tokens themselves keep their original case.
stop_word_lookup = set(stop_words)  # set membership instead of scanning a list
sentence_tokens = [
    [token for token in word_tokenize(sentence) if token.lower() not in stop_word_lookup]
    for sentence in clean_sentences
]
print(sentence_tokens)

[['alessandro', 'musetti', 'tommaso', 'manari', 'joél', 'billieux', 'vladan', 'starcevic', 'adriano', 'schimmenti'], ['department', 'humanities', 'social', 'sciences', 'cultural', 'industries', 'university', 'parma', 'borgo', 'carissimi', '10', '43121', 'parma', 'italy', 'institute', 'psychology', 'university', 'lausanne', 'géopolis', 'ch1015', 'lausanne', 'switzerland', 'centre', 'excessive', 'gambling', 'addiction', 'medicine', 'lausanne', 'university', 'hospitals', 'chuv', 'ch1004', 'lausanne', 'switzerland', '4', 'university', 'sydney', 'faculty', 'medicine', 'health', 'sydney', 'medical', 'school', 'nepean', 'clinical', 'school', 'po', 'box', '63', 'penrith', 'nsw', '2751', 'australia', 'faculty', 'human', 'social', 'sciences', 'uke', 'kore', 'university', 'enna', 'cittadella', 'universitaria', '94100', 'enna', 'italy', 'meyers'], ['social', 'networks', 'problematic', 'use', 'social', 'networks', 'attachment', 'attachment', 'anxiety', 'attachment', 'avoidance', 'systematic', 'revi

In [74]:
def get_clean_line(sentence: str) -> str:
    """Strip citations, e-mail groups, URLs and numbers from a piece of text.

    The steps run in this order:
      1. remove numeric citation markers such as ``[12]``;
      2. remove parenthesised in-text citations such as ``(Smith & Johnson, 2019)``;
      3. drop the trailing period of "et al.", "e.g.", "i.e.", "etc." and "Fig.",
         and reduce "Table <word>" to "Table";
      4. remove grouped e-mail addresses such as ``{a.b, c.d}@uni.edu``;
      5. remove URLs (``http``/``https``/``ftp`` and bare ``www.`` forms);
      6. remove numbers, including decimals;
      7. collapse runs of spaces into a single space.

    Numbers are removed after URLs so that URLs containing digits are still intact
    when the URL pattern is applied.

    Args:
        sentence: The text to clean.

    Returns:
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    # All patterns are raw strings so regex escapes such as \. and \w are not
    # interpreted as (invalid) Python string escapes.

    # [<number>] is used by some papers as a way of citing.
    clean_text = re.sub(r"\[\d+\]", "", sentence)

    # In-text citations following any of these formats:
    # (Smith & Johnson, 2019), (Smith, 2019), (Smith et al., 2019),
    # (Smith & Johnson, 2019; James, 2019)
    clean_text = re.sub(r"\s\((?:(?:[\w \.&]+\, )+[0-9]{4}[;|:]*\s*)+\)", "", clean_text)

    # Abbreviation periods (e.g. "Smith et al. (2018) said that....") cause
    # sentences to get split up where they aren't supposed to, so drop them.
    clean_text = re.sub(r"(et al\.)", "et al", clean_text)
    clean_text = re.sub(r"(e\.g\.)", "e.g", clean_text)
    clean_text = re.sub(r"(i\.e\.)", "i.e", clean_text)
    clean_text = re.sub(r"(etc\.)", "etc", clean_text)
    clean_text = re.sub(r"(Fig\.)", "Fig", clean_text)
    clean_text = re.sub(r"(Table \w+)", "Table", clean_text)

    # Grouped e-mail addresses. The brace group is matched without nested braces
    # or line breaks, so two groups on one line do not swallow the text between
    # them (a greedy ".+" would).
    clean_text = re.sub(r"\{[^{}\n]+\}@\S+", "", clean_text)

    # URLs: full scheme form first, then bare "www." forms.
    clean_text = re.sub(r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#“”"-]*[\w@?^=%&\/~+“”"#-])', " ", clean_text)
    clean_text = re.sub(r"www\.\S+", "", clean_text)

    # Numbers, including decimals.
    clean_text = re.sub(r"\d+(\.[0-9]+)*", "", clean_text)

    # Additional white space (e.g: "I like      cats   .").
    clean_text = re.sub(" +", " ", clean_text)
    return clean_text

In [76]:
# Run every raw sentence through the cleaning helper, keeping the original order.
clean_sentences = [get_clean_line(raw_sentence) for raw_sentence in raw_sentences]

# Optional follow-up steps, currently disabled:
#clean_sentences = [re.sub(r'[^\w\s]','',sentence.lower()) for sentence in clean_sentences]
# stop_words = stopwords.words('english')

# # Removes stop words (using the list of stop words from NLTK) and returns
# # A list of lists, with each list containing the words in each sentence
# sentence_tokens = [[words for words in word_tokenize(sentence) if words not in 
#                     stop_words] for sentence in clean_sentences]

# Show the raw (uncleaned) sentences.
print(raw_sentences)

['Federico Barrios!, Federico Lépez!, Luis Argerich!, Rosita Wachenchauzer!?', 'Abstract.', 'This article presents new alternatives to the similarity function for the TextRank algorithm for automated summarization of texts.', 'We describe the generalities of the algorithm and the different functions we propose.', 'Some of these variants achieve a significative improvement using the same metrics and dataset as the original publication.', 'Keywords: TextRank variations, automated summarization, Information Retrieval ranking functions In the field of natural language processing, an extractive summarization task can be described as the selection of the most important sentences in a document.', 'Using different levels of compression, a summarized version of the document of arbitrary length can be obtained.', 'TextRank is a graph-based extractive summarization algorithm.', 'It is domain and language independent since it does not require deep linguistic knowledge, nor domain or language speci