# Data Preprocessing

In [1]:
#uncomment this if you don't have the modules installed
#!pip install word2number 
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [2]:
#importing modules
import pandas as pd
import os

In [3]:
# Load the scraped articles into a DataFrame.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# suggests the CSV was written together with its index; reading it with
# index_col=0 would avoid that column — confirm nothing else relies on it.
articles = pd.read_csv('../../data/articles.csv')

In [4]:
#inspecting data
articles.head()

Unnamed: 0,Title,URL,Text
0,A step toward safe and reliable autopilots for...,/2023/safe-and-reliable-autopilots-flying-0612,"\nIn the film “Top Gun: Maverick,” Maverick, p..."
1,Three Spanish MIT physics postdocs receive Bot...,/2023/botton-foundation-fellowships-spanish-mi...,"\nThree Spanish MIT postdocs, Luis Antonio Ben..."
2,Bringing the social and ethical responsibiliti...,/2023/bringing-social-ethical-responsibilities...,\nThere has been a remarkable surge in the use...
3,New model offers a way to speed up drug discov...,/2023/new-model-offers-speedy-drug-discovery-0608,\nHuge libraries of drug compounds may hold po...
4,MIT researchers make language models scalable ...,/2023/language-models-scalable-self-learners-0608,\nSocrates once said: “It is not the size of a...


In [5]:
#let's look at our articles
articles['Text'][0]

'\nIn the film “Top Gun: Maverick,” Maverick, played by Tom Cruise, is charged with training young pilots to complete a seemingly impossible mission — to fly their jets deep into a rocky canyon, staying so low to the ground they cannot be detected by radar, then rapidly climb out of the canyon at an extreme angle, avoiding the rock walls. Spoiler alert: With Maverick’s help, these human pilots accomplish their mission.\nA machine, on the other hand, would struggle to complete the same pulse-pounding task. To an autonomous aircraft, for instance, the most straightforward path toward the target is in conflict with what the machine needs to do to avoid colliding with the canyon walls or staying undetected. Many existing AI methods aren’t able to overcome this conflict, known as the stabilize-avoid problem, and would be unable to reach their goal safely.\nMIT researchers have developed a new technique that can solve complex stabilize-avoid problems better than other methods. Their machine-

### We can see that the text contains non-ASCII Unicode characters (e.g. curly quotes and dashes). Let's check which other characters outside Basic Latin occur in the articles.

In [6]:
#checking unicode char > 127
def find_nonlatin_chars(list_of_text):
    """
    Collects every distinct non-ASCII character that occurs in a collection of texts.

    A character counts as non-ASCII when its code point (``ord``) is greater
    than 127. The characters are gathered in a set, which removes duplicates,
    so the order of the returned list is arbitrary.

    Parameters:
    list_of_text (iterable of str): Texts to scan character by character.

    Returns:
    list of str: The distinct characters whose code point is greater than 127.
    """
    return list({char for text in list_of_text for char in text if ord(char) > 127})

In [7]:
# NOTE: despite its name, article_list is whatever articles['Text'] returns
# (a column selection), not a list built here; find_nonlatin_chars only
# iterates over it.
article_list = articles['Text']  
unicode_chars = find_nonlatin_chars(article_list)
print(unicode_chars)

['ã', '\xa0', 'é', 'ą', '…', 'á', '‘', 'ü', 'λ', '·', 'ð', '―', 'Â', 'â', '¨', 'Ö', 'š', 'ä', '–', '“', 'É', '\u202f', '\u2009', 'ë', '¶', 'ï', 'Ã', 'í', '¿', '§', 'ñ', 'ö', 'Á', 'ğ', '•', 'ﬂ', 'Ì', 'Å', '\u200b', 'è', '½', 'ú', '”', 'Ş', 'č', '¦', 'ć', 'ç', '\u2028', '©', '€', '’', 'à', '—', 'ó', '\xad', 'å', 'ê']


### We do have some non-ASCII characters, so we will normalize them. Normalization in this context means transliterating characters into their closest Basic Latin equivalents. Example: 'über' becomes 'uber', i.e. we change ü -> u.

In [8]:
#normalizing nonlatin char
import unidecode

def normalize_nonlatin(list_of_text):
    """
    Passes every text through ``unidecode.unidecode`` to transliterate it.

    Parameters:
    list_of_text (iterable of str): Texts that may contain non-ASCII characters.

    Returns:
    list of str: One transliterated text per input text, in the same order.
    """
    normalized = []
    for text in list_of_text:
        normalized.append(unidecode.unidecode(text))
    return normalized


In [9]:
nltxt = normalize_nonlatin(article_list)

### Now let's build functions that search for characters in a list of our texts.

In [10]:
def starts_with(list_of_text, char):
    """
    Returns a list of texts that start with the given character.

    Empty texts have no first character, so they are skipped instead of
    raising an IndexError.

    Parameters:
    list_of_text (list of str): List of texts.
    char (str): Character to match at the start of the text.

    Returns:
    list of str: Texts starting with the specified character, in input order.
    """
    # `txt and ...` guards the index access against empty strings.
    return [txt for txt in list_of_text if txt and txt[0] == char]


def contains_char(list_of_text, char):
    """
    Selects the texts in which the given character occurs.

    The check uses Python's ``in`` operator, so a multi-character ``char``
    is matched as a substring.

    Parameters:
    list_of_text (list of str): Texts to search.
    char (str): Character (or substring) to look for.

    Returns:
    list of str: The texts that contain ``char``, in their original order.
    """
    return list(filter(lambda txt: char in txt, list_of_text))

### Before we use our new functions, we will implement tokenization for easier EDA (Exploratory Data Analysis). Tokenization is the process of splitting text into smaller parts. In our case these will be words, but tokens can be a bunch of things, like sentences, n-grams, characters...

In [11]:
import nltk
#nltk.download('punkt')  #uncomment this line if you don't have it installed

def distinct_tokens(documents):
    """
    Builds the vocabulary of a corpus: every distinct token that
    ``nltk.word_tokenize`` produces for any of the documents.

    Parameters:
    documents (list of str): Documents to tokenize.

    Returns:
    list: The unique tokens across all documents. They are collected in a
    set, so the order of the list is arbitrary.
    """
    vocabulary = {
        token
        for document in documents
        for token in nltk.word_tokenize(document)
    }
    return list(vocabulary)


In [12]:
dtokens_nltxt = distinct_tokens(nltxt)
contains_char(dtokens_nltxt, '.')

['behaviors.The',
 'beat.Packed',
 'CSAIL.Achieving',
 'fundraising.Q',
 'mobility.Other',
 'N.',
 'outcome.The',
 'movement.To',
 'estimation.Then',
 'time.Describing',
 'enough.Negative',
 '.Representative',
 'recalled.After',
 'Award.Minsky',
 'tasks.Sze',
 'counterpart.The',
 'like.That',
 'disaster.Over',
 'features.This',
 'said.MIT-Manus',
 '2.680',
 '6.S082/6.888',
 'J.K.',
 'solutions.Often',
 'taken.Joining',
 'writes.In',
 'cars.But',
 'mission.To',
 'say.Ilker',
 '7.5',
 '16.89',
 'problems.The',
 '1972.Commenting',
 'collisions.In',
 'study.He',
 'vs.',
 'architecture.This',
 'says.Last',
 'potential.The',
 'researchers.In',
 '14.01',
 'curriculum.One',
 'labor.Compared',
 'STS.047',
 '//www.bell-labs.com',
 'factors.Traditionally',
 'forcing.I',
 'says.One',
 'Adobe.Bring',
 'provided.The',
 'operable.One',
 'said.Lucent',
 'industry.This',
 'devices.Companies',
 'briefly.Still',
 'explains.Each',
 'competition.High',
 'intervention.The',
 'W.M',
 'performance.In',
 'laug

### This helped us catch that some sentences are not separated by a space. Let's fix it!

In [13]:
# First, let's see what an article with missing separation looks like:
# print every normalized text that contains the run-together "said.Key".
for txt in nltxt:
    if 'said.Key' in txt:
        print(txt)


Consider the fish: highly maneuverable and an effortless swimmer, this animal 160 million years in the making is superbly adapted to its watery environs. Now, in work that could lead to miniature submarines with similar attributes, MIT engineers have developed the first robotic version of nature's piscine wonder.In mid July the researchers' creation, patterned after a bluefin tuna, took its maiden swim down the MIT testing tank. That swim and others since have been flawless, reinforcing the engineers' belief that the Lycra-sheathed robot could become an important tool toward understanding the physics of swimming and more.The "robotuna" project began about three years ago with the overall goal of developing a better propulsion system for autonomous underwater vehicles, or AUVs, said Michael S. Triantafyllou, a professor of ocean engineering who is leading the research team. AUVs are small robotic submarines that have great potential for mapping the ocean floor, finding the sources of u

### The issue seems to come from the way the text was separated into paragraphs. Check this article yourself and compare with what we have: https://news.mit.edu/1994/robotuna-0921.

In [14]:
import re

# Add a space after a period, question mark, or exclamation mark if not already present
def add_space(list_of_texts):
    """
    Inserts a single space after every '.', '?' or '!' that is immediately
    followed by a non-whitespace character.

    Punctuation at the very end of a text, or already followed by whitespace,
    is left untouched. The rule is purely textual, so it also applies inside
    decimals and abbreviations (e.g. '7.5' becomes '7. 5').

    Parameters:
    list_of_texts (list of str): Input list of text strings.

    Returns:
    list of str: Texts with the missing spaces after punctuation added.
    """
    # Zero-width match: the lookbehind requires one of the punctuation marks,
    # the lookahead requires a following non-whitespace character.
    missing_space = re.compile(r'(?<=[\.\?\!])(?=[^\s])')
    return [missing_space.sub(r' ', text) for text in list_of_texts]

In [15]:
spaced_txt = add_space(nltxt)
dtokens_spaced_txt = distinct_tokens(spaced_txt)
contains_char(dtokens_spaced_txt, '.')

['product.',
 'graduate.',
 'N.',
 'prompt.',
 'startups.',
 'backgrounds.',
 'living.',
 'Papertowns.',
 'over.',
 'curriculum.',
 'ambassador.',
 'haystack.',
 'vs.',
 'inferences.',
 'source.',
 'laser.',
 'expression.',
 'flow.',
 'g.',
 'advances.',
 'feedback.',
 'box.',
 'goals.',
 'created.',
 "doesn't.",
 'languages.',
 'creating.',
 'parents.',
 'way.',
 'lock.',
 'fish.',
 'accountable.',
 'exploding.',
 'leveraging.',
 'significantly.',
 'cockroaches-Dr.',
 'result.',
 'together.',
 'docking.',
 'privacy.',
 'pool.',
 'less.',
 'behavior.',
 'customers.',
 'brain.',
 'hospitals.',
 '1.',
 'model.',
 'amazing.',
 'Charles.',
 'then.',
 'made.',
 'bilinguals.',
 'rucksacks.',
 'equation.',
 'great.',
 'Core.',
 'Oct.',
 'agents.',
 'of.',
 'computation.',
 'life.',
 'rhinoceros.',
 'snack.',
 'reward.',
 'worldwide.',
 'thing.',
 'biases.',
 'sector.',
 'property.',
 'labels.',
 'case.',
 'America.',
 'Quest.',
 'Inc.',
 'Odyssey.',
 'become.',
 'generation.',
 'injustices.',

### Great! Looks like we now have words properly separated. Now let's remove unnecessary characters.

In [16]:
def remove_non_alpha(list_of_words):
    """
    Keeps only letters and whitespace in each string and lower-cases the letters.

    Every character that is neither alphabetic (``str.isalpha``) nor
    whitespace (``str.isspace``) is dropped, so digits, punctuation, hyphens
    and apostrophes disappear without leaving a gap.

    Parameters:
    list_of_words (list): A list of strings (words or whole texts) to be processed.

    Returns:
    list: One cleaned string per input string, containing only lower-case
    letters and whitespace.
    """
    def _clean(word):
        # Lower-casing is done per character, after the filter decision.
        kept = [ch.lower() for ch in word if ch.isalpha() or ch.isspace()]
        return ''.join(kept)

    return [_clean(word) for word in list_of_words]


In [17]:
noa_txt = remove_non_alpha(spaced_txt)
dtokens_noa_txt = distinct_tokens(noa_txt)
# Run the check on the distinct tokens (like the other '.' checks in this
# notebook) rather than on the whole documents: an empty list confirms that
# no '.' survived the cleaning, and a hit would show a token, not an article.
contains_char(dtokens_noa_txt, '.')

[]

In [18]:
noa_txt

['\nin the film top gun maverick maverick played by tom cruise is charged with training young pilots to complete a seemingly impossible mission  to fly their jets deep into a rocky canyon staying so low to the ground they cannot be detected by radar then rapidly climb out of the canyon at an extreme angle avoiding the rock walls spoiler alert with mavericks help these human pilots accomplish their mission\na machine on the other hand would struggle to complete the same pulsepounding task to an autonomous aircraft for instance the most straightforward path toward the target is in conflict with what the machine needs to do to avoid colliding with the canyon walls or staying undetected many existing ai methods arent able to overcome this conflict known as the stabilizeavoid problem and would be unable to reach their goal safely\nmit researchers have developed a new technique that can solve complex stabilizeavoid problems better than other methods their machinelearning approach matches or 

### Now let's remove the newline element (\n).

In [19]:
def replace_in_documents(list_of_replacements, list_of_text):
    """
    Replaces every match of each regular expression with a single space in
    each text.

    All patterns are applied one after another to every text, so the result
    always contains exactly one processed text per input text (also when no
    pattern or several patterns are given).

    Parameters:
    list_of_replacements (list): Regular expression patterns (strings) whose
        matches are replaced by ' '.
    list_of_text (list): List of texts to be processed.

    Returns:
    list: The processed texts, in the same order as the input texts.
    """
    processed_texts = []
    for text in list_of_text:
        for pattern in list_of_replacements:
            # Feed the result of one substitution into the next one so that
            # every pattern is applied to the same text.
            text = re.sub(pattern, ' ', text)
        processed_texts.append(text)
    return processed_texts

In [20]:
replaced_txt = replace_in_documents(['\n'], noa_txt)
print(replaced_txt[0])

 in the film top gun maverick maverick played by tom cruise is charged with training young pilots to complete a seemingly impossible mission  to fly their jets deep into a rocky canyon staying so low to the ground they cannot be detected by radar then rapidly climb out of the canyon at an extreme angle avoiding the rock walls spoiler alert with mavericks help these human pilots accomplish their mission a machine on the other hand would struggle to complete the same pulsepounding task to an autonomous aircraft for instance the most straightforward path toward the target is in conflict with what the machine needs to do to avoid colliding with the canyon walls or staying undetected many existing ai methods arent able to overcome this conflict known as the stabilizeavoid problem and would be unable to reach their goal safely mit researchers have developed a new technique that can solve complex stabilizeavoid problems better than other methods their machinelearning approach matches or excee

### Let's standardize numbers in our text. We want to check whether numbers occur in their textual form (Example: 'Two') and change them to their numerical form (Example: 2).

In [21]:
#let's check if we have any numerical words
# Only the first 10 articles are inspected.
# NOTE: `in` on strings is a substring test, so this also prints texts where
# "one" merely appears inside another word (e.g. "done").
for txt in replaced_txt[0:10]:
    if 'one' in txt:
        print(txt)

 in the film top gun maverick maverick played by tom cruise is charged with training young pilots to complete a seemingly impossible mission  to fly their jets deep into a rocky canyon staying so low to the ground they cannot be detected by radar then rapidly climb out of the canyon at an extreme angle avoiding the rock walls spoiler alert with mavericks help these human pilots accomplish their mission a machine on the other hand would struggle to complete the same pulsepounding task to an autonomous aircraft for instance the most straightforward path toward the target is in conflict with what the machine needs to do to avoid colliding with the canyon walls or staying undetected many existing ai methods arent able to overcome this conflict known as the stabilizeavoid problem and would be unable to reach their goal safely mit researchers have developed a new technique that can solve complex stabilizeavoid problems better than other methods their machinelearning approach matches or excee

In [22]:
from word2number import w2n

def words_to_num(list_of_text):
    """
    Replaces number words (like 'one', 'two', etc.) in each document with
    their numeric representation.

    Every document is split on whitespace and each word is converted on its
    own with ``w2n.word_to_num``; words that cannot be converted stay as they
    are. The words are then re-joined with single spaces, so leading,
    trailing and repeated whitespace is not preserved.

    Parameters:
    list_of_text (list): A list of documents (strings), each of which may
        contain number words.

    Returns:
    list: The documents with number words replaced by their digits, in the
    same order as the input.
    """
    def _as_number(word):
        # A ValueError from word_to_num means the word is not a number word;
        # in that case the word is kept unchanged.
        try:
            return str(w2n.word_to_num(word))
        except ValueError:
            return word

    return [
        " ".join(_as_number(word) for word in doc.split())
        for doc in list_of_text
    ]


In [23]:
wtn_txt = words_to_num(replaced_txt)
dtokens_wtn_txt = distinct_tokens(wtn_txt)
contains_char(dtokens_wtn_txt, 'two')

['neuralnetworkbased',
 'twofingered',
 'neuralnetwork',
 'sixtwoseventy',
 'twocharacter',
 'twocultures',
 'atwood',
 'networks',
 'qnetwork',
 'subnetwork',
 'twoyearold',
 'twoweek',
 'networksecurity',
 'neuralnetworkmodel',
 'twolegged',
 'twofinger',
 'artworks',
 'twowheeled',
 'twoword',
 'twophase',
 'twoarmed',
 'atwoods',
 'twoyear',
 'twohour',
 'twostory',
 'twonode',
 'twometerlong',
 'networkbased',
 'westworld',
 'twoway',
 'seventytwo',
 'twomonth',
 'twometer',
 'twofold',
 'twoandahalf',
 'twodecade',
 'twodimensional',
 'networking',
 'network',
 'robotworld',
 'twoweeklong',
 'twobyfours',
 'networked',
 'artwork',
 'twopart',
 'mitwoods',
 'twolane',
 'twostage',
 'nextword',
 'untrustworthy',
 'twomonthold',
 'twoneuron',
 'trustworthy',
 'twoplayer',
 'twothirds',
 'twominute',
 'twonight',
 'subnetworks',
 'twoday',
 'trustworthiness']

### Stemming / lemmatization is the process of reducing a word to its base form. For this task we will use lemmatization. Lemmatization converts a word into its lemma (its base form). Example: 'am', 'are', 'is' -> 'be'. This is used to normalize the text and improve our further analysis. Before lemmatizing, we tokenize the text and remove common English stopwords.

In [24]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NLTK has a list of common stopwords in English that you can use
stop_words = set(stopwords.words('english'))

# Function to tokenize and remove stopwords from a single text
def process_text(text):
    """
    Tokenizes a text with ``word_tokenize`` and drops every token that is in
    ``stop_words``.

    The stopword test is an exact set-membership check, so a token is only
    removed when it matches a stopword character for character.

    Parameters:
    text (str): The input text to be processed.

    Returns:
    list: The remaining tokens, in their original order.
    """
    return [token for token in word_tokenize(text) if token not in stop_words]



In [25]:
processed_texts = [process_text(text) for text in wtn_txt]

print(processed_texts[0])


['film', 'top', 'gun', 'maverick', 'maverick', 'played', 'tom', 'cruise', 'charged', 'training', 'young', 'pilots', 'complete', 'seemingly', 'impossible', 'mission', 'fly', 'jets', 'deep', 'rocky', 'canyon', 'staying', 'low', 'ground', 'detected', 'radar', 'rapidly', 'climb', 'canyon', 'extreme', 'angle', 'avoiding', 'rock', 'walls', 'spoiler', 'alert', 'mavericks', 'help', 'human', 'pilots', 'accomplish', 'mission', 'machine', 'hand', 'would', 'struggle', 'complete', 'pulsepounding', 'task', 'autonomous', 'aircraft', 'instance', 'straightforward', 'path', 'toward', 'target', 'conflict', 'machine', 'needs', 'avoid', 'colliding', 'canyon', 'walls', 'staying', 'undetected', 'many', 'existing', 'ai', 'methods', 'arent', 'able', 'overcome', 'conflict', 'known', 'stabilizeavoid', 'problem', 'would', 'unable', 'reach', 'goal', 'safely', 'mit', 'researchers', 'developed', 'new', 'technique', 'solve', 'complex', 'stabilizeavoid', 'problems', 'better', 'methods', 'machinelearning', 'approach', 

In [26]:
import spacy

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

def lemmatize_documents(documents):
    """
    Lemmatizes a list of tokenized documents with the loaded spaCy pipeline.

    Each document's tokens are joined with spaces into one string, processed
    by spaCy, and replaced by the lemmas of the tokens spaCy finds. spaCy
    tokenizes the joined string itself, so the output is not guaranteed to
    have one lemma per input token (in this notebook 'arent' comes back as
    'be', 'not').

    Parameters:
    documents (list): The list of documents to be lemmatized. Each document
        is represented as a list of tokens.

    Returns:
    lemmatized_documents (list): One list of lemmas per input document, in
        the same order.
    """
    # spaCy works on raw text, so re-join each token list into one string.
    texts = (' '.join(tokens) for tokens in documents)

    # nlp.pipe streams the texts through the pipeline in batches, which is
    # the recommended way to process many texts and avoids the per-call
    # overhead of invoking nlp() once per document.
    return [[token.lemma_ for token in doc] for doc in nlp.pipe(texts)]

In [27]:
lemmatized_documents = lemmatize_documents(processed_texts)
print(lemmatized_documents[0])


['film', 'top', 'gun', 'maverick', 'maverick', 'play', 'tom', 'cruise', 'charge', 'training', 'young', 'pilot', 'complete', 'seemingly', 'impossible', 'mission', 'fly', 'jet', 'deep', 'rocky', 'canyon', 'stay', 'low', 'ground', 'detect', 'radar', 'rapidly', 'climb', 'canyon', 'extreme', 'angle', 'avoid', 'rock', 'wall', 'spoiler', 'alert', 'maverick', 'help', 'human', 'pilot', 'accomplish', 'mission', 'machine', 'hand', 'would', 'struggle', 'complete', 'pulsepounde', 'task', 'autonomous', 'aircraft', 'instance', 'straightforward', 'path', 'toward', 'target', 'conflict', 'machine', 'need', 'avoid', 'collide', 'canyon', 'wall', 'stay', 'undetecte', 'many', 'exist', 'ai', 'method', 'be', 'not', 'able', 'overcome', 'conflict', 'know', 'stabilizeavoid', 'problem', 'would', 'unable', 'reach', 'goal', 'safely', 'mit', 'researcher', 'develop', 'new', 'technique', 'solve', 'complex', 'stabilizeavoid', 'problem', 'well', 'method', 'machinelearne', 'approach', 'match', 'exceed', 'safety', 'exist'

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Join the tokens back into documents: the vectorizer is fitted on one
# space-separated string per document.
documents_joined = [' '.join(doc) for doc in lemmatized_documents]

# Create the CountVectorizer with its default settings.
# NOTE(review): per the scikit-learn docs the default token_pattern only keeps
# tokens of two or more alphanumeric characters, so single-character tokens
# (e.g. the '2' produced from 'two') get no column — this matches the feature
# list starting at '10'. Confirm this is intended.
vectorizer = CountVectorizer()

# Fit the vectorizer and transform the data: learns the vocabulary and
# returns the document-term count matrix (one row per document).
cv = vectorizer.fit_transform(documents_joined)


In [29]:
# Vocabulary terms, used as column labels for the count matrix.
col = vectorizer.get_feature_names_out()
# Convert the count matrix to a dense array so it can be wrapped in a DataFrame.
# NOTE(review): a dense documents x vocabulary array can get large — presumably
# fine for this corpus; confirm memory use if the data grows.
data = cv.toarray()

In [30]:
token_df = pd.DataFrame(data, columns=col)
token_df.head()

Unnamed: 0,10,100,1000,1000000,1000000000,11,14,16,19,50,...,zoubin,zoya,zuber,zue,zurich,zurichs,zwicky,zwieback,zygouras,zytek
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
#store the cleaned text 
%store token_df
%store lemmatized_documents
%store wtn_txt
%store articles

Stored 'token_df' (DataFrame)
Stored 'lemmatized_documents' (list)
Stored 'wtn_txt' (list)
Stored 'articles' (DataFrame)


# Now let's proceed to modeling!