#LOAD AND INSPECT DATASET


In [25]:
import pandas as pd # Load structured data into dataframes
import re # built in library to remove punctuation, symbols, and numbers (Text Cleaning)
import numpy as np # Enables fast numerical operations
import nltk # natural language toolkit
from nltk.corpus import stopwords # built in library full of stop words
from nltk.stem import WordNetLemmatizer # lemmatization reduces words to their dictionary form
# Example: 'delays' -> 'delay', 'issues' -> 'issue'
from nltk.sentiment import SentimentIntensityAnalyzer

In [26]:
from google.colab import files
uploaded = files.upload()

Saving text_data_unclean.csv to text_data_unclean (2).csv


In [27]:
df_text = pd.read_csv('text_data_unclean.csv')
df_text.head()
df_text.info()
df_text.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             400 non-null    int64  
 1   category       400 non-null    object 
 2   text           400 non-null    object 
 3   keyword_count  400 non-null    int64  
 4   sentiment      400 non-null    float64
 5   risk_score     400 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 18.9+ KB


Unnamed: 0,0
id,0
category,0
text,0
keyword_count,0
sentiment,0
risk_score,0


#CLEAN THE TEXT

In [28]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', ' ', text) # -> Only keep letters a-z and spaces, remove numbers, punctuation, emojis, symbol, etc.
    text = re.sub(r'\s+', ' ', text).strip() # -> replace messy whitespaces with a single space, .strip() remove whitespaces at begining/end
    return text
df_text['clean_text'] = df_text['text'].apply(clean_text)

#REMOVE STOPWORDS

In [29]:
nltk.download('stopwords') # Library full of stopwords

stop_words = set(stopwords.words('english')) # -> convert from list to set for faster look up

def remove_stopwords(text):
    return ' '.join(
        [word for word in text.split() if word not in stop_words])
df_text['clean_text'] = df_text['clean_text'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# LEMMATIZATION

In [22]:
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()]) # -> loop and lematize each word

df_text['clean_text'] = df_text['clean_text'].apply(lemmatize_text)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#RECOMPUTE KEYWORD COUNT


In [23]:
keywords = [
    'shortage', 'delay', 'instability', 'import', 'demand', 'supply',
    'low', 'availability', 'issue', 'risk', 'critical', 'problem',
    'drop', 'decline', 'disruption', 'uncertain', 'crisis', 'volatile'
]

keyword_set = set(keywords)
# Checking if a word is in a set is O(1) time
# Checking in a list is O(n) time

def count_keywords(text):
    words = text.split()
    count = sum(1 for word in words if word in keyword_set)
    return count

df_text['keyword_count'] = df_text['clean_text'].apply(count_keywords)
display(df_text['keyword_count'])

Unnamed: 0,keyword_count
0,1
1,1
2,2
3,1
4,1
...,...
395,1
396,1
397,1
398,1


#RECOMPUTE SENTIMENT SCORE

In [32]:
nltk.download('vader_lexicon') # -> huge dictionary of words with sentiment weights

sia = SentimentIntensityAnalyzer()

def compute_sentiment(text):
    score = sia.polarity_scores(text)
    return score['compound']
    # -1 -> very negative
    # 0 -> neutral
    # 1 -> very positive

df_text['sentiment'] = df_text['clean_text'].apply(compute_sentiment)
display(df_text['sentiment'])

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,sentiment
0,0.0000
1,-0.2732
2,0.0000
3,0.0000
4,0.0000
...,...
395,-0.7184
396,-0.1280
397,0.0000
398,-0.6124


#RECOMPUTE RISK SCORE

In [35]:
def compute_risk(row):
    raw_risk = (5 * row['keyword_count']) - (5 * row['sentiment'])
    # raw_risk = a ⋅ keyword_count + b ⋅ sentiment
    risk = max(1, min(10, round(raw_risk)))
    # Transform it into a value to a value on the scale of 1-10
    return risk
df_text['risk_score'] = df_text.apply(compute_risk, axis=1)

In [36]:
df_text.to_csv("Preprocessed_text.csv", index=False)

In [37]:
from google.colab import files
files.download('Preprocessed_text.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>