#  Azure Text Classification - preprocessed vs unprocessed data

In [3]:
import os
import pandas as pd
import numpy as np

Below I loop through the text files in the data folder, create a dictionary with each file's text and name, and save the dictionaries in a list named `text_data`. The list is then converted into a dataframe and saved as `data.csv`.

In [4]:
def load_text_files(data_dir, encoding="utf-8"):
    """Collect the contents of every ``.txt`` file found under ``data_dir``.

    The directory tree is walked recursively with ``os.walk``. Each file is
    opened relative to the directory it was found in, so files located in
    sub-folders are read from their real location.

    Parameters
    ----------
    data_dir : str
        Root folder to search.
    encoding : str, default "utf-8"
        Text encoding used to decode the files.

    Returns
    -------
    list of dict
        One ``{"text": ..., "filename": ...}`` record per file, where
        ``filename`` is the bare file name (no directory part).
    """
    records = []
    for root, _directories, files in os.walk(data_dir):
        for filename in files:
            if not filename.endswith(".txt"):
                continue
            # Join with `root` (the folder currently being walked) rather
            # than the top-level folder, otherwise nested files are not found.
            path = os.path.join(root, filename)
            # The context manager closes the handle as soon as the file is read.
            with open(path, encoding=encoding) as handle:
                records.append({"text": handle.read(), "filename": filename})
    return records


text_data = load_text_files("data")

In [5]:
# Turn the collected records into a frame (one row per record) and persist it
# to disk without writing the positional index as a column.
df = pd.DataFrame(data=text_data)
df.to_csv(path_or_buf='data.csv', index=False)

In [6]:
# Reload the frame from the CSV file so the rest of the notebook works from
# the on-disk copy.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [7]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,text,filename
0,A versatile potentiostat based on inexpensive ...,060.txt
1,Introduction: Family plays an important role i...,074.txt
2,A novel hybrid system composed of a photocatal...,048.txt
3,"In Einstein-aether theory, violating Lorentz i...",114.txt
4,"With the vigorous spread of renewable energy, ...",100.txt


## Feature Extraction and Preprocessing 

The following feature extraction and preprocessing code is based on the work found here: https://github.com/EnesGokceDS/Amazon_Reviews_NLP_Capstone_Project

In [8]:
import wordcloud
from nltk.corpus import stopwords
import nltk
import string
# Fetch the NLTK resources, in a fixed order.
for nltk_package in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_package)

# English stop-word list (a plain Python list) used by later cells.
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patmellon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/patmellon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/patmellon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/patmellon/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [9]:
# Count the stopwords
# `stop` is a list, so every `in` test scans it linearly; a set gives the
# same answers with constant-time lookups.
stop_set = set(stop)

# Tokens are the whitespace-separated pieces of the text and are compared
# exactly as they appear (case and any attached punctuation included).
df['stopwords'] = df['text'].apply(
    lambda text: sum(1 for token in text.split() if token in stop_set)
)
df[['text', 'stopwords']].head()

Unnamed: 0,text,stopwords
0,A versatile potentiostat based on inexpensive ...,42
1,Introduction: Family plays an important role i...,84
2,A novel hybrid system composed of a photocatal...,40
3,"In Einstein-aether theory, violating Lorentz i...",45
4,"With the vigorous spread of renewable energy, ...",72


In [10]:
# Count the punctuation
def count_punct(text):
    """Return how many characters of ``text`` appear in ``string.punctuation``."""
    punctuation_marks = set(string.punctuation)
    total = 0
    for character in text:
        if character in punctuation_marks:
            total += 1
    return total

# One punctuation count per document; the function is passed directly, no
# wrapper lambda needed.
df['punctuation'] = df['text'].apply(count_punct)

In [11]:
# Show the text next to its punctuation count for the first five rows.
df[['text', 'punctuation']].head(n=5)

Unnamed: 0,text,punctuation
0,A versatile potentiostat based on inexpensive ...,12
1,Introduction: Family plays an important role i...,27
2,A novel hybrid system composed of a photocatal...,23
3,"In Einstein-aether theory, violating Lorentz i...",27
4,"With the vigorous spread of renewable energy, ...",26


In [12]:
# Count the numbers
# Only whitespace-separated tokens made up entirely of digits are counted,
# so tokens such as "3.5" or "2019," do not contribute.
df['numerics'] = df['text'].apply(
    lambda text: sum(1 for token in text.split() if token.isdigit())
)
df[['text', 'numerics']].head()

Unnamed: 0,text,numerics
0,A versatile potentiostat based on inexpensive ...,0
1,Introduction: Family plays an important role i...,1
2,A novel hybrid system composed of a photocatal...,2
3,"In Einstein-aether theory, violating Lorentz i...",0
4,"With the vigorous spread of renewable energy, ...",1


In [13]:
# Count the uppercase words
# A token counts when str.isupper() is true for it, i.e. it has at least one
# cased character and none of them are lowercase.
df['upper'] = df['text'].apply(
    lambda text: sum(1 for token in text.split() if token.isupper())
)
df[['text', 'upper']].head()

Unnamed: 0,text,upper
0,A versatile potentiostat based on inexpensive ...,1
1,Introduction: Family plays an important role i...,2
2,A novel hybrid system composed of a photocatal...,12
3,"In Einstein-aether theory, violating Lorentz i...",0
4,"With the vigorous spread of renewable energy, ...",2


In [14]:
# Convert text to lowercase
# Splitting and re-joining also collapses every run of whitespace into a
# single space.
df['text'] = df['text'].apply(lambda text: " ".join(map(str.lower, text.split())))
df['text'].head()

0    a versatile potentiostat based on inexpensive ...
1    introduction: family plays an important role i...
2    a novel hybrid system composed of a photocatal...
3    in einstein-aether theory, violating lorentz i...
4    with the vigorous spread of renewable energy, ...
Name: text, dtype: object

In [15]:
# Remove punctuation
# `regex=True` is passed explicitly: relying on the implicit default emits a
# FutureWarning on older pandas and, on pandas >= 2.0 (where the default is
# regex=False), the pattern would be matched literally and nothing removed.
# The raw string keeps `\w` / `\s` from being read as string escapes.
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
df['text'].head()

  df['text'] = df['text'].str.replace('[^\w\s]','')


0    a versatile potentiostat based on inexpensive ...
1    introduction family plays an important role in...
2    a novel hybrid system composed of a photocatal...
3    in einsteinaether theory violating lorentz inv...
4    with the vigorous spread of renewable energy m...
Name: text, dtype: object

In [16]:
# Remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Set membership is constant-time; testing against the list scans it for
# every token. The filtering result is the same.
stop_set = set(stop)

# NOTE(review): punctuation was stripped from the text in an earlier cell, so
# any stop word that contains an apostrophe can no longer match its
# punctuation-free form in the text -- confirm whether that matters here.
df['text'] = df['text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_set)
)
df['text'].sample(10)

6      several studies investigating causes delay pro...
134    simple fast sensitive accurate methodology bas...
81     neural circuit solve system simultaneous linea...
120    paper focuses analyzing concepts green buildin...
91     background erythropoiesisstimulating agents es...
27     every evolution fluid uniquely described energ...
48     purpose study examine health children born cos...
167    novel dispersive delay structure dds based com...
25     selection suitable working fluids use organic ...
78     prenatal maternal diet may influence disease s...
Name: text, dtype: object

In [17]:
# Correct spelling
from textblob import TextBlob

# Preview only: the corrected strings for the first five rows are displayed
# but not written back to df['text'].
df['text'].iloc[:5].apply(lambda document: str(TextBlob(document).correct()))

0    versatile potentiostat based expensive shelf c...
1    introduction family plays important role patie...
2    novel horrid system composed photocatalytic fu...
3    einsteinaether theory violating lorentz varian...
4    vigorous spread renewal energy much attention ...
Name: text, dtype: object

In [18]:
# Write the current state of the frame (all columns, no index column) to a
# new CSV file.
df.to_csv(path_or_buf='processed_data.csv', index=False)

### Upload data to Azure

The code below is based on this Quickstart guide: https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python?tabs=environment-variable-linux.

In [20]:
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv

# Read settings from a local .env file (if present) into the environment.
load_dotenv()

AZURE_STORAGE_CONNECTION_STRING = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
CONTAINER_NAME = os.getenv("CONTAINER_NAME")

# os.getenv returns None for unset variables; stop here with a readable
# message instead of handing None to the Azure SDK.
_missing_settings = [
    name
    for name, value in (
        ("AZURE_STORAGE_CONNECTION_STRING", AZURE_STORAGE_CONNECTION_STRING),
        ("CONTAINER_NAME", CONTAINER_NAME),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

blob_service_client = BlobServiceClient.from_connection_string(AZURE_STORAGE_CONNECTION_STRING)

# Remember which blob is being processed so a failure can be attributed.
current_blob = None

try:
    print("Uploading...")

    # Loop through each row and upload to Azure, one blob per row named after
    # the row's `filename` value.
    for _index, row in df.iterrows():
        current_blob = row['filename']
        blob_client = blob_service_client.get_blob_client(container=CONTAINER_NAME, blob=current_blob)
        # Set overwrite to true
        blob_client.upload_blob(row['text'], overwrite=True)

    print("Finished")

except Exception as ex:
    # Best-effort upload: report the problem and stop; rows after the failing
    # one are not uploaded.
    print('Exception:')
    print(ex)
    print(f"Upload stopped while processing blob: {current_blob}")

Azure Blob Storage v12.13.0 - Python quickstart sample


In [None]:
# Remove numbers, lemmatize, and strip extra whitespace

In [22]:
# Remove numbers
# `regex=True` is required for `\d+` to be treated as a pattern: the implicit
# default triggers a FutureWarning on older pandas and is regex=False on
# pandas >= 2.0, where the literal text "\d+" would be searched for instead.
# The raw string avoids the invalid `\d` string escape.
df['text'] = df['text'].str.replace(r'\d+', '', regex=True)
df['text'].head()

  df['text'] = df['text'].str.replace('\d+', '')


0    versatile potentiostat based inexpensive shelf...
1    introduction family plays important role patie...
2    novel hybrid system composed photocatalytic fu...
3    einsteinaether theory violating lorentz invari...
4    vigorous spread renewable energy much attentio...
Name: text, dtype: object