In [1]:
import re
import string
import nltk

import pandas as pd

from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

STOP_WORDS = stopwords.words()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mahlo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mahlo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def cleaning(text):
    """
    Convert to lowercase.
    Rremove URL links, special characters and punctuation.
    Tokenize and remove stop words.
    """
    print(f'Cleaning {text}')
    text = text.lower()
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('[’“”…]', '', text)

    # removing the stop-words
    text_tokens = word_tokenize(text)
    tokens_without_sw = [
        word for word in text_tokens if not word in STOP_WORDS]
    filtered_sentence = (" ").join(tokens_without_sw)
    text = filtered_sentence

    return text

In [5]:
input_file = 'class_titles.csv'
max_rows = None
df = pd.read_csv(input_file,
                     delimiter = ',',
                     nrows = max_rows,
                     engine = "python")

df.head()
print(df['ï»¿title'])

0                                           '--
1                         Dev - Data Correction
2                            Dev - Tasking Work
3              > Annual Leave / Vacation (Paid)
4                            > Lunch/Break time
                         ...                   
1613                          YourAdministrator
1614                ZZConnect COVID Development
1615    ZZContract - Tech Assessment - Customer
1616           ZZHA Internal Meetings - Platf..
1617                                        NaN
Name: ï»¿title, Length: 1618, dtype: object


In [7]:
if __name__ == "__main__":
    max_rows = len(df['ï»¿title'])-1  # 'None' to read whole file
    df = pd.read_csv(input_file,
                     delimiter = ',',
                     nrows = max_rows,
                     engine = "python")

    dt = df['ï»¿title'].apply(cleaning)

    word_count = Counter(" ".join(dt).split()).most_common(10)
    word_frequency = pd.DataFrame(word_count, columns = ['Word', 'Frequency'])
    print(word_frequency)

Cleaning '--
Cleaning  Dev - Data Correction
Cleaning  Dev - Tasking Work
Cleaning > Annual Leave / Vacation (Paid)
Cleaning > Lunch/Break time
Cleaning > Meeting
Cleaning > Not Working
Cleaning > Project Badger
Cleaning > Research
Cleaning > Sick Leave
Cleaning > Training
Cleaning > Unpaid Leave
Cleaning 00_Office
Cleaning 01 - 1099's Forms
Cleaning 01_Homeoffice
Cleaning 0101 - Calls
Cleaning 0102 - Data Entry 
Cleaning 0103 - Emails
Cleaning 0104 - Mail Management
Cleaning 0105 - Meetings
Cleaning 0106 - Review
Cleaning 02 - Administrative Services
Cleaning 020 Planning
Cleaning 0201 - Calls
Cleaning 0202 - Data Entry
Cleaning 0203 - Emails
Cleaning 0204 - Mail Management
Cleaning 0205 - Meetings
Cleaning 0206 - Scan
Cleaning 0207 - Review
Cleaning 03 - Bookkeeping 
Cleaning 030 Strategy
Cleaning 0301 - Calls
Cleaning 0302 - Emails
Cleaning 0303 - Data Entry
Cleaning 0304 - Download Information
Cleaning 0305 - Meetings
Cleaning 0306 - Reports
Cleaning 0307 - Request Information
Clea

Cleaning Client Work - NB
Cleaning Client Work - Not Billable
Cleaning Client Work - Overtime-After Hours
Cleaning Client Work (NC)
Cleaning Client Work (Urgent Rate)
Cleaning Client Work Outsourced
Cleaning Client Work Remote
Cleaning Client/Status
Cleaning Client: Auditing
Cleaning Client: Consultancy
Cleaning Client: Content Creation
Cleaning Client: Recommendations
Cleaning Client: Research
Cleaning Client: Updates
Cleaning Closing
Cleaning CLUBS LSV
Cleaning CLUBS SLSQ
Cleaning CM: Comms
Cleaning CM: Mgmt & Planning
Cleaning CN - Lead Proposal
Cleaning CN - Lead Research
Cleaning CN - Research
Cleaning Coaching
Cleaning Coffee and Capital
Cleaning Cold Call
Cleaning Cold call/Contact
Cleaning Collaborator
Cleaning Collect Blood Delivery
Cleaning Collection
Cleaning Collection from Office
Cleaning Collections
Cleaning Colorado
Cleaning Comment
Cleaning Committee work
Cleaning Communicating
Cleaning Communication
Cleaning Community service (pro-bono)
Cleaning Community/Learning
Clea

Cleaning Partner
Cleaning Partner Account Management
Cleaning Partnerships
Cleaning Passive Income Visa
Cleaning Patching & Upgrade
Cleaning Payments
Cleaning Payroll
Cleaning Payroll Processing
Cleaning Payroll Report
Cleaning Payroll Tax Forms
Cleaning PD Session 
Cleaning Pennsylvania
Cleaning Performance Management
Cleaning Periodical Reports
Cleaning Permanent Residency
Cleaning Personal
Cleaning Personal Apt.
Cleaning Personal Learning & Development
Cleaning Personal Leave / Sick Leave
Cleaning Personal Time
Cleaning Personal/Carer's Leave
Cleaning Philanthropic
Cleaning Philly Startup Leaders
Cleaning Philly Tech Meetup
Cleaning Phone Attendance
Cleaning Phone Call
Cleaning Phone Call Support
Cleaning Phone Support
Cleaning Photography & Video
Cleaning Planning
Cleaning Planning - Requirements
Cleaning Planning/Sales 
Cleaning PM & Admin
Cleaning Potential Client Activity
Cleaning PPC
Cleaning Prepare and Issue a Fee Proposal
Cleaning Prepare Monthly Financial Statements
Cleanin