In [10]:
! pip install nltk

In [11]:
import pandas as pd

df = pd.read_csv('data.csv', header=None, names=['text', 'category'])

df.head(10)

Unnamed: 0,text,category
0,text,category
1,I'm interested in upgrading my internet plan. ...,Sales
2,I'm having trouble with my internet connection...,Service
3,What are your internet speeds like? I'm consid...,General
4,I want to know more about your internet packag...,Sales
5,My internet bill seems higher than usual this ...,Service
6,I'm moving to a new apartment next month and n...,Sales
7,I'm experiencing slow speeds with my current i...,Service
8,What are your customer service hours? I need h...,Service
9,I'm interested in upgrading my internet plan. ...,Sales


In [12]:
# lowercase
df['text'] = df['text'].str.lower()

df.head(10)

Unnamed: 0,text,category
0,text,category
1,i'm interested in upgrading my internet plan. ...,Sales
2,i'm having trouble with my internet connection...,Service
3,what are your internet speeds like? i'm consid...,General
4,i want to know more about your internet packag...,Sales
5,my internet bill seems higher than usual this ...,Service
6,i'm moving to a new apartment next month and n...,Sales
7,i'm experiencing slow speeds with my current i...,Service
8,what are your customer service hours? i need h...,Service
9,i'm interested in upgrading my internet plan. ...,Sales


In [13]:
# remove whitespaces

def remove_whitespaces(text):
    return ' '.join(text.split())

df['text'] = df['text'].apply(remove_whitespaces)

df.head(10)

Unnamed: 0,text,category
0,text,category
1,i'm interested in upgrading my internet plan. ...,Sales
2,i'm having trouble with my internet connection...,Service
3,what are your internet speeds like? i'm consid...,General
4,i want to know more about your internet packag...,Sales
5,my internet bill seems higher than usual this ...,Service
6,i'm moving to a new apartment next month and n...,Sales
7,i'm experiencing slow speeds with my current i...,Service
8,what are your customer service hours? i need h...,Service
9,i'm interested in upgrading my internet plan. ...,Sales


In [15]:
import nltk

nltk.download('punkt')

from nltk import word_tokenize

df['text'] = df['text'].apply(word_tokenize)

df.head(10)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,category
0,[text],category
1,"[i, 'm, interested, in, upgrading, my, interne...",Sales
2,"[i, 'm, having, trouble, with, my, internet, c...",Service
3,"[what, are, your, internet, speeds, like, ?, i...",General
4,"[i, want, to, know, more, about, your, interne...",Sales
5,"[my, internet, bill, seems, higher, than, usua...",Service
6,"[i, 'm, moving, to, a, new, apartment, next, m...",Sales
7,"[i, 'm, experiencing, slow, speeds, with, my, ...",Service
8,"[what, are, your, customer, service, hours, ?,...",Service
9,"[i, 'm, interested, in, upgrading, my, interne...",Sales


In [28]:
# remove stopwords

nltk.download('stopwords')

from nltk.corpus import stopwords

en_stopwords = stopwords.words('english')
# type(en_stopwords)

en_stopwords.append("'m")
def remove_stopwords(text):
    result = []
    # iterate over text in a row
    for token in text:
        if token not in en_stopwords:
            result.append(token)

    return result

df['text'] = df['text'].apply(lambda x: remove_stopwords(x))

df.head(10)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,category
0,[text],category
1,"[interested, upgrading, internet, plan, ., pro...",Sales
2,"[trouble, internet, connection, ., keeps, drop...",Service
3,"[internet, speeds, like, ?, considering, switc...",General
4,"[want, know, internet, packages, ., send, info...",Sales
5,"[internet, bill, seems, higher, usual, month, ...",Service
6,"[moving, new, apartment, next, month, need, se...",Sales
7,"[experiencing, slow, speeds, current, internet...",Service
8,"[customer, service, hours, ?, need, help, bill...",Service
9,"[interested, upgrading, internet, plan, ., pro...",Sales


In [29]:
# remove punctuation

from nltk.tokenize import RegexpTokenizer

def remove_puntuations(text):

    tokenizer = RegexpTokenizer(r'\w+')
    result = tokenizer.tokenize(' '.join(text))
    return result

df['text'] = df['text'].apply(remove_puntuations)

df.head(10)

Unnamed: 0,text,category
0,[text],category
1,"[interested, upgrading, internet, plan, provid...",Sales
2,"[trouble, internet, connection, keeps, dropping]",Service
3,"[internet, speeds, like, considering, switchin...",General
4,"[want, know, internet, packages, send, informa...",Sales
5,"[internet, bill, seems, higher, usual, month, ...",Service
6,"[moving, new, apartment, next, month, need, se...",Sales
7,"[experiencing, slow, speeds, current, internet...",Service
8,"[customer, service, hours, need, help, billing...",Service
9,"[interested, upgrading, internet, plan, provid...",Sales


In [30]:
# Lemmatize

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

def lemmatize_text(text):
    result = []

    wordnet_lem = WordNetLemmatizer()

    for token, pos in pos_tag(text):
        pos = pos[0].lower()
        if pos not in ['a','n','v','r']:
            pos = 'n'
        result.append(wordnet_lem.lemmatize(token,pos))
    return result

df['text'] = df['text'].apply(lemmatize_text)

df.head(10)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...


Unnamed: 0,text,category
0,[text],category
1,"[interested, upgrading, internet, plan, provid...",Sales
2,"[trouble, internet, connection, keep, drop]",Service
3,"[internet, speed, like, consider, switch, prov...",General
4,"[want, know, internet, package, send, informat...",Sales
5,"[internet, bill, seem, higher, usual, month, h...",Service
6,"[move, new, apartment, next, month, need, set,...",Sales
7,"[experience, slow, speed, current, internet, p...",Service
8,"[customer, service, hour, need, help, bill, is...",Service
9,"[interested, upgrading, internet, plan, provid...",Sales


In [31]:
# remove words with len less than 1

def remove_words(text):
    result = []
    for word in text:
        if len(word)>1:
            result.append(word)
    return result

df['text'] = df['text'].apply(lambda x: remove_words(x))

df.head(10)


Unnamed: 0,text,category
0,[text],category
1,"[interested, upgrading, internet, plan, provid...",Sales
2,"[trouble, internet, connection, keep, drop]",Service
3,"[internet, speed, like, consider, switch, prov...",General
4,"[want, know, internet, package, send, informat...",Sales
5,"[internet, bill, seem, higher, usual, month, h...",Service
6,"[move, new, apartment, next, month, need, set,...",Sales
7,"[experience, slow, speed, current, internet, p...",Service
8,"[customer, service, hour, need, help, bill, is...",Service
9,"[interested, upgrading, internet, plan, provid...",Sales


In [33]:
df['text'] = [' '.join(map(str,token)) for token in df['text']]

df.head(10)

Unnamed: 0,text,category
0,text,category
1,interested upgrading internet plan provide option,Sales
2,trouble internet connection keep drop,Service
3,internet speed like consider switch provider,General
4,want know internet package send information,Sales
5,internet bill seem higher usual month help und...,Service
6,move new apartment next month need set interne...,Sales
7,experience slow speed current internet plan an...,Service
8,customer service hour need help bill issue,Service
9,interested upgrading internet plan provide option,Sales


In [34]:
df.to_csv('./pdata.csv', index = False, encoding='utf-8')