In [215]:
import numpy as np 
import pandas as pd 
import nltk
from bs4 import BeautifulSoup
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [235]:
# Data Cleaning
# Load the raw export, discard the 'index' and 'url' columns, and turn missing
# values into empty strings (later cells call .lower() on every value).
df = (
    pd.read_csv('smartthings_latest_1000.csv')
    .drop(columns=['index', 'url'])
    # fillna is the direct API for missing values; replace(np.nan, '', regex=True)
    # carried a regex flag that has no meaning for a NaN target.
    .fillna('')
)
df.shape

(30000, 3)

In [223]:
# Cleaning: 
# remove HTML 
# Tokenization + Remove Punctuation
# Remove Stop Words
# Lemmatization or Stemming 


In [224]:
# Preview the loaded frame; the saved rendering is truncated (note the "..." row).
df

Unnamed: 0,title,tag,subtag
0,Public calendar for SmartThings developer events,Groups & Events,
1,Sonos Voice Notification Issues (Sound doesn't...,Devices & Integrations,
2,Ideas on triggering events once I wake up?,Projects & Stories,project_getstarted
3,17 Amcrest IP cams w NVR to Blue Iris and S.T,Devices & Integrations,
4,No Hub In Your Current Location (New App),Apps & Clients,
...,...,...,...
29995,Samsung receives ZigBee 3.0 certification,General Discussion,iotindustry
29996,Osram Lightify A19 RGBW and dimmable white,Devices & Integrations,
29997,Philips 464800 Hue White Ambiance BR30 LED Bul...,Deals,
29998,"Turn off ""Fan"" without turning off ""Roku"" in H...",Devices & Integrations,alexa


In [225]:
# Strip HTML markup
def remove_html(text):
    """Return the text content of ``text`` with markup removed.

    The input is parsed with BeautifulSoup's 'lxml' parser and the result of
    ``get_text()`` is returned.
    """
    return BeautifulSoup(text, 'lxml').get_text()

# Strip punctuation
def remove_punctuation(text):
    """Return ``text`` with every element found in ``string.punctuation`` removed.

    The remaining elements are concatenated, in order, into a single string.
    """
    kept = filter(lambda ch: ch not in string.punctuation, text)
    return "".join(kept)

# Tokenize - split string into list of words based on RegEx
# The pattern r'\w+' matches runs of word characters (letters, digits, underscore),
# so punctuation and whitespace between those runs do not end up in any token.
tokenizer = RegexpTokenizer(r'\w+')

# Remove stop words - most frequently used words that have low predictive power.

# Stop-word sets keyed by language, filled on first use so the NLTK list is
# fetched once instead of once per token.
_STOPWORD_CACHE = {}


def _stopword_set(language='english'):
    """Return the NLTK stop-word list for ``language`` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def remove_stopwords(text, stop_words=None):
    """Drop stop words from an iterable of tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (e.g. the output of ``tokenizer.tokenize``).
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    list of str
        The tokens of ``text`` that are not stop words, in their original order.
    """
    # A frozenset gives constant-time membership checks instead of scanning a list.
    excluded = _stopword_set() if stop_words is None else frozenset(stop_words)
    return [w for w in text if w not in excluded]

# Stemming and Lemmatization - shorten words to their root form
# Lemmatizing - maps variants of a word onto one common base.
lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
    """Lemmatize each token in ``text`` and return the results joined by single spaces."""
    lemmas = map(lemmatizer.lemmatize, text)
    return " ".join(lemmas)

# Stemming - reduce each word to its Porter stem.
stemmer = PorterStemmer()
def word_stemmer(text):
    """Stem each token in ``text`` and return the results joined by single spaces."""
    return " ".join(stemmer.stem(token) for token in text)



In [226]:
def _lemmatize_cell(cell):
    """Lower-case one cell, tokenize it, drop stop words, and return the lemmas joined by spaces."""
    tokens = tokenizer.tokenize(cell.lower())
    return word_lemmatizer(remove_stopwords(tokens))

# Tokenize -> remove stop words -> lemmatize, in one pass over every cell.
# Series.map per column is used instead of DataFrame.applymap, which newer
# pandas releases deprecate; the element-wise result is the same.
df1 = df.apply(lambda column: column.map(_lemmatize_cell))

In [230]:
# Write the lemmatized frame to CSV with a header row and without the index column.
df1.to_csv('Clean_data_lemmatized.csv', header=True, index=False)

In [236]:
def _stem_cell(cell):
    """Lower-case one cell, tokenize it, drop stop words, and return the stems joined by spaces."""
    tokens = tokenizer.tokenize(cell.lower())
    return word_stemmer(remove_stopwords(tokens))

# Tokenize -> remove stop words -> stem, in one pass over every cell.
# Series.map per column is used instead of DataFrame.applymap, which newer
# pandas releases deprecate; the element-wise result is the same.
df2 = df.apply(lambda column: column.map(_stem_cell))
    

In [234]:
# Preview the stemmed frame.
# NOTE(review): the saved output below still shows words such as "for", "on" and
# "when", so it appears to predate the stop-word removal step - re-run to refresh.
df2

Unnamed: 0,title,tag,subtag
0,public calendar for smartth develop event,group event,
1,sono voic notif issu sound doesn t resum when ...,devic integr,
2,idea on trigger event onc i wake up,project stori,project_getstart
3,17 amcrest ip cam w nvr to blue iri and s t,devic integr,
4,no hub in your current locat new app,app client,
...,...,...,...
29995,samsung receiv zigbe 3 0 certif,gener discuss,iotindustri
29996,osram lightifi a19 rgbw and dimmabl white,devic integr,
29997,philip 464800 hue white ambianc br30 led bulb ...,deal,
29998,turn off fan without turn off roku in harmoni,devic integr,alexa


In [None]:
# Write the stemmed frame to CSV with a header row and without the index column.
df2.to_csv('Clean_data_word_stemmer.csv', header=True, index=False)