# EDA and cleaning

In [213]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import text


In [125]:
df = pd.read_csv('corpus.csv')

In [126]:
df.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self
0,"Getting ""faux"" phone# for your business, to hi...",[removed],smallbusiness,1610836144,neovngr,1,1,True
1,What being a CEO of your company has taught you?,I am going to launch my business this year. I ...,smallbusiness,1610838573,ChampionshipAct,17,1,True
2,Small Business advice,Hi. I recently opened up my own painting busin...,smallbusiness,1610839361,oliverbutcher,0,1,True
3,"As a business in Glendale AZ, do I need to cha...",I'm sorry if this isn't the right place to ask...,smallbusiness,1610839408,NickyNice,4,1,True
4,shipping !,i’m needing help with shipping info. \ni will ...,smallbusiness,1610844945,vickyendgame,10,1,True


Remove [removed] from selftext

In [127]:
df['selftext'] = df['selftext'].map(lambda x: np.nan if x == '[removed]' else str(x))

In [128]:
df.dropna(subset = ['selftext'], inplace=True)

In [129]:
df.shape

(12302, 8)

Veryfied that I have 12302 rows to work with, having exclude the empty `'selftext'` rows, I proceed veryfing the number of posts I have for each subreddit.

In [130]:
df['subreddit'].value_counts()

smallbusiness    7247
startups         5055
Name: subreddit, dtype: int64

Small business sample size is bigger than startups, but both seem large enough to continue with the analysis.

In [131]:
(df['num_comments']).groupby(df['subreddit']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
smallbusiness,7247.0,11.180488,26.807854,0.0,2.0,5.0,11.0,968.0
startups,5055.0,10.792483,28.851644,0.0,1.0,4.0,11.0,809.0


Both samples seems to behave similarly: the average number of comment for posts in both subreddits is a little over 10, and, even if the largest amount of comments is a little higher for smallbusiness (968) than `startup` (809), they are in the same order of magnitude.

I check for duplicate posts in my sample, and establish that there are not.

In [132]:
df[df.duplicated()]

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self


I also want to add a feature with the all corpus of the post, merging the title and the post body.

In [133]:
df['alltext'] = df['title'] + ' ' + df['selftext']

**Target**

I want to predict 'subreddit', so I modify the current `subreddit` feature into `target` and set startups = 1, smallbusiness = 0.

In [134]:
df['target'] = df['subreddit'].map({'startups': 1, 'smallbusiness': 0})
df.drop('subreddit', axis=1, inplace=True)
df.head()

Unnamed: 0,title,selftext,created_utc,author,num_comments,score,is_self,alltext,target
1,What being a CEO of your company has taught you?,I am going to launch my business this year. I ...,1610838573,ChampionshipAct,17,1,True,What being a CEO of your company has taught yo...,0
2,Small Business advice,Hi. I recently opened up my own painting busin...,1610839361,oliverbutcher,0,1,True,Small Business advice Hi. I recently opened up...,0
3,"As a business in Glendale AZ, do I need to cha...",I'm sorry if this isn't the right place to ask...,1610839408,NickyNice,4,1,True,"As a business in Glendale AZ, do I need to cha...",0
4,shipping !,i’m needing help with shipping info. \ni will ...,1610844945,vickyendgame,10,1,True,shipping ! i’m needing help with shipping info...,0
5,What to do if company is growing too fast,I'm running our family business which provides...,1610846426,yellowflash85,56,1,True,What to do if company is growing too fast I'm ...,0


**Clean**

In [135]:
def cleaner(text):
    # Make lowercase
    text = text.lower()

    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    
    # Remove punctuation and split 's, 't, 've with a space for filter
    text = re.sub(r'[' + string.punctuation.replace('@', '') + ']+', ' ', text)
    
    # Remove words with 2 or fewer letters
    text = re.sub(r'\b\w{1,2}\b', '', text)
    
    # Remove whitespace (including new line characters)
    text = re.sub(r'\s\s+', ' ', text)
    
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    text = ''.join(c for c in text if c <= '\uFFFF') 
    
    return text


In [136]:
df['alltext'] = df['alltext'].apply(cleaner)

**Lemmatize**

In [137]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    words = text.split()
    lemma_words = ''
    for word in words:
        lemma_words += (lemmatizer.lemmatize(word) + ' ')
    return lemma_words

In [138]:
df['alltext'] = df['alltext'].apply(lemmatize_words)

In [143]:
df.shape

(12302, 9)

In [142]:
df = df.reset_index(drop=True)

**Tokenize**

In [120]:
#tokenizer = RegexpTokenizer(r'\w+')

In [121]:
#def token_word(column):
  #  token_ls = []
 #   for row in column:
  #      t = word_tokenize(row.lower())
  #      token_ls.append(t)
  #  return token_ls

In [144]:
df.to_csv('corpus_clean.csv', index=False)

**Most common words**

In [204]:
# Easy way to get most frequently used words: change max_features
def common_words(i):
    count_vect = CountVectorizer(analyzer = "word", 
                                 tokenizer = None, 
                                 preprocessor = None,
                                 stop_words = "english", 
                                 max_features = 40) 

    # input for CountVectorizer is an array of strings
    vector_input = df[df['target'] == i]['alltext']

    # fit_transform the vectorizer
    words = count_vect.fit_transform(vector_input)

    # convert output to a Numpy array
    words = words.toarray()
    return  count_vect.get_feature_names()

In [208]:
startup_words = common_words(1)

In [209]:
print(startup_words)

['app', 'business', 'company', 'customer', 'don', 'experience', 'feedback', 'founder', 'good', 'ha', 'help', 'idea', 'job', 'just', 'know', 'like', 'looking', 'make', 'marketing', 'need', 'new', 'people', 'product', 'really', 'service', 'share', 'start', 'startup', 'thing', 'think', 'time', 'use', 'user', 'wa', 'want', 'way', 'work', 'working', 'x200b', 'year']


In [210]:
smb_words = common_words(0)

In [211]:
print(smb_words)

['advice', 'business', 'company', 'customer', 'day', 'don', 'employee', 'good', 'ha', 'help', 'idea', 'just', 'know', 'like', 'looking', 'make', 'money', 'month', 'need', 'new', 'owner', 'pay', 'people', 'product', 'question', 'sale', 'service', 'small', 'start', 'tax', 'thing', 'time', 'use', 'wa', 'want', 'way', 'website', 'work', 'x200b', 'year']


`x200b` value is a formatting feature, that I will drop.

`don` is super weired.

In [221]:
text.ENGLISH_STOP_WORDS

add_stop_words = ['don', 'ha', 'x200b', 'wa', 'just']

stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [222]:
def common_words(i):
    count_vect = CountVectorizer(analyzer = "word", 
                                 tokenizer = None, 
                                 preprocessor = None,
                                 stop_words = stop_words, 
                                 max_features = 40) 

    # input for CountVectorizer is an array of strings
    vector_input = df[df['target'] == i]['alltext']

    # fit_transform the vectorizer
    words = count_vect.fit_transform(vector_input)

    # convert output to a Numpy array
    words = words.toarray()
    return  count_vect.get_feature_names()

In [223]:
startup_words = common_words(1)
print(startup_words)

['app', 'business', 'company', 'customer', 'experience', 'feedback', 'founder', 'good', 'help', 'idea', 'job', 'know', 'like', 'looking', 'make', 'market', 'marketing', 'money', 'month', 'need', 'new', 'people', 'product', 'question', 'really', 'service', 'share', 'start', 'startup', 'thing', 'think', 'time', 'use', 'user', 'want', 'way', 'website', 'work', 'working', 'year']


In [224]:
smb_words = common_words(0)
print(smb_words)

['advice', 'business', 'client', 'company', 'customer', 'day', 'employee', 'good', 'help', 'idea', 'know', 'like', 'looking', 'make', 'marketing', 'money', 'month', 'need', 'new', 'online', 'owner', 'pay', 'people', 'product', 'question', 'really', 'sale', 'service', 'small', 'start', 'tax', 'thanks', 'thing', 'time', 'use', 'want', 'way', 'website', 'work', 'year']
