# Project 3 Appendix: Comparison of Lemmatization and Stemming Results

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import requests
import seaborn as sns
import string

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

%matplotlib inline

## Setup

In [3]:
# load the two subreddit scrapes, one dataframe per CSV file
crypto_df, invest_df = map(
    pd.read_csv,
    ('datasets/crypto.csv', 'datasets/invest.csv'),
)

In [6]:
# stack the two subreddit dataframes on top of each other (axis=0 appends rows)
combined_df = pd.concat([crypto_df, invest_df], axis=0)

# list duplicate posts (cross-posts) that share the same selftext;
# the result is empty, i.e. there are no duplicate posts
combined_df[combined_df.duplicated(subset=['selftext'])]

Unnamed: 0.1,Unnamed: 0,subreddit,author,title,selftext,created_utc,full_link


In [7]:
# each source frame kept its own row index through the concat; discard those
# and number the combined rows afresh, then count the posts per subreddit
combined_df = combined_df.reset_index(drop=True)
combined_df['subreddit'].value_counts()

CryptoCurrency    1012
investing         1004
Name: subreddit, dtype: int64

In [8]:
# Combine title and selftext into one text field per post.
# - A single space separates the two parts; without it the last word of the
#   title and the first word of the body fuse into one bogus token.
# - Missing values are treated as empty strings; otherwise `str + NaN` makes
#   the whole combined text NaN and the post's title is lost as well.
combined_df["all_text"] = (
    combined_df["title"].fillna("") + " " + combined_df["selftext"].fillna("")
).str.strip()

# Lemmatization vs Stemming (WordNetLemmatizer, PorterStemmer and SnowballStemmer)

### Lemmatizer and Stemming Result Comparison

In [23]:
# One shared instance of each normaliser, reused for every row.
lemmatizer = WordNetLemmatizer()
p_stemmer = PorterStemmer()
snow_stem = SnowballStemmer(language='english')


def _as_tokens(text):
    """Return `text` as a sequence of word tokens.

    - list / tuple: assumed to be tokenised already and returned unchanged.
    - str: lower-cased and split into runs of word characters. This is only a
      fallback so that the normalisers never receive single characters.
      NOTE(review): the recorded output below looks lower-cased and word-level,
      but the original tokenising step is not visible in this notebook —
      confirm this fallback matches it.
    - None / NaN: no text, so no tokens.
    - any other iterable: materialised into a list.
    """
    if isinstance(text, (list, tuple)):
        return text
    if isinstance(text, str):
        return re.findall(r"\w+", text.lower())
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return []
    return list(text)


def lemstem(row):
    """Add lemmatised and stemmed versions of a row's tokens.

    Parameters
    ----------
    row : pandas.Series
        One row of the dataframe; must contain an 'all_text' entry holding
        either a sequence of tokens or a raw string.

    Returns
    -------
    pandas.Series
        A copy of `row` with three extra entries, each a list aligned
        token-for-token with 'all_text':
        'lemma_text' (WordNetLemmatizer), 'porter_text' (PorterStemmer) and
        'snow_text' (SnowballStemmer). If 'all_text' was not already a
        list/tuple it is replaced by its token list so the alignment holds.
    """
    # DataFrame.apply does not support functions that mutate the object they
    # are handed, so work on a copy.
    row = row.copy()

    raw_text = row['all_text']
    tokens = _as_tokens(raw_text)
    if tokens is not raw_text:
        # keep the "original" column comparable with the normalised columns
        row['all_text'] = tokens

    # Lemmatizing, Porter Stemming and Snowball Stemming
    row['lemma_text'] = [lemmatizer.lemmatize(tok) for tok in tokens]
    row['porter_text'] = [p_stemmer.stem(tok) for tok in tokens]
    row['snow_text'] = [snow_stem.stem(tok) for tok in tokens]

    return row

lem_df = combined_df.apply(lemstem, axis=1)

In [24]:
# Walk through the first post token by token and print every token for which
# the four forms (original, lemma, Porter stem, Snowball stem) are not all equal.
for original, lemma, porter, snow in zip(
        *lem_df.loc[0, ['all_text', 'lemma_text', 'porter_text', 'snow_text']]):
    all_agree = original == lemma == porter == snow
    if not all_agree:
        print(original, lemma, porter, snow)

binance binance binanc binanc
assume assume assum assum
title title titl titl
says say say say
couple couple coupl coupl
weeks week week week
especially especially especi especi
since since sinc sinc
started started start start
aggressive aggressive aggress aggress
marketing marketing market market
large large larg larg
volume volume volum volum
overall overall overal overal
presence presence presenc presenc
noticed noticed notic notic
different different differ differ
statistics statistic statist statist
different different differ differ
analysis analysis analysi analysi
websites website websit websit
difference difference differ differ
easy easy easi easi
statistical statistical statist statist
especially especially especi especi
accuracy accuracy accuraci accuraci
noticed noticed notic notic
difference difference differ differ
happening happening happen happen
sites site site site
remembered remembered rememb rememb
binance binance binanc binanc
purchased purchased purchas purchas
d

While not 100% accurate, lemmatization (the second word in each row) seems to give the best results for normalising words.