# NLP: Stemming & Lemmatizing

## Imports

In [1]:
import pandas as pd

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer, SnowballStemmer

## Functions

In [2]:
# Zero-argument builders for each supported technique. Each returns a
# word -> normalized-word callable. Wrapped in lambdas so the nltk classes are
# only instantiated when a technique is actually requested.
_NORMALIZER_FACTORIES = {
    'Porter': lambda: PorterStemmer().stem,
    'Snowball': lambda: SnowballStemmer('english').stem,
    'Lemmatize': lambda: WordNetLemmatizer().lemmatize,
    'Lancaster': lambda: LancasterStemmer().stem,
}

# Already-built normalizers keyed by technique name, so that calling
# stem_lem_text once per row (e.g. via Series.apply) reuses one instance
# instead of constructing a new stemmer for every document.
_NORMALIZER_CACHE = {}


def _get_normalizer(name):
    """Return the cached word-level normalizer for ``name`` (Lancaster if unrecognised)."""
    # Collapse every unrecognised value onto 'Lancaster' before touching the
    # cache: this preserves the original fallback and keeps the cache at four
    # entries at most, even if callers pass arbitrary values.
    key = name if name in ('Porter', 'Snowball', 'Lemmatize') else 'Lancaster'
    if key not in _NORMALIZER_CACHE:
        _NORMALIZER_CACHE[key] = _NORMALIZER_FACTORIES[key]()
    return _NORMALIZER_CACHE[key]


def stem_lem_text(s, type='Lancaster'):
    """Stem or lemmatize every whitespace-separated token of a string.

    Parameters
    ----------
    s : str
        Text to process. It is split on whitespace with ``str.split()``; no
        other tokenisation, lower-casing or punctuation handling is done here.
        ``None`` and float NaN (how pandas represents an empty CSV cell) are
        treated as missing text and yield ``""``.
    type : str, default 'Lancaster'
        Technique to apply: ``'Porter'``, ``'Snowball'`` (English),
        ``'Lemmatize'`` (``WordNetLemmatizer.lemmatize`` called with its
        default arguments) or ``'Lancaster'``. The match is case-sensitive and
        any other value silently falls back to Lancaster.

    Returns
    -------
    str
        The processed tokens joined with single spaces; ``""`` when the input
        is missing or contains no tokens.

    Notes
    -----
    NOTE(review): the 'Lemmatize' option presumably needs the NLTK WordNet
    corpus to be available locally -- confirm the environment provides it.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids needing numpy/pandas inside the function.
    if s is None or (isinstance(s, float) and s != s):
        return ""

    words = s.split()
    if not words:
        # Nothing to normalize, so skip building a stemmer altogether.
        return ""

    normalize = _get_normalizer(type)
    return " ".join(normalize(word) for word in words)

## Load Data

In [3]:
# Load the article dataset from a CSV located in the current working directory.
# The stemming/lemmatizing cells below read its `cleaned_article_text` column.
df = pd.read_csv('news_articles_cleaned_trimmed.csv')

In [4]:
# Print column dtypes and non-null counts. Worth checking that
# `cleaned_article_text` has no missing values, since stem_lem_text calls
# `.split()` on every entry of that column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84155 entries, 0 to 84154
Data columns (total 6 columns):
article_text            84155 non-null object
datetime                84155 non-null object
source                  84155 non-null object
title                   84155 non-null object
url                     84155 non-null object
cleaned_article_text    84155 non-null object
dtypes: object(6)
memory usage: 3.9+ MB


In [5]:
# Preview ten random rows. A fixed random_state makes the preview identical on
# every Restart & Run All; the value itself is arbitrary.
df.sample(10, random_state=42)

Unnamed: 0,article_text,datetime,source,title,url,cleaned_article_text
31653,"FRANKFURT, July 5 (Reuters) - European stocks...",2007-07-05 00:00:00,Reuters,"European shares end down on Vodafone, rate wor...",http://uk.reuters.com/article/markets-europe-s...,European stocks led telecommunications heavywe...
57149,(Changes entry on Porsche to be filed under Vo...,2016-12-23,Reuters,REFILE-German stocks - Factors to watch on Dec...,http://uk.reuters.com/article/germany-stocks-f...,The DAX looked set according premarket brokera...
79304,As dawn breaks in Asia traders are readying th...,2017-02-14 23:00:00,FT,Fast Asia Open: Between two Yellens,https://www.ft.com/content/78392d38-8e4b-32d4-...,As dawn breaks Asia traders readying purgatory...
5334,\nMy father was born in 1919 and died in 2002....,2015-03-22 00:05:05,Guardian,Enough of the dry politics of numbers. We need...,https://www.theguardian.com/commentisfree/2015...,My father born died For first years life stock...
42951,Sept 4 (Reuters) - Hong Kong stock erased earl...,2015-09-04,Reuters,Hong Kong stocks fall on nervousness ahead of ...,http://uk.reuters.com/article/markets-hongkong...,Hong Kong stock erased early gains investors l...
21594,* FTSEurofirst 300 falls 0.5 pct; Euro STOXX 5...,2014-07-08 00:00:00,Reuters,Banks lead European shares lower as US fine co...,http://uk.reuters.com/article/markets-europe-s...,Banking stocks led European shares German bank...
54009,"TOKYO, Sept 16 (Reuters) - The Nikkei benchma...",2011-09-15,Reuters,"Nikkei set to rise, financials may outperform",http://uk.reuters.com/article/markets-japan-st...,The Nikkei benchmark may climb helped gains Wa...
55917,* Nikkei briefly pares gains after N.Korean n...,2009-05-25,Reuters,"Nikkei up 1.3 pct, shrugs off N. Korea nuclear...",http://uk.reuters.com/article/markets-japan-st...,Japan Nikkei stock mostly shrugged North Korea...
58650,"BANGKOK, Aug 5 (Reuters) - Indonesian shares e...",2014-08-05,Reuters,"SE Asia Stocks -Indonesian, Philippine shares ...",http://uk.reuters.com/article/markets-southeas...,Indonesian shares eased weaker expected econom...
12142,\nA political tussle over Indonesia's formidab...,2014-01-09 15:54:00,Guardian,Natural resources and sensible leaders bring h...,https://www.theguardian.com/world/2014/jan/09/...,A political tussle Indonesia formidable minera...


## Stem / Lemmatize Text

In [6]:
# Add one derived text column per technique, all computed from
# `cleaned_article_text`. The tuple order is the order in which the columns are
# appended to the frame.
for new_column, technique in (
    ('text_lancaster', 'Lancaster'),
    ('text_porter', 'Porter'),
    ('text_snowball', 'Snowball'),
    ('text_lemmatize', 'Lemmatize'),
):
    df[new_column] = df['cleaned_article_text'].apply(stem_lem_text, type=technique)

## Save Data

In [7]:
# Write the frame, including the four derived text columns, to CSV without the
# row index. `index` is documented as a bool, so pass False explicitly rather
# than relying on None being falsy.
df.to_csv('news_articles_stemmed_lemmatized_no_extra_stops.csv', index=False, encoding='utf-8')