In [1]:
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag.perceptron import PerceptronTagger
from nltk.corpus import stopwords
import re

In [2]:
# Stop-word sets keyed by language, filled on first use so the NLTK corpus
# is read once rather than once per row when this is used with Series.apply.
_STOPWORD_CACHE = {}


def _stopword_set(language):
    """Return the NLTK stop-word set for ``language``, loading it at most once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = set(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def remove_stops(data_str, language="english"):
    """Remove stop words from a whitespace-delimited string.

    Parameters
    ----------
    data_str : str
        Text to filter. It is split on arbitrary whitespace.
    language : str, optional
        Name of the NLTK stop-word list to use (default ``"english"``).

    Returns
    -------
    str
        The remaining words, in their original order, joined by single
        spaces. An empty string is returned when no word survives.

    Notes
    -----
    Matching is an exact membership test, so a word is only removed when it
    appears in the stop list with the same casing.
    """
    stops = _stopword_set(language)
    return ' '.join(word for word in data_str.split() if word not in stops)

def tag_and_remove(data_str):
    """Keep only the nouns, adjectives, verbs and adverbs of a string.

    The string is split on whitespace and tagged with the module-level
    ``tagger``, which must already be defined when this function is called
    and must expose ``tag(tokens)`` returning ``(token, tag)`` pairs.
    Tokens whose tag is not in the keep-list below are discarded.

    Parameters
    ----------
    data_str : str
        Whitespace-delimited text.

    Returns
    -------
    str
        The kept tokens in their original order, joined by single spaces
        with no leading or trailing padding. An empty string is returned
        when nothing is kept.
    """
    # Part-of-speech tags to keep; a set gives constant-time membership tests.
    keep_tags = {
        'NN', 'NNP', 'NNPS', 'NNS',                # nouns
        'JJ', 'JJR', 'JJS',                        # adjectives
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',   # verbs
        'RB', 'RBR', 'RBS',                        # adverbs
    }

    tagged_text = tagger.tag(data_str.split())
    return ' '.join(token for token, tag in tagged_text if tag in keep_tags)

def lemmatize(data_str):
    """Lemmatize every word of a string using its part-of-speech tag.

    The string is split on whitespace and tagged with the module-level
    ``tagger`` (which must already be defined when this function is called).
    Each token is then passed to ``WordNetLemmatizer.lemmatize`` with a
    part-of-speech code derived from its tag.

    Parameters
    ----------
    data_str : str
        Whitespace-delimited text.

    Returns
    -------
    str
        The lemmas in the original token order, joined by single spaces.
        An empty string is returned for empty input.
    """
    # The part-of-speech code is chosen from the FIRST letter of the tag:
    # J* -> adjective, V* -> verb, R* -> adverb, anything else -> noun.
    # Testing for a letter anywhere in the tag is not reliable: adjective
    # tags (JJ, JJR, JJS) contain no 'a', and JJR contains an 'r', so such a
    # test never picks 'a' and sends comparatives down the adverb path.
    pos_by_initial = {'J': 'a', 'V': 'v', 'R': 'r'}

    lmtzr = WordNetLemmatizer()
    tagged_words = tagger.tag(data_str.split())
    return ' '.join(
        lmtzr.lemmatize(token, pos=pos_by_initial.get(tag[:1].upper(), 'n'))
        for token, tag in tagged_words
    )

In [3]:
# Load the tweet/team table; the path is relative to the notebook's working directory.
hockey = pd.read_csv('additional_corrections.csv')

In [4]:
# Preview the first rows to see which columns were read from the CSV.
hockey.head()

Unnamed: 0.1,Unnamed: 0,tweet_1,team
0,0,time retweet and follow to enter if buch scor...,['nyr']
1,1,which canadiens are untouchable beside mete price,['habs']
2,2,lets do that hockey,['wild']
3,3,ed belfour,['stars']
4,4,happy birthday ryan grim reaper reaves,['pens']


In [5]:
# Drop the 'Unnamed: 0' column (presumably an index written by an earlier
# to_csv round-trip; confirm). `columns=` is used because passing the axis
# positionally, as in drop(label, 1), is rejected by pandas >= 2.0.
hockey = hockey.drop(columns='Unnamed: 0')

In [6]:
# Preview again to confirm the column was removed.
hockey.head()

Unnamed: 0,tweet_1,team
0,time retweet and follow to enter if buch scor...,['nyr']
1,which canadiens are untouchable beside mete price,['habs']
2,lets do that hockey,['wild']
3,ed belfour,['stars']
4,happy birthday ryan grim reaper reaves,['pens']


In [8]:
# Force every tweet to str so the cleaning helpers can call .split() on it.
# NOTE(review): any missing value becomes the literal string 'nan' here; confirm that is acceptable.
hockey['tweet_1'] = hockey['tweet_1'].astype('str')

In [9]:
# Stop-word-filtered version of each tweet, stored in a new column.
hockey['stop_text'] = hockey['tweet_1'].apply(remove_stops)

In [11]:
# Show the raw and stop-word-filtered text side by side as a sanity check.
hockey[['tweet_1','stop_text']]

Unnamed: 0,tweet_1,stop_text
0,time retweet and follow to enter if buch scor...,time retweet follow enter buch scores goal vs ...
1,which canadiens are untouchable beside mete price,canadiens untouchable beside mete price
2,lets do that hockey,lets hockey
3,ed belfour,ed belfour
4,happy birthday ryan grim reaper reaves,happy birthday ryan grim reaper reaves
5,a big weekend for minnesota sport fans today i...,big weekend minnesota sport fans today st clou...
6,the learn to play program is back this season ...,learn play program back season details
7,happy to the best penguins fans we know cred,happy best penguins fans know cred
8,g canucks at oilers the first saturday withou...,g canucks oilers first saturday without red fi...
9,its game day avalanche colorado pm pregame ...,game day avalanche colorado pm pregame report


In [12]:
# tag_and_remove() and lemmatize() read this module-level `tagger`, so it
# has to exist before either helper is applied.
tagger = PerceptronTagger()
# Tag the stop-word-filtered text and keep only the tokens tag_and_remove() lets through.
hockey['tag_text'] = hockey['stop_text'].apply(tag_and_remove)

In [13]:
# Lemmatize the tag-filtered text; requires `tagger` to be defined already.
hockey['lemm_text'] = hockey['tag_text'].apply(lemmatize)

In [15]:
# Export the unfiltered tweets with their team labels.
# to_csv is called with defaults, so the row index is also written as an unnamed first column.
hockey[['tweet_1','team']].to_csv('no_removal_stops_tweets.csv')

In [16]:
# Export the stop-word-filtered tweets with their team labels.
# to_csv is called with defaults, so the row index is also written as an unnamed first column.
hockey[['stop_text','team']].to_csv('removal_stops_tweets.csv')

In [17]:
# Export the lemmatized tweets with their team labels.
# to_csv is called with defaults, so the row index is also written as an unnamed first column.
hockey[['lemm_text','team']].to_csv('lemmatized_tweets.csv')