In [1]:
import nltk
import warnings
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
# Load the filtered DataFrame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so this file should only
# ever come from a trusted source — confirm where data/filtered_df.pkl is produced.

filtered_df = pd.read_pickle('data/filtered_df.pkl')
    

In [3]:
# Preview the first rows of the loaded frame.
filtered_df.head()

Unnamed: 0,id,title,body,tags
5,927358,How do you undo the last commit?,I committed the wrong files to Git. How can I ...,git
11,179123,Edit an incorrect commit message in Git,I wrote the wrong thing in a commit message. H...,git
12,2003505,Delete a Git branch both locally and remotely,I want to delete a branch both locally and on ...,git
23,292357,What are the differences between 'git pull' an...,What are the differences between git pull and ...,git
24,111102,How do JavaScript closures work?,How would you explain JavaScript closures to s...,javascript


### Preprocess:
* converting all letters to lower or upper case
* converting numbers into words or removing numbers
* removing white spaces
* removing punctuations, accent marks and other diacritics

In [4]:
def text_preprocess(text):
    '''
    Normalise a raw post into lowercase, space-separated alphabetic words.

    input:
    text: a string that may contain digits, symbols and punctuation
    returns:
    the text lowercased, with every run of characters other than A-Z / a-z
    (digits, whitespace, punctuation, non-ASCII characters) collapsed to a
    single space and no leading or trailing space
    '''
    # Splitting on runs of non-letters leaves only the alphabetic words; empty
    # pieces show up when the text starts or ends with a non-letter.
    words = [piece for piece in re.split(r'[^A-Za-z]+', text) if piece]
    return ' '.join(words).lower()

In [5]:
# derive cleaned_body by running text_preprocess over the raw body of every post
filtered_df['cleaned_body'] = filtered_df['body'].apply(text_preprocess)

In [7]:
# compare the raw body with its cleaned version side by side
filtered_df[['body', 'cleaned_body']].head()

Unnamed: 0,body,cleaned_body
5,I committed the wrong files to Git. How can I ...,i committed the wrong files to git how can i u...
11,I wrote the wrong thing in a commit message. H...,i wrote the wrong thing in a commit message ho...
12,I want to delete a branch both locally and on ...,i want to delete a branch both locally and on ...
23,What are the differences between git pull and ...,what are the differences between git pull and ...
24,How would you explain JavaScript closures to s...,how would you explain javascript closures to s...


## tokenize and lemmatize 
* removing stop words, sparse terms, and particular words
* lemmatization 

In [8]:
# convert a treebank part-of-speech tag into the matching WordNet part-of-speech tag
def get_wordnet_pos(treebank_tag):
    '''
    Translate a treebank part-of-speech tag into a WordNet part-of-speech tag.

    input:
    treebank_tag: part-of-speech tag string taken from nltk.pos_tag output
    return:
    wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV when the tag starts
    with 'J', 'V', 'N' or 'R' respectively; None for any other tag
    '''
    # (tag prefix, name of the corresponding attribute on `wordnet`)
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            # resolved lazily so only the matching constant is ever looked up
            return getattr(wordnet, attr_name)
    return None

In [9]:
lemmatizer = WordNetLemmatizer()


def token_lemma(text, words_to_remove=None):
    '''
    Tokenize a cleaned post, drop unwanted words and lemmatize what remains.

    input:
    text: cleaned post from function text_preprocess
    words_to_remove: optional collection (preferably a set) of words to drop.
        When omitted, it is rebuilt on every call from the NLTK English stop
        words plus the unique values of filtered_df.tags. Callers processing
        many rows should build that set once and pass it in, e.g.
        series.apply(token_lemma, words_to_remove=my_set), to avoid re-reading
        the stop word list and re-scanning the tags column for each row.
    returns:
    list of tokens with words_to_remove filtered out; each kept token is
    lemmatized with its WordNet position tag when get_wordnet_pos provides
    one, otherwise it is kept unchanged
    '''
    if words_to_remove is None:
        # default: stop words plus words that are explicit tags
        words_to_remove = set(filtered_df.tags.unique()).union(
            stopwords.words('english'))
    # perform pos tag before stop word removal to include more context for pos tags
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    lem_result = []  # only include nonstop words and non-tag words
    for word, treebank_tag in tagged_tokens:
        if word in words_to_remove:  # don't lemmatize unneeded words
            continue
        wordnet_pos = get_wordnet_pos(treebank_tag)
        if wordnet_pos:  # a WordNet position tag exists for this token
            lem_result.append(lemmatizer.lemmatize(word, wordnet_pos))
        else:
            lem_result.append(word)
    return lem_result

In [10]:
# tokenize and lemmatize every cleaned post into a list of tokens
filtered_df['tokens'] = filtered_df['cleaned_body'].apply(token_lemma)

In [11]:
# Inspect the first rows, now including the tokens column.
filtered_df.head()

Unnamed: 0,id,title,body,tags,cleaned_body,tokens
5,927358,How do you undo the last commit?,I committed the wrong files to Git. How can I ...,git,i committed the wrong files to git how can i u...,"[commit, wrong, file, undo, commit]"
11,179123,Edit an incorrect commit message in Git,I wrote the wrong thing in a commit message. H...,git,i wrote the wrong thing in a commit message ho...,"[write, wrong, thing, commit, message, change,..."
12,2003505,Delete a Git branch both locally and remotely,I want to delete a branch both locally and on ...,git,i want to delete a branch both locally and on ...,"[want, delete, branch, locally, remote, projec..."
23,292357,What are the differences between 'git pull' an...,What are the differences between git pull and ...,git,what are the differences between git pull and ...,"[difference, pull, fetch]"
24,111102,How do JavaScript closures work?,How would you explain JavaScript closures to s...,javascript,how would you explain javascript closures to s...,"[would, explain, closure, someone, knowledge, ..."


In [13]:
# overwrite cleaned_body with the space-joined tokens for feature engineering later
filtered_df['cleaned_body'] = filtered_df['tokens'].apply(' '.join)

In [14]:
# Check that cleaned_body now holds the space-joined tokens.
filtered_df.head()

Unnamed: 0,id,title,body,tags,cleaned_body,tokens
5,927358,How do you undo the last commit?,I committed the wrong files to Git. How can I ...,git,commit wrong file undo commit,"[commit, wrong, file, undo, commit]"
11,179123,Edit an incorrect commit message in Git,I wrote the wrong thing in a commit message. H...,git,write wrong thing commit message change messag...,"[write, wrong, thing, commit, message, change,..."
12,2003505,Delete a Git branch both locally and remotely,I want to delete a branch both locally and on ...,git,want delete branch locally remote project fork...,"[want, delete, branch, locally, remote, projec..."
23,292357,What are the differences between 'git pull' an...,What are the differences between git pull and ...,git,difference pull fetch,"[difference, pull, fetch]"
24,111102,How do JavaScript closures work?,How would you explain JavaScript closures to s...,javascript,would explain closure someone knowledge concep...,"[would, explain, closure, someone, knowledge, ..."


In [18]:
# Persist the frame, including the cleaned_body and tokens columns, to a pickle file.
filtered_df.to_pickle('data/cleaned_body.pkl')

In [19]:
# Read the pickle back to confirm the save round-trips; preview only the first
# ten rows instead of rendering the whole frame as cell output.
pd.read_pickle('data/cleaned_body.pkl').head(10)

Unnamed: 0,id,title,body,tags,cleaned_body,tokens
5,927358,How do you undo the last commit?,I committed the wrong files to Git. How can I ...,git,commit wrong file undo commit,"[commit, wrong, file, undo, commit]"
11,179123,Edit an incorrect commit message in Git,I wrote the wrong thing in a commit message. H...,git,write wrong thing commit message change messag...,"[write, wrong, thing, commit, message, change,..."
12,2003505,Delete a Git branch both locally and remotely,I want to delete a branch both locally and on ...,git,want delete branch locally remote project fork...,"[want, delete, branch, locally, remote, projec..."
23,292357,What are the differences between 'git pull' an...,What are the differences between git pull and ...,git,difference pull fetch,"[difference, pull, fetch]"
24,111102,How do JavaScript closures work?,How would you explain JavaScript closures to s...,javascript,would explain closure someone knowledge concep...,"[would, explain, closure, someone, knowledge, ..."
31,1642028,What is the name of the --> operator?,After reading Hidden Features and Dark Corners...,c++,read hidden feature dark corner c stl comp lan...,"[read, hidden, feature, dark, corner, c, stl, ..."
36,503093,How can I make a page redirect using jQuery?,How can I redirect the user from one page to a...,javascript,redirect user one page another use jquery,"[redirect, user, one, page, another, use, jquery]"
40,231767,What does the yield keyword do in Python?,What is the use of the yield keyword in Python...,python,use yield keyword example try understand code ...,"[use, yield, keyword, example, try, understand..."
43,348170,Undo 'git add' before commit,I mistakenly added files using the command: gi...,git,mistakenly add file use command add myfile txt...,"[mistakenly, add, file, use, command, add, myf..."
45,1789945,How can I check if one string contains another...,How can I check if one string contains another...,javascript,check one string contain another substring usu...,"[check, one, string, contain, another, substri..."
