# <center> Text preprocessing </center>

## Necessary downloads and library imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
project_path = '/content/drive/My Drive/Colab Notebooks/MATF_ML_project/'

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

from wordcloud import WordCloud

from bs4 import BeautifulSoup
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import regex as re
import string

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
!pip install lxml

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data

We will turn off the detection of missing value markers due to the presence of "null" tags that are otherwise recognised as NA values. There are no true NA values in the data.

In order to avoid computation constraints with training and evaluation we will use a small subset of the data. Only 10,000 instances.

In [5]:
data = pd.read_csv(project_path + 'data/Train.csv',
                   na_filter=False, nrows=10000)

## Duplicates in data

In [6]:
data_without_duplicates = data.drop_duplicates(['Title', 'Body', 'Tags'])

In [7]:
print("Shape with duplicates\t", data.shape)
print("Shape without duplicates", data_without_duplicates.shape)

Shape with duplicates	 (10000, 4)
Shape without duplicates (9997, 4)


In [8]:
data = data_without_duplicates

### Drop rare tags

In order to avoid computation constraints with training and evaluation of the data, we will limit our tag prediction to the top 100 most frequent tags.

In [9]:
N_tags = 100

In [10]:
tags_per_question = list(map(lambda tags: tags.split(' '), data["Tags"]))
all_tags = [item for sublist in tags_per_question for item in sublist]
all_tags = np.array(all_tags)
print(len(np.unique(all_tags)))
print("Tags:", all_tags)

6124
Tags: ['php' 'image-processing' 'file-upload' ... 'haskell' '.net'
 'entity-framework-4']


In [11]:
unique, counts = np.unique(all_tags, return_counts=True)
tag_counts = dict(zip(unique, counts))

In [12]:
list(tag_counts.items())[:10]

[('.htaccess', 40),
 ('.net', 302),
 ('.net-2.0', 6),
 ('.net-3.5', 7),
 ('.net-4.0', 10),
 ('.net-4.5', 3),
 ('.net-assembly', 3),
 ('.net-framework', 1),
 ('.net4.0', 1),
 ('.refresh', 1)]

In [13]:
tag_counts_sorted = dict(sorted(tag_counts.items(), key=lambda item: item[1], reverse=True))

In [14]:
print('Total number of tags is', len(tag_counts_sorted))

Total number of tags is 6124


In [15]:
print("Most frequent tags:")
print(list(tag_counts_sorted.keys())[:20])

Most frequent tags:
['c#', 'java', 'php', 'javascript', 'android', 'jquery', 'c++', 'asp.net', '.net', 'iphone', 'python', 'html', 'mysql', 'sql', 'ios', 'css', 'linux', 'ruby-on-rails', 'objective-c', 'c']


In [16]:
print("Rarest tags:")
print(list(tag_counts_sorted.keys())[-20:])

Rarest tags:
['z-order', 'z3', 'zebra-printers', 'zedgraph', 'zenbook', 'zend-amf', 'zend-auth', 'zend-controller-router', 'zend-debugger', 'zend-form', 'zend-log', 'zend-paginator', 'zend-route', 'zend-search-lucene', 'zend-server', 'zend-server-ce', 'zepto', 'zigbee', 'zooming', 'zxing']


In [17]:
tags_most_frequent = list(tag_counts_sorted.keys())[:N_tags]

In [18]:
print("Top", N_tags, "tags:")
print(tags_most_frequent)

Top 100 tags:
['c#', 'java', 'php', 'javascript', 'android', 'jquery', 'c++', 'asp.net', '.net', 'iphone', 'python', 'html', 'mysql', 'sql', 'ios', 'css', 'linux', 'ruby-on-rails', 'objective-c', 'c', 'windows', 'ruby', 'sql-server', 'xml', 'wpf', 'database', 'ajax', 'asp.net-mvc', 'arrays', 'regex', 'xcode', 'facebook', 'osx', 'windows-7', 'performance', 'multithreading', 'networking', 'vb.net', 'ruby-on-rails-3', 'eclipse', 'actionscript-3', 'linq', 'html5', 'django', 'algorithm', 'json', 'flash', 'visual-studio-2010', 'string', 'wcf', 'oracle', 'bash', 'entity-framework', 'winforms', 'sql-server-2008', 'asp.net-mvc-3', 'ubuntu', 'silverlight', 'ipad', 'email', 'query', 'hibernate', 'image', 'web-services', 'wordpress', 'cocoa-touch', 'r', 'git', 'spring', 'apache', 'cocoa', 'visual-studio', 'homework', 'flex', 'apache2', 'calculus', 'excel', 'real-analysis', '.htaccess', 'codeigniter', 'forms', 'events', 'tsql', 'api', 'http', 'security', 'file', 'jquery-ui', 'sql-server-2005', 'per

Drop rare tags from questions:

In [19]:
new_data_tags = []
for tags in data['Tags']:
    new_tags = [tag for tag in tags.split(" ") if tag in tags_most_frequent]
    if not new_tags:
        new_tags=None
    new_data_tags.append(new_tags)

In [20]:
data['Tags'] = new_data_tags

In [21]:
data.head()

Unnamed: 0,Id,Title,Body,Tags
0,1,How to check if an uploaded file is an image w...,<p>I'd like to check if an uploaded file is an...,[php]
1,2,How can I prevent firefox from closing when I ...,"<p>In my favorite editor (vim), I regularly us...",[firefox]
2,3,R Error Invalid type (list) for variable,<p>I am import matlab file and construct a dat...,[r]
3,4,How do I replace special characters in a URL?,"<p>This is probably very simple, but I simply ...",[c#]
4,5,How to modify whois contact details?,<pre><code>function modify(.......)\n{\n $mco...,"[php, api]"


In [22]:
data.shape

(9997, 4)

Drop instances that left without tags:

In [23]:
data = data.dropna()

In [24]:
data.shape

(7628, 4)

## Text preprocessing

In [25]:
test_str = "<body><p>There're some files: index.html, my.file.txt, file.c, file.ext-with-dash and I hope toktok-tokenizer will properly tokenize it.\n New line. This is new sentence. The most common tag is C#.... ++++++++++++++++++  ======================== R and C++ and C++11 and C aren't so frequent.<br> Some words with numbers: word1, word32, word-2.</p></body> There are some functions: main(), str(), f().<br>There are some urls: https://www.google.com https://www.google.com/dir/1/2/search.html?arg=0-a&arg1=1-b&arg3-c#hash https://google.us.edi?34535/534534?dfg=g&fg. Some numbers:  0123, 2021, 30912, 0000"

#### Removing HTML tags

We choose `lxml` believing it to be faster and more robust than the default one (```html.parser```).

In [26]:
def remove_html(s):
    return BeautifulSoup(s, 'lxml').get_text()

In [27]:
test_str = remove_html(test_str)
test_str



#### Cleaning text

Lowering text and transforming abbreviations.

In [28]:
file = open(project_path + "data/abbr.pkl", "rb")
abbr_dict = pickle.load(file)
file.close()

In [29]:
list(abbr_dict.items())[:10]

[("what's", 'what is'),
 ("what're", 'what are'),
 ("who's", 'who is'),
 ("who're", 'who are'),
 ("where's", 'where is'),
 ("where're", 'where are'),
 ("when's", 'when is'),
 ("when're", 'when are'),
 ("how's", 'how is'),
 ("how're", 'how are')]

In [30]:
def clean_text(s, abbr_dict=None):
    s = s.lower()
    
    if abbr_dict is None:
      return s

    for patt, repl in abbr_dict.items():
        s = re.sub(patt, repl, s)
    return s

In [31]:
test_str = clean_text(test_str, abbr_dict)
test_str



#### Tokenization

We examined the following tokenizers on our data:
* ```nltk.tokenize.word_tokenize()```
* ```nltk.tokenize.WhitespaceTokenizer()```
* ```nltk.tokenize.ToktokTokenizer()```
* ```nltk.tokenize.TweetTokenizer()```
* ```nltk.tokenize.treebank.TreebankWordTokenizer()```

and TokTokTokenizer seemed most applicable.

In [32]:
word_tokenizer = nltk.tokenize.ToktokTokenizer()

In [33]:
def tokenize_text(s, word_tokenizer):
    sentences = nltk.tokenize.sent_tokenize(s)
    # suming lists, tokenize_sents returns list(list(str))
    return sum(word_tokenizer.tokenize_sents(sentences), [])

In [34]:
test_tokens = tokenize_text(test_str, word_tokenizer)
print(test_tokens)



#### Process URLs

We firstly considered extracting the URL into meaningful parts (host, path, query, ...) but it turned out that meaningless parts were dominant in number and that created noise in data.
So we decided to remove URLs at all.

In [35]:
def remove_urls(tokens):
    new_tokens = []
    url_pattern = r"^((http[s]?|ftp):\/)?\/?([^:\/\s]*)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?$"
    for token in tokens:
        if not re.match(url_pattern, token):
            new_tokens.append(token)
    return new_tokens
    

In [36]:
test_tokens = remove_urls(test_tokens)
print(test_tokens)



#### Process extensions

File extensions are very important features because there are also file extensions as tags!

In [37]:
def extract_extensions(tokens):
    new_tokens = []
    for token in tokens:
        token = re.sub(r"([a-zA-Z0-9])\.([a-zA-Z0-9-])", r'\1 .\2', token)
        for new_token in token.split(" "):
            new_tokens.append(new_token)
    return new_tokens

In [38]:
test_tokens = extract_extensions(test_tokens)
print(test_tokens)



#### Punctuation

Removing punctuation but keeping:
* tokens like 'c#', 'c++', ...
* dots in file extensions (this significantly improved the metrics value!)

In [39]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [40]:
punctuation_without_dash = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'

In [41]:
def extension(word):
    if len(word) < 2:
        return False
    if word[0] != '.':
        return False
    for letter in word[1:]:
        if letter in punctuation_without_dash:
            return False
    return True

In [42]:
def remove_punctuation(tokens):
    new_tokens = []
    for token in tokens:
        if not extension(token):
            while len(token) != 0 and token[0] in string.punctuation:
                token = token[1:]
            while len(token) != 0 and token[-1] == '(':
                token = token[:-1]
        if len(token) != 0:
            new_tokens.append(token)
    return new_tokens

In [43]:
test_tokens = remove_punctuation(test_tokens)
print(test_tokens)

['there', 'are', 'some', 'files', 'index', '.html', 'my', '.file', '.txt', 'file', '.c', 'file', '.ext-with-dash', 'and', 'i', 'hope', 'toktok-tokenizer', 'will', 'properly', 'tokenize', 'it', 'new', 'line', 'this', 'is', 'new', 'sentence', 'the', 'most', 'common', 'tag', 'is', 'c#', 'r', 'and', 'c++', 'and', 'c++11', 'and', 'c', 'are', 'not', 'so', 'frequent', 'some', 'words', 'with', 'numbers', 'word1', 'word32', 'word-2', 'there', 'are', 'some', 'functions', 'main', 'str', 'f', '.there', 'are', 'some', 'urls', 'some', 'numbers', '0123', '2021', '30912', '0000']


#### Stopwords

There are tags like 'design' so we think that tokens like 'why', 'how', 'what' are important (e.g. 'how' is maybe important for tag 'design') and we will not remove them. Also, we will not remove single-letter tokens (for example 'o').

In [44]:
stopwords_eng = stopwords.words('english')

In [45]:
print(stopwords_eng)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [46]:
not_stopwords = ['what', 'which', 'who', 'about', 'where', 'why', 'how',
                  'no', 'not', 'on', 'off', 'o', 're', 'y']

In [47]:
my_stopwords = [word for word in stopwords_eng if word not in not_stopwords]
# my_stopwords

In [48]:
def remove_stopwords(tokens, stopwords):
    return [token for token in tokens if token not in stopwords]

In [49]:
test_tokens = remove_stopwords(test_tokens, my_stopwords)
print(test_tokens)

['files', 'index', '.html', '.file', '.txt', 'file', '.c', 'file', '.ext-with-dash', 'hope', 'toktok-tokenizer', 'properly', 'tokenize', 'new', 'line', 'new', 'sentence', 'common', 'tag', 'c#', 'r', 'c++', 'c++11', 'c', 'not', 'frequent', 'words', 'numbers', 'word1', 'word32', 'word-2', 'functions', 'main', 'str', 'f', '.there', 'urls', 'numbers', '0123', '2021', '30912', '0000']


#### Numbers

To reduce the number of useless tokens we will reduce the numbers in text.
However, we will not completely eliminate them because they are important for questions from mathematical topics. We will replace number with one digit. 

In order not to lose the years from the text, which are also important, because they are often found in the names of software versions for example, we will save all the numbers between 1900 and 2100. 

To make sure of that, later in the notebook we will count occurrences of some years in the text.

NOTE: There is a lot of room for improvement in this approach. 

In [50]:
def process_numbers(tokens):
    new_tokens = []
    for token in tokens:
        while len(token) != 0 and token[0] == '0':
            token = token[1:]

        if token.isdigit() and len(token) == 4 and (int(token) > 1900 and int(token) < 2100): # token is PROBABLY a year
            new_tokens.append(token)
        elif token.isdigit():
            new_tokens.append(token[0])
        elif len(token) != 0:
            new_tokens.append(token)
    return new_tokens

In [51]:
test_tokens = process_numbers(test_tokens)
print(test_tokens)

['files', 'index', '.html', '.file', '.txt', 'file', '.c', 'file', '.ext-with-dash', 'hope', 'toktok-tokenizer', 'properly', 'tokenize', 'new', 'line', 'new', 'sentence', 'common', 'tag', 'c#', 'r', 'c++', 'c++11', 'c', 'not', 'frequent', 'words', 'numbers', 'word1', 'word32', 'word-2', 'functions', 'main', 'str', 'f', '.there', 'urls', 'numbers', '1', '2021', '3']


#### Stemmer

We examined the following stemmers on our data:
* `nltk.stem.PorterStemmer(language='english')` - the most gentle one
* `nltk.stem.SnowballStemmer(language='english')` - improvement over porter
* `nltk.stem.lancaster.LancasterStemmer(language='english')` - the fastest but very agressive

We choose Snowball Stemmer.

We also examined lemmatizing but it was unacceptably slow.

In [52]:
stemmer = nltk.stem.SnowballStemmer(language='english')

In [53]:
def stem_tokens(tokens, stemmer):
    return [stemmer.stem(token) for token in tokens]

In [54]:
test_tokens = stem_tokens(test_tokens, stemmer)
print(test_tokens)

['file', 'index', '.html', '.file', '.txt', 'file', '.c', 'file', '.ext-with-dash', 'hope', 'toktok-token', 'proper', 'token', 'new', 'line', 'new', 'sentenc', 'common', 'tag', 'c#', 'r', 'c++', 'c++11', 'c', 'not', 'frequent', 'word', 'number', 'word1', 'word32', 'word-2', 'function', 'main', 'str', 'f', '.there', 'url', 'number', '1', '2021', '3']


### Final preprocessing

In [55]:
def preprocess_text(text, tokenize=True, word_tokenizer=None, html=True, clean=True, abbr_dict = None, urls=True, extensions=True, 
                    punctuation=True, stopwords=True, stopword_list=None,
                    numbers=True, stem=True, stemmer=None):
    if html:
        text = remove_html(text)

    if abbr_dict is None:
        abbr_dict = {}

    if clean:
        text = clean_text(text, abbr_dict)
        
    if tokenize:
        if word_tokenizer is None:
            nltk.tokenize.ToktokTokenizer()
        tokens = tokenize_text(text, word_tokenizer)
    else:
        tokens = text
        
    if urls:
        tokens = remove_urls(tokens)

    if extensions:
        tokens = extract_extensions(tokens)

    if punctuation:
        tokens = remove_punctuation(tokens)
        
    if stopword_list is None:
        stopword_list = []
    
    if stopwords:
        tokens = remove_stopwords(tokens, stopword_list)
        
    if numbers:
        tokens = process_numbers(tokens)

    if stem:
        if stemmer is None:
            stemmer = nltk.stem.SnowballStemmer(language='english')
        tokens = stem_tokens(tokens, stemmer)
        
    return tokens

In [56]:
body_tokens = data['Body'].apply(lambda x : preprocess_text(x, word_tokenizer=word_tokenizer, stopword_list=my_stopwords, stemmer=stemmer))

In [57]:
title_tokens = data['Title'].apply(lambda x : preprocess_text(x, word_tokenizer=word_tokenizer, stopword_list=my_stopwords, stemmer=stemmer))

## Analyse all tokens

In [58]:
all_tokens = []
for tok in body_tokens:
    for t in tok:
        all_tokens.append(t)

In [59]:
len(all_tokens) 

722510

In [60]:
t, c = np.unique(all_tokens, return_counts=True)
unique_tokens = dict(zip(list(t), list(c)))

In [61]:
print("The number of unique tokens: ", len(unique_tokens))

The number of unique tokens:  63184


In [62]:
print("Average token count:", np.average(list(unique_tokens.values())))

Average token count: 11.435015193719929


In [63]:
less_then = 3
rare_tokens = [t for t, c in unique_tokens.items() if c < less_then]
print("Tokens that appear less then", less_then, "times:")
print("====================================")
print("Size: ", len(rare_tokens), "Sample: ", rare_tokens[:50])

Tokens that appear less then 3 times:
Size:  43950 Sample:  ['&\\mbox{', '&amp', '*', '*/', '+', '+$', '+dfsg-8~bpo60+1', ',0,0,0,0,0,0,0,0', ',0,0,10,28', ',0,1,0', ',0,1,0,0,0,0,0', ',0,1,1', ',0,2,0,1,0,0,0', ',0,20,0', ',0,282,210', ',0,320,480', ',0,5,0', ',0,95,34', ',0-1,1', ',1,2', ',1,2,2,1,0,0,0', ',10', ',10,0,0', ',100', ',11,12,13,14,15,7,8', ',15,0,0', ',15,30,45', ',15ms', ',18,17,22', ',2,1,1,2,1,1,0', ',2,2,2,2,0,0,0', ',2,4,6,8,10,12,14,16,18,20,22', ',20,0,0', ',200$', ',200,100,1', ',22', ',25ms', ',4', ',5,0,5', ',5,5', ',5,6,8', ',6', ',80,0,38', '-', '-11', '-1114111', '-128', '-17', '-255', '-3']


In [64]:
more_then = 500
very_frequent_tokens = [(t, c) for t, c in unique_tokens.items() if c > more_then]
print("Tokens that appear more then", more_then, "times:")
print("======================================")
print("Size: ", len(very_frequent_tokens), "Sample: ", very_frequent_tokens[:50])

Tokens that appear more then 500 times:
Size:  241 Sample:  [('.0', 965), ('.java', 1343), ('.net', 666), ('1', 11725), ('124;', 2462), ('2', 5919), ('3', 3649), ('4', 3057), ('5', 2880), ('6', 1435), ('7', 1235), ('8', 1065), ('9', 955), ('abl', 642), ('about', 1050), ('access', 741), ('activ', 564), ('ad', 627), ('add', 1528), ('also', 1151), ('amp;', 635), ('android', 2541), ('anoth', 663), ('anyon', 727), ('app', 1219), ('applic', 1637), ('array', 1347), ('b', 828), ('base', 549), ('button', 975), ('c', 1202), ('call', 1598), ('case', 685), ('chang', 1327), ('check', 816), ('class', 2360), ('class=', 977), ('click', 850), ('client', 618), ('close', 511), ('code', 3553), ('column', 729), ('connect', 1061), ('contain', 751), ('content', 697), ('context', 563), ('control', 888), ('correct', 597), ('could', 985), ('creat', 2159)]


In [65]:
normal_frequency_tokens = [t for t, c in unique_tokens.items() if c < more_then and c > less_then]
print("Tokens that appear between", less_then, "and", more_then, "times:")
print("==========================================")
print("Size: ", len(normal_frequency_tokens), "Sample: ", normal_frequency_tokens[:50])

Tokens that appear between 3 and 500 times:
Size:  13994 Sample:  ['$', '**', '***', '*16', ',0', ',0,0', ',0,0,0', ',0,0,0,0,0,0,0', ',000', ',1', ',1,2,3', ',2', '-00-00', '-06', '-1', '-5', '-9', '.', '.-', '.0-', '.00', '.000', '.0000', '.00000', '.000000', '.0000000', '.00000000', '.000000000', '.0000000000', '.000006', '.0000090', '.0004882812', '.0009765625', '.001', '.0010000', '.001953125', '.0019531250', '.00390625', '.003906250', '.004', '.007', '.0078125', '.00781250', '.0079', '.009', '.009342', '.01', '.010', '.014', '.015']


### Years in text

In [66]:
def count_occurances(s, tokens):
    count = 0
    for token in tokens:
        if token == s:
            count += 1
    return count

In [67]:
print("Num of occurances of '2008':", count_occurances("2008", all_tokens))
print("Num of occurances of '2010':", count_occurances("2010", all_tokens))

Num of occurances of '2008': 134
Num of occurances of '2010': 131


## Compare results with original

In [68]:
index=8

In [69]:
data["Body"][index]

'<p>Do you know of a .NET library for generating javascript code? </p>\n\n<p>I want to generate javascript code based on information in my .NET application. I would like to be able to create an AST-like datastructure (using C#) and have it turned into valid javascript. I need to be able to create functions, statements, expressions etc., so I need something more than a JSON serializer - but I guess you could think of this as a (<em>very</em>) generalized JSON serializer.</p>\n\n<p>Do such libraries exist and if so, could you recommend any?</p>\n\n<p>Thank you.</p>\n'

In [70]:
print(body_tokens[index])

['know', '.net', 'librari', 'generat', 'javascript', 'code', 'want', 'generat', 'javascript', 'code', 'base', 'on', 'inform', '.net', 'applic', 'would', 'like', 'abl', 'creat', 'ast-lik', 'datastructur', 'use', 'c#', 'turn', 'valid', 'javascript', 'need', 'abl', 'creat', 'function', 'statement', 'express', 'etc.', 'need', 'someth', 'json', 'serial', 'guess', 'could', 'think', 'general', 'json', 'serial', 'librari', 'exist', 'could', 'recommend', 'thank']


In [71]:
data["Title"][index]

'.NET library for generating javascript?'

In [72]:
print(title_tokens[index])

['.net', 'librari', 'generat', 'javascript']


## Save new data

In [73]:
# Suppress SettingWithCopyWarning
pd.set_option('mode.chained_assignment', None)

data['Body'] = body_tokens
data['Title'] = title_tokens

pd.set_option('mode.chained_assignment', 'warn')

In [74]:
data.head()

Unnamed: 0,Id,Title,Body,Tags
0,1,"[how, check, upload, file, imag, without, mime...","[like, check, upload, file, imag, file, e, .g,...",[php]
1,2,"[how, prevent, firefox, close, press, ctrl-w]","[favorit, editor, vim, regular, use, ctrl-w, e...",[firefox]
2,3,"[r, error, invalid, type, list, variabl]","[import, matlab, file, construct, data, frame,...",[r]
3,4,"[how, replac, special, charact, url]","[probabl, simpl, simpli, cannot, find, answer,...",[c#]
4,5,"[how, modifi, whoi, contact, detail]","[function, modifi, mcontact, file_get_cont, ui...","[php, api]"


In [75]:
file = open(project_path + "data/data_preprocessed.csv", "wb")
pickle.dump(data, file)
file.close()

## Preprocess Tags (for Heuristic)

In [76]:
preprocessed_tags = data['Tags'].apply(lambda x : preprocess_text(x, tokenize=False, html=False, clean=False, urls=False,
                                                                  extensions=False, punctuation=True,  stopwords=False, 
                                                                  numbers=False, stem=True, stemmer=stemmer))

In [77]:
data['Tags'][40]

['iphone', 'xcode']

In [78]:
preprocessed_tags[40]

['iphon', 'xcode']

In [79]:
file = open(project_path + "data/tags_preprocessed.csv", "wb")
pickle.dump(preprocessed_tags, file)
file.close()