## Data Cleaning 

In [1]:
# import librariesy 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
%matplotlib inline

In [37]:
raw_df = pd.read_csv('~/Downloads/stack-overflow-bqresults.csv')

In [11]:
raw_df.head()

Unnamed: 0,id,title,body,tags
0,11227902,,<p><strong>You are a victim of <a href= //en.w...,
1,11227809,Why is processing a sorted array faster than a...,<p>Here is a piece of C++ code that seems very...,java|c++|performance|optimization|branch-predi...
2,33617500,,<p><strong><a href= http://meta.stackoverflow....,
3,179147,,<h1>Amending the most recent commit message</h...,
4,29479702,,<p><strong><a href= http://meta.stackoverflow....,


In [12]:
raw_df.shape

(250000, 4)

In [13]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 4 columns):
id       250000 non-null int64
title    80283 non-null object
body     250000 non-null object
tags     80284 non-null object
dtypes: int64(1), object(3)
memory usage: 7.6+ MB


In [38]:
# 80284 non-null types and title out of 250000 entries
# remove observations without tags or title 
raw_df.dropna(inplace=True)

In [18]:
raw_df.head()

Unnamed: 0,id,title,body,tags
1,11227809,Why is processing a sorted array faster than a...,<p>Here is a piece of C++ code that seems very...,java|c++|performance|optimization|branch-predi...
5,927358,How do you undo the last commit?,<p>I committed the wrong files to Git.</p> <p>...,git|git-commit|git-reset|git-revert
11,179123,Edit an incorrect commit message in Git,<p>I wrote the wrong thing in a commit message...,git|git-commit|git-rewrite-history|amend
12,2003505,Delete a Git branch both locally and remotely,<p>I want to delete a branch both locally and ...,git|github|git-branch|git-remote
21,477816,What is the correct JSON content type?,<p>I've been messing around with <a href= http...,json|content-type


In [102]:
# make remove html tags from body
raw_df['body'] = raw_df.body.apply(lambda x: BeautifulSoup(x).get_text())

In [103]:
raw_df.head()

Unnamed: 0,id,title,body,tags
1,11227809,Why is processing a sorted array faster than a...,Here is a piece of C++ code that seems very pe...,java|c++|performance|optimization|branch-predi...
5,927358,How do you undo the last commit?,I committed the wrong files to Git. How can I ...,git|git-commit|git-reset|git-revert
11,179123,Edit an incorrect commit message in Git,I wrote the wrong thing in a commit message. H...,git|git-commit|git-rewrite-history|amend
12,2003505,Delete a Git branch both locally and remotely,I want to delete a branch both locally and on ...,git|github|git-branch|git-remote
21,477816,What is the correct JSON content type?,I've been messing around with JSON for some ti...,json|content-type


In [104]:
# check for unique values for tags
raw_df.tags.nunique()  # 56474 unique tags out of 80284 total observastions 

56474

In [67]:
temp_df = pd.DataFrame(pd.concat([pd.Series(row['id'], row['tags'].split('|'))
           for _, row in raw_df.iterrows()])).reset_index()

In [68]:
temp_df.rename(columns = {'index':'tags',0:'id'}, inplace = True)

In [105]:
split_df = pd.merge(temp_df, raw_df,on='id')

In [107]:
split_df.drop(columns = ['tags_y'], inplace=True)

In [108]:
split_df.rename(columns={'tags_x':'tags'}, inplace = True)

In [109]:
split_df.head()

Unnamed: 0,tags,id,title,body
0,java,11227809,Why is processing a sorted array faster than a...,Here is a piece of C++ code that seems very pe...
1,c++,11227809,Why is processing a sorted array faster than a...,Here is a piece of C++ code that seems very pe...
2,performance,11227809,Why is processing a sorted array faster than a...,Here is a piece of C++ code that seems very pe...
3,optimization,11227809,Why is processing a sorted array faster than a...,Here is a piece of C++ code that seems very pe...
4,branch-prediction,11227809,Why is processing a sorted array faster than a...,Here is a piece of C++ code that seems very pe...


In [117]:
split_df.tags.value_counts(normalize=True, ascending=False)[:10]

java          0.031333
c#            0.029692
javascript    0.028851
android       0.023738
python        0.021103
c++           0.016412
.net          0.013922
jquery        0.013698
html          0.011482
php           0.011295
Name: tags, dtype: float64

In [118]:
for tag in split_df.tags.unique():
    print(tag)

java
c++
performance
optimization
branch-prediction
git
git-commit
git-reset
git-revert
git-rewrite-history
amend
github
git-branch
git-remote
json
content-type
git-pull
git-fetch
javascript
scope
closures
c
operators
code-formatting
standards-compliance
jquery
redirect
memory-management
language-agnostic
stack
heap
python
iterator
generator
yield
coroutine
version-control
git-stage
string
string-matching
syntax
jslint
use-strict
dom
visibility
angularjs
design
date
timezone
c++-faq
html
background-color
function
idioms
security
http
authentication
article
flex
actionscript
soap
coldfusion
wsdl
equality
equality-operator
identity-operator
branch
android
android-layout
user-interface
units-of-measurement
c#
algorithm
complexity-theory
computer-science
big-o
time-complexity
comments
methods
parameter-passing
pass-by-reference
pass-by-value
git-checkout
remote-branch
css
cross-browser
highlighting
textselection
git-merge
merge-conflict-resolution
git-conflict-resolution
oop
metaclass
pyth

angularjs-controller
gdb
dynamic-allocation
capslock
capitalization
text-styling
python-imaging-library
pillow
.net-2.0
rbenv
multiprocessing
jackson
overloading
manpage
onbeforeunload
tutorials
utf-16
utf
utf-32
slug
restful-url
xslt
xslt-1.0
xslt-2.0
url-routing
cross-reference
onchange
formula
subset
android-screen-support
cpu-architecture
text-processing
nullable
unboxing
phpdoc
hint
python-bin
bundle
control-flow
inverse
nsdate
nsdatecomponents
nscalendar
startup
justify
late-static-binding
opengl
gpu
w3c-validation
web-standards
persistent-connection
web-performance
g1gc
gson
record
stdmap
breadcrumbs
unchecked
git-extensions
cordova
cordova-cli
inline
one-definition-rule
media-queries
identity
mysql-workbench
eclipse-rcp
software-engineering
apache-config
etag
mod-expires
numeric-limits
numeric-conversion
google-chrome-extension
chromecast
image-resizing
trigonometry
touch
locale
sandbox
traceback
normalization
listbox
dbcontext
core-graphics
french
lines
debug-symbols
wraps
hex

machine-instruction
vssettings
array-merge
findviewbyid
android-identifiers
apply
generic-constraints
yui
jlabel
bufferedinputstream
verify
rows
microservices
word-diff
everyauth
poker
postgresql-json
jsonb
overload-resolution
processstartinfo
failover
booksleeve
hadoop2
nativelibrary
nested-resources
field-with-errors
fragmentation
web-api
whois
wiki
rabbitmqctl
http-status-code-405
divide-by-zero
custom-controls
conditional-execution
windows-desktop-gadgets
file-sharing
javafx
swingx
ivalidatableobject
yeoman
windows-firewall
css-specificity
process.start
rollback
application-pool
windbg
sos
zipcode
postal-code
tabcontrol
tabpage
binary-reproducibility
selectlist
usb
unauthorized
language-detection
sitemap
robots.txt
negative-number
internal-representation
dispatcher
fold
zipper
human-readable
testunit
has-and-belongs-to-many
payment
conditional-statements
tinymce
lazy-loading
leading-zero
css-frameworks
appdomain
data-mining
ansible
tel
enterprise-distribution
protect-from-forgery
m

assistive-technology
xcconfig
angularjs-ng-change
iphone-vibrate
file-exists
pascal
delphi-xe6
global-temp-tables
forward
xperf
dynamic-compilation
uart
jquery-cookie
warp-scheduler
realbasic
wt
git-reflog
reflog
date-conversion
apple-watch
ssis
debian-based
this-pointer
referenceequals
uidevice
platform
callblocking
incoming-call
html5-data
carousel
flask-sqlalchemy
js-scrollintoview
facebook-fql
resteasy
sql-server-config-manager
configuration-management
chef
puppet
cfengine
hdfs
lockscreen
ip-geolocation
iserializable
hardware
virtual-serial-port
jscience
redux-saga
simulator
realm
checkin
ipod
waveform
divide-and-conquer
pivot-without-aggregate
path-manipulation
application-restart
platform-agnostic
springboard
django-1.7
globbing
aio
charles
fields-for
moniker
cancellationtokensource
filesplitting
voting
eclipse-kepler
popup-blocker
blueimp
excel-2003
asp.net-web-api2
google-toolbox-for-mac
tablename
angular-strap
olap
oltp
simplemembership
io.js
epoll
io-completion-ports
initiali

truezip
appcmd
doublebuffered
django-custom-user
getchar
make-shared
flow-control
gsp
sitemesh
rakefile
source-control-explorer
chunked-encoding
windows-phone-7.1
xap
longest-prefix
vendor
tree-balancing
clicklistener
fpga
nodemon
weka
lzma
tinyurl
document.write
httpentity
tablecolumn
segments
embedded-linux
oracle-xe
joomla3.0
frontpage
semantic-diff
sdwebimage
spork
javassist
google-translate
numberpicker
magicalrecord
expando
mkpinannotationview
twitter4j
strtol
objdump
picturebox
catalyst
haskell-stack
external-links
languagetool
qtwebkit
cglayer
validates-uniqueness-of
database-integrity
preemption
subsystem
windows-nt
machinist
html5-filesystem
slash
caemitterlayer
prolog-dif
listpreference
ignore-case
self-tracking-entities
key-management
hotdeploy
name-collision
stack-unwinding
squid
rubular
viewdidload
panoramas
redbean
window-handles
haskell-mode
copytree
perl-io
nexus-7
richfaces
dialog-preference
webshim
tempdir
error-code
overlapping-matches
r-tree
mkstemp
fdopen
singleto

# should be able to merge a few tags and use common sense to make them into the same tag category
